1. 程式人生 > >多執行緒、程序池、協程

多執行緒、程序池、協程

程序池(以 multiprocessing.dummy.Pool 實作,實際上是執行緒池)

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from lxml import etree
#import threading

# multiprocessing.dummy 是多程序類庫裡的一個多執行緒模組,有一個類Pool,表示執行緒池
from multiprocessing.dummy import Pool
import requests
import Queue
import time

class Douban(object):
    """Crawl the Douban Top 250 movie pages using a thread pool.

    Every page URL is fetched by its own worker of a
    ``multiprocessing.dummy.Pool`` (a pool of threads). Each parsed
    "score<TAB>title" record is pushed onto ``data_queue`` and printed by
    ``start_work`` once all workers have finished.
    """

    def __init__(self, timeout=10):
        """Build the page URL list and the shared result queue.

        :param timeout: seconds to wait for each HTTP request before giving up.
        """
        self.base_url = "https://movie.douban.com/top250?start="
        # start = 0, 25, ..., 225 -> ten page URLs.
        self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.timeout = timeout

        # Queue that collects the records produced by the worker threads.
        self.data_queue = Queue.Queue()
        self.count = 0

    def send_request(self, url):
        """Download one page and hand its body to ``parse_page``.

        A failed request is reported and skipped, so that one bad page does
        not abort the whole crawl.

        :param url: page URL to fetch.
        """
        print(url)
        try:
            # Without a timeout a stalled connection would block this worker forever.
            html = requests.get(url, headers=self.headers, timeout=self.timeout).content
        except requests.RequestException as err:
            print("[ERROR]: request failed for %s: %s" % (url, err))
            return

        # One-second pause after each download (presumably to go easy on the server).
        time.sleep(1)
        self.parse_page(html)

    def parse_page(self, html):
        """Extract score and title of every movie on a page into ``data_queue``.

        :param html: raw HTML of one listing page; an empty body is ignored.
        """
        if not html:
            return

        html_obj = etree.HTML(html)
        if html_obj is None:
            # Defensive: nothing parseable came back from the parser.
            return

        for node in html_obj.xpath("//div[@class='info']"):
            # Movie title
            titles = node.xpath("./div[@class='hd']/a/span/text()")
            # Movie rating
            scores = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")

            # Skip entries missing either field instead of raising IndexError.
            if not titles or not scores:
                continue

            # Store the record in the queue
            self.data_queue.put(scores[0] + "\t" + titles[0])

    def start_work(self):
        """Fetch all pages concurrently, then print every record and the total."""
        # Pool(0) raises ValueError, so only create a pool when there is work.
        if self.url_list:
            # One worker thread per URL.
            pool = Pool(len(self.url_list))
            try:
                pool.map(self.send_request, self.url_list)
            finally:
                # Always release the workers; join() waits until they have all exited.
                pool.close()
                pool.join()

        # All producers are done, so draining until empty() is safe here.
        while not self.data_queue.empty():
            print(self.data_queue.get())
            self.count += 1

        print("\n%d" % self.count)


if __name__ == "__main__":
    # Run the thread-pool crawl and report how long it took.
    spider = Douban()
    began = time.time()
    spider.start_work()
    elapsed = time.time() - began
    print("[INFO]: Useing %f secend" % elapsed)

多執行緒

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from lxml import etree
import requests
import threading
import Queue
import time

class Douban(object):
    """Crawl the Douban Top 250 movie pages using one thread per page.

    Each page URL is fetched in its own ``threading.Thread``. Parsed
    "score<TAB>title" records are pushed onto ``data_queue`` and printed by
    ``start_work`` after every thread has been joined.
    """

    def __init__(self, timeout=10):
        """Build the page URL list and the shared result queue.

        :param timeout: seconds to wait for each HTTP request before giving up.
        """
        self.base_url = "https://movie.douban.com/top250?start="
        # start = 0, 25, ..., 225 -> ten page URLs.
        self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.timeout = timeout

        # Queue that collects the records produced by the worker threads.
        self.data_queue = Queue.Queue()
        self.count = 0

    def send_request(self, url):
        """Download one page and hand its body to ``parse_page``.

        A failed request is reported and skipped rather than left to kill
        the worker thread with an unhandled exception.

        :param url: page URL to fetch.
        """
        print(url)
        try:
            # Without a timeout a stalled connection would block this thread forever.
            html = requests.get(url, headers=self.headers, timeout=self.timeout).content
        except requests.RequestException as err:
            print("[ERROR]: request failed for %s: %s" % (url, err))
            return

        # One-second pause after each download (presumably to go easy on the server).
        time.sleep(1)
        self.parse_page(html)

    def parse_page(self, html):
        """Extract score and title of every movie on a page into ``data_queue``.

        :param html: raw HTML of one listing page; an empty body is ignored.
        """
        if not html:
            return

        html_obj = etree.HTML(html)
        if html_obj is None:
            # Defensive: nothing parseable came back from the parser.
            return

        for node in html_obj.xpath("//div[@class='info']"):
            # Movie title
            titles = node.xpath("./div[@class='hd']/a/span/text()")
            # Movie rating
            scores = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")

            # Skip entries missing either field instead of raising IndexError.
            if not titles or not scores:
                continue

            # Store the record in the queue
            self.data_queue.put(scores[0] + "\t" + titles[0])

    def start_work(self):
        """Fetch all pages in parallel threads, then print every record and the total."""
        thread_list = []
        for url in self.url_list:
            # One thread per page request.
            thread = threading.Thread(target=self.send_request, args=(url,))
            thread.start()
            thread_list.append(thread)

        # Block the main thread until every worker has finished.
        for thread in thread_list:
            thread.join()

        # All producers are done, so draining until empty() is safe here.
        while not self.data_queue.empty():
            print(self.data_queue.get())
            self.count += 1

        print("\n%d" % self.count)


if __name__ == "__main__":
    # Run the multi-threaded crawl and report how long it took.
    spider = Douban()
    began = time.time()
    spider.start_work()
    elapsed = time.time() - began
    print("[INFO]: Useing %f secend" % elapsed)

協程

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from lxml import etree
import requests
import Queue
import time

import gevent
from gevent import monkey
monkey.patch_all()
# gevent 可以用同步的語法寫非同步的程式。
# monkey.patch_all() 在Python程式執行的時候,會動態的將網路庫(socket, select)打個補丁,變為非同步的庫。
# 讓程式在進行網路操作的時候,都變為非同步的方式去執行。

class Douban(object):
    """Crawl the Douban Top 250 movie pages using gevent coroutines.

    Each page URL is fetched in its own greenlet created with
    ``gevent.spawn``. Parsed "score<TAB>title" records are pushed onto
    ``data_queue`` and printed by ``start_work`` once all greenlets finish.

    NOTE(review): in this file ``monkey.patch_all()`` runs after
    ``import requests``; gevent's documentation recommends patching before
    other modules are imported -- consider moving it to the very top.
    """

    def __init__(self, timeout=10):
        """Build the page URL list and the shared result queue.

        :param timeout: seconds to wait for each HTTP request before giving up.
        """
        self.base_url = "https://movie.douban.com/top250?start="
        # start = 0, 25, ..., 225 -> ten page URLs.
        self.url_list = [self.base_url + str(page) for page in range(0, 225 + 1, 25)]
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.timeout = timeout

        # Queue that collects the records produced by the coroutines.
        self.data_queue = Queue.Queue()
        self.count = 0

    def send_request(self, url):
        """Download one page and hand its body to ``parse_page``.

        A failed request is reported and skipped, so that one bad page does
        not take down its greenlet with an unhandled exception.

        :param url: page URL to fetch.
        """
        print(url)
        try:
            # Without a timeout a stalled connection would never complete.
            html = requests.get(url, headers=self.headers, timeout=self.timeout).content
        except requests.RequestException as err:
            print("[ERROR]: request failed for %s: %s" % (url, err))
            return

        # One-second pause after each download (presumably to go easy on the server).
        time.sleep(1)
        self.parse_page(html)

    def parse_page(self, html):
        """Extract score and title of every movie on a page into ``data_queue``.

        :param html: raw HTML of one listing page; an empty body is ignored.
        """
        if not html:
            return

        html_obj = etree.HTML(html)
        if html_obj is None:
            # Defensive: nothing parseable came back from the parser.
            return

        for node in html_obj.xpath("//div[@class='info']"):
            # Movie title
            titles = node.xpath("./div[@class='hd']/a/span/text()")
            # Movie rating
            scores = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")

            # Skip entries missing either field instead of raising IndexError.
            if not titles or not scores:
                continue

            # Store the record in the queue
            self.data_queue.put(scores[0] + "\t" + titles[0])

    def start_work(self):
        """Fetch all pages in coroutines, then print every record and the total."""
        # Spawn one coroutine job per URL and wait for all of them to finish.
        job_list = [gevent.spawn(self.send_request, url) for url in self.url_list]
        gevent.joinall(job_list)

        # All producers are done, so draining until empty() is safe here.
        while not self.data_queue.empty():
            print(self.data_queue.get())
            self.count += 1

        print("\n%d" % self.count)


if __name__ == "__main__":
    # Run the coroutine-based crawl and report how long it took.
    spider = Douban()
    began = time.time()
    spider.start_work()
    elapsed = time.time() - began
    print("[INFO]: Useing %f secend" % elapsed)