1. 程式人生 > 多執行緒爬取鬥圖圖片

多執行緒爬取鬥圖圖片

結果演示

 

程式碼:

#encoding:utf-8
# __author__ = 'donghao'
# __time__ = 2018/12/24 15:20
import requests
import threading
import urllib.request
import urllib3
import os
import re
import time
from lxml import etree
from queue import Queue

#負責解析圖片
class Producer(threading.Thread):
    """Worker thread that turns listing-page URLs into download jobs.

    A producer takes page URLs from ``page_queue``, fetches and parses each
    page, and puts one ``(img_url, filename)`` tuple on ``img_queue`` for
    every ``<img>`` element matched by the XPath in :meth:`parse_page`.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0'
    }
    # Seconds to wait for the server before giving up on a page, so that a
    # stalled connection cannot block this thread indefinitely.
    request_timeout = 10
    # Characters stripped from generated file names.
    # NOTE(review): ',' and '?' appear twice in the class; the duplicates were
    # possibly full-width variants before the text was normalised - confirm.
    _unsafe_chars = re.compile(r'[,。??,/\\·]')

    def __init__(self, img_queue, page_queue, *args, **kwargs):
        """Store the shared queues; extra arguments go to ``threading.Thread``.

        Args:
            img_queue: queue receiving ``(img_url, filename)`` tuples.
            page_queue: queue holding the listing-page URLs to process.
        """
        super().__init__(*args, **kwargs)
        self.img_queue = img_queue
        self.page_queue = page_queue

    def run(self):
        """Process page URLs until ``page_queue`` is drained."""
        from queue import Empty  # local import: the file only imports Queue

        while True:
            # get_nowait() tests and removes in one step. A separate empty()
            # check followed by a blocking get() can hang forever when
            # another producer takes the last URL in between.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            try:
                self.parse_page(url)
            except requests.RequestException as exc:
                # One unreachable page must not stop the remaining pages.
                print('failed to fetch %s: %s' % (url, exc))

    def parse_page(self, url):
        """Fetch ``url`` and enqueue a download job for each matched image.

        Args:
            url: address of the listing page to fetch.

        Raises:
            requests.RequestException: if the page cannot be fetched.
        """
        resp = requests.get(url=url, headers=self.headers,
                            timeout=self.request_timeout)
        html = etree.HTML(resp.text)
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            img_url = img.get('data-original')
            if not img_url:
                # Element has no 'data-original' attribute: nothing to download.
                continue
            name = self._build_filename(img_url, img.get('alt'))
            # Hand the image URL and its target file name to the consumers.
            self.img_queue.put((img_url, name))

    @classmethod
    def _build_filename(cls, img_url, alt):
        """Return a file name built from the image description and URL suffix.

        The unsafe characters are removed from both parts. When ``alt`` is
        missing or empty after clean-up, the last path component of
        ``img_url`` is used as the name instead.
        """
        # Split only the last URL component so dots in the host name are not
        # mistaken for a file extension.
        stem, suffix = os.path.splitext(os.path.basename(img_url))
        suffix = cls._unsafe_chars.sub('', suffix)
        title = cls._unsafe_chars.sub('', alt or '')
        if not title:
            title = cls._unsafe_chars.sub('', stem)
        return title + suffix
            
#負責下載圖片
class Consumer(threading.Thread):
    """Worker thread that downloads queued images to disk.

    A consumer takes ``(img_url, filename)`` tuples from ``img_queue`` and
    saves each image under :attr:`save_dir`. It stops once no job arrives
    within :attr:`poll_timeout` seconds while ``page_queue`` is empty.

    NOTE: the timeout is a grace period, not a hand-off guarantee. A
    producer that needs longer than ``poll_timeout`` to parse its final
    page can still enqueue jobs after every consumer has exited.
    """

    # Directory the images are written to.
    save_dir = 'images'
    # Seconds to wait for a new job before re-checking whether to stop.
    poll_timeout = 3

    def __init__(self, img_queue, page_queue, *args, **kwargs):
        """Store the shared queues; extra arguments go to ``threading.Thread``.

        Args:
            img_queue: queue supplying ``(img_url, filename)`` tuples.
            page_queue: queue of unprocessed page URLs, used only to decide
                when to stop.
        """
        super().__init__(*args, **kwargs)
        self.img_queue = img_queue
        self.page_queue = page_queue

    def run(self):
        """Download images until both queues have run dry."""
        from queue import Empty  # local import: the file only imports Queue

        # urlretrieve does not create missing directories.
        os.makedirs(self.save_dir, exist_ok=True)
        while True:
            # A blocking get() after a separate empty() check would hang
            # forever once another consumer takes the last job, so wait
            # with a timeout and re-evaluate the stop condition instead.
            try:
                img_url, filename = self.img_queue.get(timeout=self.poll_timeout)
            except Empty:
                if self.page_queue.empty():
                    break
                continue
            target = os.path.join(self.save_dir, filename)
            try:
                urllib.request.urlretrieve(img_url, target)
            except OSError as exc:
                # URLError/HTTPError and file-system errors are OSError
                # subclasses; skip this image and keep the thread alive.
                print('failed to download %s: %s' % (img_url, exc))
                continue
            print(filename+'張圖片下載完成')

def main(page_count=10, producer_count=5, consumer_count=5):
    """Crawl the listing pages with worker threads and print the elapsed time.

    Args:
        page_count: number of listing pages to crawl, starting at page 1.
        producer_count: number of ``Producer`` threads parsing pages.
        consumer_count: number of ``Consumer`` threads downloading images.
    """
    start = time.time()

    image_queue = Queue(1000)
    # Unbounded: every URL is queued before any worker starts, so a bounded
    # queue smaller than page_count would block here forever.
    page_queue = Queue()
    # range() excludes its upper bound; "+ 1" makes the last page inclusive
    # (the original range(1, 10) crawled 9 pages instead of the intended 10).
    for page in range(1, page_count + 1):
        page_queue.put('http://www.doutula.com/photo/list/?page=%d' % page)

    workers = []
    for _ in range(producer_count):
        worker = Producer(image_queue, page_queue)
        worker.start()
        workers.append(worker)

    for _ in range(consumer_count):
        worker = Consumer(image_queue, page_queue)
        worker.start()
        workers.append(worker)

    # Wait for every worker to finish before reporting the elapsed time.
    for worker in workers:
        worker.join()

    end = time.time()
    print('耗時:%0.002fs' % (end - start))


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()