1. 程式人生 > 爬蟲_鬥圖啦(隊列,多線程)

爬蟲_鬥圖啦(隊列,多線程)

produce rom return range while rod 爬蟲 put 2.0

import os
import re
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree


# Characters removed from an image's alt text before it is used as a file
# name: the punctuation the original script stripped, plus the characters
# that are not allowed in Windows file names.
_UNSAFE_FILENAME_CHARS = re.compile(r'[,。??,/\\·:*"<>|]')

# Directory (relative to the working directory) that images are saved into.
IMAGE_DIR = 'images'


class Producer(threading.Thread):
    """Worker thread that turns list-page URLs into image download jobs.

    Each producer repeatedly takes a page URL from ``page_queue``, scrapes the
    page for images and puts one ``(img_url, img_name)`` tuple per image onto
    ``img_queue``. The thread ends once ``page_queue`` has no more URLs.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/62.0.3202.94 Safari/537.36'
    }
    # Seconds to wait for a list page; without a timeout a stalled
    # connection would block the thread indefinitely.
    request_timeout = 10

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared queues; extra arguments go to ``Thread``."""
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Parse pages until ``page_queue`` is drained."""
        while True:
            # get_nowait() instead of "empty() then get()": with several
            # producers another thread can take the last URL between the
            # check and the blocking get, which would hang this thread.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            # Thread-level boundary: one broken page must not kill the
            # worker, otherwise the remaining pages would never be parsed.
            try:
                self.parse_page(url)
            except Exception as exc:
                print('skip page %s: %s' % (url, exc))

    def parse_page(self, url):
        """Fetch one list page and enqueue every non-gif image found on it.

        Raises ``requests.RequestException`` when the page cannot be fetched
        or the server answers with an error status.
        """
        response = requests.get(url, headers=self.headers,
                                timeout=self.request_timeout)
        response.raise_for_status()
        html = etree.HTML(response.text)
        if html is None:  # nothing parseable in the response body
            return
        imgs = html.xpath("//div[@class='page-content text-center']//a//img")
        for img in imgs:
            if img.get('class') == 'gif':
                continue
            img_url = img.get('data-original')
            if not img_url:
                # No lazy-load source on this <img>; nothing to download.
                continue
            img_name = self._make_filename(img.get('alt') or '', img_url)
            self.img_queue.put((img_url, img_name))

    @staticmethod
    def _make_filename(alt, img_url):
        """Build a local file name from the alt text and the image URL.

        The extension comes from the URL path (query string ignored). When
        the cleaned alt text is empty, the URL's own base name is used so
        that different images do not all collapse onto the same name.
        """
        url_path = img_url.split('?', 1)[0]
        stem, suffix = os.path.splitext(os.path.basename(url_path))
        cleaned = _UNSAFE_FILENAME_CHARS.sub('', alt).strip()
        return (cleaned or stem) + suffix


class Consumer(threading.Thread):
    """Worker thread that downloads the images queued by the producers.

    Takes ``(url, filename)`` tuples from ``img_queue`` and saves each image
    under ``IMAGE_DIR``. The thread ends when ``img_queue`` is empty,
    ``page_queue`` is empty and no ``Producer`` thread is alive any more.
    Start consumers after the producers (as ``main`` does).
    """

    # Seconds to wait for a new job before re-checking the exit condition.
    poll_interval = 1.0

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared queues; extra arguments go to ``Thread``."""
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download queued images until no more work can arrive."""
        while True:
            # Take this snapshot BEFORE reading the queue: if the producers
            # were already finished and the queue then turns out to be empty,
            # nothing can be added afterwards and it is safe to stop.
            # Checking only "both queues empty" is not enough, because a
            # producer may still be parsing the last page at that moment.
            producers_done = self.page_queue.empty() and not any(
                isinstance(t, Producer) for t in threading.enumerate()
            )
            try:
                if producers_done:
                    url, filename = self.img_queue.get_nowait()
                else:
                    # Bounded wait instead of a blocking get, so the thread
                    # cannot hang when another consumer takes the last job.
                    url, filename = self.img_queue.get(
                        timeout=self.poll_interval)
            except Empty:
                if producers_done:
                    return
                continue
            try:
                request.urlretrieve(url, os.path.join(IMAGE_DIR, filename))
            except (OSError, ValueError) as exc:
                # URLError/HTTPError are OSError subclasses; ValueError is
                # raised for malformed URLs. Skip the image, keep the worker.
                print('%s failed: %s' % (filename, exc))
                continue
            print(filename + ' 下載')


def main():
    """Queue 100 list pages and run 5 producer and 5 consumer threads."""
    # urlretrieve does not create missing directories.
    os.makedirs(IMAGE_DIR, exist_ok=True)

    page_queue = Queue(100)
    img_queue = Queue(500)
    for x in range(1, 101):
        url = "http://www.doutula.com/photo/list/?page=%d" % x
        page_queue.put(url)

    # Producers first, so consumers see them alive when they start polling.
    workers = [Producer(page_queue, img_queue) for _ in range(5)]
    workers += [Consumer(page_queue, img_queue) for _ in range(5)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()

下載速度相當快啊!

爬蟲_鬥圖啦(隊列,多線程)