1. 程式人生 > >python3爬蟲 -----爬取百思不得姐信息-------http://www.budejie.com/

Python3 爬蟲:爬取百思不得姐信息(http://www.budejie.com/)

chrom tree www cti mozilla from tar 2-0 sum

# -*- coding: utf-8 -*-
# author: zxy
# Date: 2018-10-21
 4 
import csv
import threading
from queue import Empty, Queue

import request  # NOTE(review): looks like a typo/duplicate of "requests" below and is unused here -- confirm before removing
import requests
from lxml import etree
11 
# Request headers sent with every page download.
# NOTE(review): the Cookie value is a hard-coded browser session captured by
# the original author and may be stale -- confirm whether the site needs it.
_DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.100 Safari/537.36"
    ),
    "Cookie": (
        "__cfduid=ddb28ef1934faef742f7fb8911d7b33bd1540080067; "
        "UM_distinctid=16693ece9945b2-0b031da4b19f32-333b5602-1fa400-16693ece9958e4; "
        "_ga=GA1.2.1950184368.1540080070; "
        "_gid=GA1.2.1249143498.1540080070; _gat=1"
    ),
}

# Item placed on the joke queue to tell one consumer that no more rows follow.
_STOP = None


class Produce(threading.Thread):
    """Worker thread that downloads list pages and queues (joke, link) pairs.

    Page URLs are taken from ``page_queue`` until it is empty; every entry
    found on a page is put on ``joke_queue`` as a ``(joke, link)`` tuple.
    """

    headers = dict(_DEFAULT_HEADERS)
    # Seconds to wait for the server before giving up on a page, so a stalled
    # connection cannot hang the thread forever.
    request_timeout = 10

    def __init__(self, page_queue, joke_queue, *args, **kwargs):
        """Store the input/output queues; extra arguments go to ``Thread``."""
        super().__init__(*args, **kwargs)
        self.base_domain = "http://www.budejie.com"
        self.page_queue = page_queue
        self.joke_queue = joke_queue

    def run(self):
        """Process page URLs until the page queue is exhausted."""
        while True:
            # get_nowait() instead of "if empty(): break; get()": with several
            # producers another thread can take the last URL between the two
            # calls, and a blocking get() would then never return.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_url(url)

    def _make_record(self, texts, hrefs):
        """Build one ``(joke, link)`` tuple from an entry's texts and hrefs.

        ``texts`` are joined with newlines and stripped.  ``link`` is the
        first href prefixed with the site domain, or ``""`` when the entry
        has no anchor at all.
        """
        joke = "\n".join(texts).strip()
        link = self.base_domain + hrefs[0] if hrefs else ""
        return joke, link

    def parse_url(self, url):
        """Download ``url`` and queue every joke entry found on the page.

        A failed download is reported and skipped so that one bad page does
        not kill the whole worker thread.
        """
        try:
            reponse = requests.get(
                url, headers=self.headers, timeout=self.request_timeout
            )
        except requests.RequestException as exc:
            print("failed to download %s: %s" % (url, exc))
            return

        html = etree.HTML(reponse.text)
        for desc in html.xpath("//div[@class='j-r-list-c-desc']"):
            record = self._make_record(
                desc.xpath(".//text()"), desc.xpath(".//a/@href")
            )
            self.joke_queue.put(record)
        print("=" * 30 + "第%s頁下載完成!" % url.split("/")[-1] + "=" * 30)


class Consumer(threading.Thread):
    """Worker thread that writes queued ``(joke, link)`` pairs to a CSV writer."""

    headers = dict(_DEFAULT_HEADERS)
    # Seconds to wait for a new row before assuming the producers are done.
    idle_timeout = 40

    def __init__(self, joke_queue, write, gLock, *args, **kwargs):
        """Store the queue, the shared writer and the lock guarding it."""
        super().__init__(*args, **kwargs)
        self.joke_queue = joke_queue
        self.write = write
        self.gLock = gLock

    def run(self):
        """Write rows until a stop marker arrives or the queue stays idle."""
        while True:
            try:
                joke_info = self.joke_queue.get(timeout=self.idle_timeout)
            except Empty:
                break
            if joke_info is _STOP:
                break

            joke, link = joke_info
            try:
                # "with" releases the lock even when writerow() raises; a
                # bare acquire()/release() pair would leave it held and
                # deadlock the other consumers.
                with self.gLock:
                    self.write.writerow((joke, link))
            except (csv.Error, OSError, ValueError) as exc:
                print("failed to write row: %s" % exc)


def main(output_path="baisibudejie.csv"):
    """Scrape list pages 1-10 and append the jokes to ``output_path``.

    Starts 5 producer threads and 3 consumer threads, waits for all of them,
    and closes the CSV file once every row has been written.
    """
    page_queue = Queue(100)
    joke_queue = Queue(1000)
    gLock = threading.Lock()

    # Fill the page queue before any producer starts: producers exit as soon
    # as they find it empty.
    for x in range(1, 11):
        page_queue.put("http://www.budejie.com/%d" % x)

    with open(output_path, "a", newline="", encoding="utf-8") as fp:
        writer = csv.writer(fp)
        # The file is opened in append mode, so only write the header row
        # when the file is still empty.
        fp.seek(0, 2)
        if fp.tell() == 0:
            writer.writerow(("content", "link"))

        producers = [Produce(page_queue, joke_queue) for _ in range(5)]
        consumers = [Consumer(joke_queue, writer, gLock) for _ in range(3)]
        for worker in producers + consumers:
            worker.start()

        for worker in producers:
            worker.join()
        # All rows are queued now: send one stop marker per consumer so they
        # finish right away instead of waiting for the idle timeout.
        for _ in consumers:
            joke_queue.put(_STOP)
        for worker in consumers:
            worker.join()


if __name__ == "__main__":
    main()

Python3 爬蟲:爬取百思不得姐信息(http://www.budejie.com/)