爬蟲系列---多線程爬取實例
阿新 • • 發佈:2019-02-28
not 圖片 文件夾 nco get origin nal ade close
1.爬取站長圖片源碼
# Crawl all "classical beauty" pictures from
# http://sc.chinaz.com/tupian/gudianmeinvtupian.html using a thread pool
# (multiprocessing.dummy = thread-backed Pool; appropriate for I/O-bound work).
import os
import random
import time

import requests
from lxml import etree
from multiprocessing.dummy import Pool

# First listing page plus paginated pages 2..6 (the site numbers page 1 without a suffix).
url = 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html'
page_url_list = [f'http://sc.chinaz.com/tupian/gudianmeinvtupian_{i}.html' for i in range(2, 7)]
page_url_list.insert(0, url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.20 Safari/537.36',
}

# Image URLs collected from every listing page. Shared across worker threads;
# list.append is atomic in CPython, so no extra locking is needed here.
pig_url_list = []

SAVE_DIR = 'zhanzhangpig'


def get_pig_url(url):
    """Fetch one listing page and append every image URL found to pig_url_list."""
    response = requests.get(url=url, headers=headers)
    tree = etree.HTML(response.content.decode())
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        # The site lazy-loads thumbnails: the real image URL is in @src2, not @src.
        pig_url_list.append(div.xpath('.//img/@src2')[0])


def download(url):
    """Download one image; return (url, bytes) so the saver can derive a stable filename."""
    return url, requests.get(url=url, headers=headers).content


def save_pig(item):
    """Save one (url, data) pair under SAVE_DIR.

    The filename is the URL basename, which fixes the original random-number
    scheme: random names could collide and silently overwrite earlier images.
    Falls back to a random .jpg name only if the URL has no basename.
    """
    url, data = item
    name = url.split('/')[-1] or str(random.randrange(0, 1000000)) + '.jpg'
    os.makedirs(SAVE_DIR, exist_ok=True)  # idempotent; safe from any thread
    with open(os.path.join(SAVE_DIR, name), 'wb') as f:
        f.write(data)


def main():
    """Collect image URLs from all pages, download them, and save them, concurrently."""
    print('多線程爬取開始')
    start_time = time.time()
    pool = Pool(8)
    try:
        pool.map(get_pig_url, page_url_list)
        data_list = pool.map(download, pig_url_list)
        pool.map(save_pig, data_list)
    finally:
        # Always release the worker threads, even if a request raised.
        pool.close()
        pool.join()
    end_time = time.time()
    print('多線程爬取結束')
    print('耗時:', end_time - start_time)


if __name__ == '__main__':
    main()
2 爬取妹子網圖片(https://www.mzitu.com/tag/ugirls/)
# Crawl pictures from https://www.mzitu.com/tag/ugirls/ with a thread pool
# (multiprocessing.dummy = thread-backed Pool; appropriate for I/O-bound work).
import os
import random
import time

import requests
from lxml import etree
from multiprocessing.dummy import Pool

# One shared session reuses connections and cookies across all worker threads.
session = requests.session()

SAVE_DIR = 'meizitu'

# First listing page plus paginated pages 2..16.
url = 'https://www.mzitu.com/tag/ugirls/'
page_url_list = [f'https://www.mzitu.com/tag/ugirls/page/{i}/' for i in range(2, 17)]
page_url_list.insert(0, url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    # Anti-scraping countermeasure: the site rejects requests without a Referer
    # pointing back at its own pages.
    'Referer': 'https://www.mzitu.com/tag/ugirls/',
}

# Image URLs collected from every listing page. Shared across worker threads;
# list.append is atomic in CPython, so no extra locking is needed here.
pig_url_list = []


def get_pig_url(url):
    """Fetch one listing page and append every image URL found to pig_url_list."""
    response = session.get(url=url, headers=headers)
    tree = etree.HTML(response.content.decode())
    li_list = tree.xpath('//ul[@id="pins"]/li')
    for li in li_list:
        # The site lazy-loads thumbnails: the real image URL is in @data-original.
        pig_url_list.append(li.xpath('.//img/@data-original')[0])


def download(url):
    """Download one image; return (url, bytes) so the saver can derive a stable filename."""
    return url, session.get(url=url, headers=headers).content


def save_pig(item):
    """Save one (url, data) pair under SAVE_DIR.

    The filename is the URL basename, which fixes the original random-number
    scheme: random names could collide and silently overwrite earlier images.
    Falls back to a random .jpg name only if the URL has no basename.
    """
    url, data = item
    name = url.split('/')[-1] or str(random.randrange(0, 1000000)) + '.jpg'
    os.makedirs(SAVE_DIR, exist_ok=True)  # idempotent; safe from any thread
    with open(os.path.join(SAVE_DIR, name), 'wb') as f:
        f.write(data)


def main():
    """Collect image URLs from all pages, download them, save them, then report a count."""
    # Create the target directory up front so the final listdir cannot fail
    # even when the crawl saves nothing.
    os.makedirs(SAVE_DIR, exist_ok=True)
    print('多線程爬取開始')
    start_time = time.time()
    pool = Pool(10)
    try:
        pool.map(get_pig_url, page_url_list)
        data_list = pool.map(download, pig_url_list)
        pool.map(save_pig, data_list)
    finally:
        # Always release the worker threads, even if a request raised.
        pool.close()
        pool.join()
    end_time = time.time()
    print('多線程爬取結束')
    print('耗時:', end_time - start_time)
    # -------------------- count the files saved to the folder -----------------
    print(len(os.listdir('./meizitu')))


if __name__ == '__main__':
    main()
!!!384張美圖等你拿
爬蟲系列---多線程爬取實例