1. 程式人生 > >爬蟲系列---多線程爬取實例

爬蟲系列---多線程爬取實例

not 圖片 文件夾 nco get origin nal ade close

1.爬取站長圖片源碼

# 爬取站長 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html' 的所有古典美女圖片
import os
import time
import random
import requests
from lxml import etree
from multiprocessing.dummy import Pool
# Collect the URL of every listing page: the first page has no numeric
# suffix, pages 2-6 use the "_<n>.html" pattern.
url = 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html'
page_url_list = [
    f'http://sc.chinaz.com/tupian/gudianmeinvtupian_{i}.html'
    for i in range(2, 7)
]
page_url_list.insert(0, url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.20 Safari/537.36',
    # 'Content-Encoding': 'gzip',
    # 'Content-Type': 'text/html',
}

# Image URLs gathered by get_pig_url(); filled from the pool's worker threads.
pig_url_list = []


def get_pig_url(url):
    """Fetch one listing page and append its image URLs to pig_url_list.

    Args:
        url: address of a listing page.
    """
    response = requests.get(url=url, headers=headers)
    # Parse the page with XPath.
    tree = etree.HTML(response.content.decode())
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        src_list = div.xpath('.//img/@src2')
        if not src_list:
            # A div without an img/src2 attribute would make [0] raise
            # IndexError, which aborts the whole pool.map() call.
            continue
        pig_url_list.append(src_list[0])


def download(url):
    """Download the image at url and return its raw bytes."""
    return requests.get(url=url, headers=headers).content


def save_pig(data):
    """Write one image (bytes) to a randomly named .jpg in zhanzhangpig/.

    The name is random, so two images can draw the same number. The file
    is opened in exclusive-create mode and a new name is drawn on a clash,
    so an earlier image is never silently overwritten.
    """
    while True:
        name = str(random.randrange(0, 1000000)) + '.jpg'
        path = 'zhanzhangpig/' + name
        try:
            with open(path, 'xb') as f:
                f.write(data)
        except FileExistsError:
            # Name already taken (possibly by another thread): try again.
            continue
        return


os.makedirs('zhanzhangpig', exist_ok=True)

# Run the three stages on a thread pool; each map() blocks until done.
print('多線程爬取開始')
start_time = time.time()
pool = Pool(8)
pool.map(get_pig_url, page_url_list)
data_list = pool.map(download, pig_url_list)
pool.map(save_pig, data_list)
end_time = time.time()
print('多線程爬取結束')
print('耗時:', end_time - start_time)
# Shut the thread pool down.
pool.close()
pool.join()

技術分享圖片

技術分享圖片

2.爬取妹子網圖片(https://www.mzitu.com/tag/ugirls/)

import os
import time
import random
import requests
from lxml import etree
from multiprocessing.dummy import Pool
# One requests session object used for every request in this script.
session = requests.session()
# Make sure the output directory exists before any image is written.
os.makedirs('meizitu', exist_ok=True)

# Listing pages: the first page is the bare tag URL, pages 2-16 use the
# "page/<n>/" suffix.
url = 'https://www.mzitu.com/tag/ugirls/'
page_url_list = [f'https://www.mzitu.com/tag/ugirls/page/{i}/' for i in range(2, 17)]
page_url_list.insert(0, url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    # Header values are given as strings.
    'Upgrade-Insecure-Requests': '1',
    # Anti-scraping measure: the request must carry the address of the
    # page it originates from.
    'Referer': 'https://www.mzitu.com/tag/ugirls/',
}
# Image URLs collected from the listing pages.
pig_url_list = []
def get_pig_url(url):
    """Fetch one listing page and append its image URLs to pig_url_list.

    Args:
        url: address of a listing page.
    """
    response = session.get(url=url, headers=headers)
    # Parse the page with XPath.
    tree = etree.HTML(response.content.decode())
    item_list = tree.xpath('//ul[@id="pins"]/li')
    for item in item_list:
        src_list = item.xpath('.//img/@data-original')
        if not src_list:
            # An entry without an img/data-original attribute would make
            # [0] raise IndexError, which aborts the whole pool.map() call.
            continue
        pig_url_list.append(src_list[0])

def download(url):
    """Download the image at url and return its raw bytes.

    The request goes through the module-level session and sends the
    module-level headers.
    """
    return session.get(url=url, headers=headers).content

def save_pig(data):
    """Write one image (bytes) to a randomly named .jpg in meizitu/.

    The name is random, so two images can draw the same number. The file
    is opened in exclusive-create mode and a new name is drawn on a clash,
    so an earlier image is never silently overwritten.

    Args:
        data: raw image bytes to store.
    """
    while True:
        name = str(random.randrange(0, 1000000)) + '.jpg'
        path = 'meizitu/' + name
        try:
            with open(path, 'xb') as f:
                f.write(data)
        except FileExistsError:
            # Name already taken (possibly by another thread): try again.
            continue
        return

print('多線程爬取開始')
start_time = time.time()
# Start the thread pool.
pool = Pool(10)
# Multi-page crawl: collect image URLs from every listing page.
# (For a single-page run, call get_pig_url(url=url) instead.)
pool.map(get_pig_url, page_url_list)
# Download every image, then write each one to disk; map() blocks until
# all items of a stage are done.
data_list = pool.map(download, pig_url_list)
pool.map(save_pig, data_list)

# Shut the thread pool down.
pool.close()
pool.join()
end_time = time.time()
print('多線程爬取結束')
print('耗時:', end_time - start_time)
# -------------------- Count the files in the output folder --------------------
print(len(os.listdir('./meizitu')))

技術分享圖片

!!!384張美圖等你拿

技術分享圖片

爬蟲系列---多線程爬取實例