Python 爬取百度圖片
阿新 • • 發佈:2019-02-17
百度圖片抓包資料:
引數詳情:
資料解析:
from urllib import request, parse
from http import cookiejar
import re
import time
# 1. Extract data
def main(text, start, length, save_dir="F:/file/baidu/"):
    """Fetch one page of Baidu image-search results and download its thumbnails.

    Args:
        text: Search keyword; sent as both ``queryWord`` and ``word``.
        start: Offset of the first requested result (sent as ``pn``). Its
            hexadecimal form is also sent as ``gsm``.
        length: Number of results requested (sent as ``rn``).
        save_dir: Directory the images are written to. Files are named
            ``<index>.jpg`` where the index starts at ``start + 1``. The
            directory must already exist.

    Returns:
        True if at least one non-empty ``thumbURL`` was found and handed to
        ``downImg``; False otherwise.
    """
    # Local import so this block does not depend on the file's import header.
    import os

    query = {
        "tn": "resultjson_com",
        "ipn": "rj",
        "ct": "201326592",
        "is": "",
        "fp": "result",
        "queryWord": text,
        "cl": "2",
        "lm": "-1",
        "ie": "utf-8",
        "oe": "utf-8",
        "adpicid": "",
        "st": "",
        "z": "",
        "ic": "",
        "word": text,
        "s": "",
        "se": "",
        "tab": "",
        "width": "",
        "height": "",
        "face": "",
        "istype": "",
        "qc": "",
        "nc": "",
        "fr": "",
        "cg": "head",
        "pn": str(start),
        "rn": str(length),
        # Hex digits of the offset with the "0x" prefix stripped.
        "gsm": hex(start)[2:],
        # NOTE(review): looks like a millisecond timestamp copied from a
        # captured request - confirm whether it needs to be regenerated.
        "1511330964840": "",
    }

    # Install a cookie-aware opener as the process-wide default, so every
    # later request.urlopen() call goes through it.
    cookie_jar = cookiejar.CookieJar()
    opener = request.build_opener(
        request.HTTPCookieProcessor(cookie_jar), request.HTTPHandler
    )
    request.install_opener(opener)

    search_request = request.Request(
        "http://image.baidu.com/search/acjson?" + parse.urlencode(query),
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"},
    )
    # Close the HTTP response deterministically instead of leaking it.
    with request.urlopen(search_request) as response:
        body = response.read().decode()

    thumb_pattern = re.compile(r'"thumbURL":"[\w/\\:.,;=&]*"')
    # Each match is the whole '"thumbURL":"<url>"' token; strip the fixed
    # prefix and the closing quote to get the bare URL.
    prefix_len = len('"thumbURL":"')

    found = False
    for index, token in enumerate(thumb_pattern.findall(body), start + 1):
        url = token[prefix_len:-1]
        if not url:
            # The pattern also matches an empty value; there is nothing to
            # download, and Request("") would raise.
            continue
        downImg(url, os.path.join(save_dir, str(index) + ".jpg"))
        found = True
    return found
# Download an image
def downImg(url, path):
    """Download ``url`` and write the raw response body to ``path``.

    The URL is printed before the request is made. The request carries a
    browser ``User-Agent`` and a Baidu ``Referer``; the accompanying article
    notes that Baidu rejects image requests that lack them.

    Args:
        url: Address of the image to fetch.
        path: Destination file; opened in binary write mode, so an existing
            file is overwritten.
    """
    print(url)
    image_request = request.Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
            "Referer": "http://image.baidu.com/search/acjson",
        },
    )
    # Context managers release the response and the file handle even when
    # reading or writing raises.
    with request.urlopen(image_request) as response:
        payload = response.read()
    with open(path, "wb") as out_file:
        out_file.write(payload)
# Crawl page after page (30 results each) until a page yields no thumbnails,
# pausing 10 seconds after every page.
page = 0
while True:
    has_images = main("美女圖片", page * 30, 30)
    print("暫停中...")
    page += 1
    time.sleep(10)
    if not has_images:
        break
print("執行完成")
內容不多,沒有太複雜的地方。需要注意的是:下載圖片時,請求頭必須帶上 User-Agent 以及 Referer,否則百度會拒絕訪問。另外,百度的圖片連結只能訪問一次,訪問過後連結會立即失效。此外,抓取的資料量和時間都有限制,一次性能爬取的數量有限。