1. 程式人生 > >Python 爬取百度圖片

Python 爬取百度圖片

百度圖片抓包資料:

引數詳情:

資料解析:

import os
import re
import time
from http import cookiejar
from urllib import request, parse


# Browser-like User-Agent sent with every request.  The accompanying article
# notes that Baidu refuses image downloads that lack User-Agent and Referer.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
)
_SEARCH_URL = "http://image.baidu.com/search/acjson"

# Matches one `"thumbURL":"..."` field in the raw response text.
_THUMB_PATTERN = re.compile(r'"thumbURL":"[\w/\\:.,;=&]*"')
# Number of characters in the leading `"thumbURL":"` that precede the URL.
_THUMB_PREFIX_LEN = len('"thumbURL":"')


# 1. Extract data
def main(text, start, length, save_dir="F:/file/baidu/"):
    """Fetch one page of Baidu image search results and download the thumbnails.

    Args:
        text: Search keyword, sent as both ``queryWord`` and ``word``.
        start: Zero-based offset of the first result (``pn``).  Its hex form
            is sent as ``gsm``.
        length: Number of results requested for this page (``rn``).
        save_dir: Directory the images are written to, as
            ``<save_dir>/<index>.jpg`` where ``index`` starts at ``start + 1``.

    Returns:
        True if at least one thumbnail URL was found (and downloaded),
        False if the response contained none.

    Raises:
        urllib.error.URLError: If the search request or a download fails.
        OSError: If an image cannot be written to disk.
    """
    query = {
        "tn": "resultjson_com",
        "ipn": "rj",
        "ct": "201326592",
        "is": "",
        "fp": "result",
        "queryWord": text,
        "cl": "2",
        "lm": "-1",
        "ie": "utf-8",
        "oe": "utf-8",
        "adpicid": "",
        "st": "",
        "z": "",
        "ic": "",
        "word": text,
        "s": "",
        "se": "",
        "tab": "",
        "width": "",
        "height": "",
        "face": "",
        "istype": "",
        "qc": "",
        "nc": "",
        "fr": "",
        "cg": "head",
        "pn": str(start),
        "rn": str(length),
        "gsm": hex(start)[2:],
        # NOTE(review): looks like a captured timestamp used as a bare
        # query key; kept exactly as in the original request.
        "1511330964840": "",
    }

    # Install a cookie-aware opener globally so that the later downloads in
    # downImg() (which use request.urlopen) share the same cookie jar.
    cookie_jar = cookiejar.CookieJar()
    cookie_support = request.HTTPCookieProcessor(cookie_jar)
    opener = request.build_opener(cookie_support, request.HTTPHandler)
    request.install_opener(opener)

    req = request.Request(
        _SEARCH_URL + "?" + parse.urlencode(query),
        headers={"User-Agent": _USER_AGENT},
    )
    with request.urlopen(req) as response:
        body = response.read().decode()

    matches = _THUMB_PATTERN.findall(body)
    for index, match in enumerate(matches, start + 1):
        # Strip the leading `"thumbURL":"` and the trailing quote.
        url = match[_THUMB_PREFIX_LEN:-1]
        downImg(url, os.path.join(save_dir, str(index) + ".jpg"))
    return bool(matches)


# Download an image
def downImg(url, path):
    """Download ``url`` and write the raw bytes to ``path``.

    The request carries a User-Agent and a Referer header.  Missing parent
    directories of ``path`` are created first.

    Args:
        url: Address of the image to fetch.
        path: Destination file path; an existing file is overwritten.

    Raises:
        urllib.error.URLError: If the download fails.
        OSError: If the directory or file cannot be created or written.
    """
    print(url)
    req = request.Request(
        url,
        headers={
            "User-Agent": _USER_AGENT,
            "Referer": "http://image.baidu.com/search/acjson",
        },
    )
    with request.urlopen(req) as response:
        data = response.read()

    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # `with` guarantees the handle is closed even if the write fails.
    with open(path, "wb") as out:
        out.write(data)


if __name__ == "__main__":
    # Fetch pages of 30 results until a page yields no thumbnails, pausing
    # 10 seconds after every page.
    page = 0
    has_more = True
    while has_more:
        has_more = main("美女圖片", page * 30, 30)
        print("暫停中...")
        page += 1
        time.sleep(10)
    print("執行完成")

資訊不多,沒有什麼太多要說明的事情。需要注意的是:下載圖片時,請求頭需要新增 User-Agent 以及 Referer,否則百度會拒絕訪問。另外,百度的圖片連結只能訪問一次,訪問一次過後連結立即失效。還有,抓取的資料量以及時間都有限制,一次性爬取的數量有限。