1. 程式人生 > >python爬蟲爬取百度圖片

python爬蟲爬取百度圖片

爬蟲爬取百度圖片

因公司業務需要,而且公司人手不足,我這個測試工程師需要臨時客串一下其他職位,所以,由我來爬取百度圖片。

說明

1、最近稍微有點兒忙,沒顧得上整理。而且程式碼量比較少,所以註釋比較少。
2、如果需要直接使用我的程式碼,請將程式碼中的 Excel 檔案路徑與圖片儲存目錄改成您自己的。具體使用方法我會在下面程式碼的註釋中說明。
3、執行環境:Python 2.7。

實現思路及功能

1.讀取excel中第一列的關鍵詞,儲存在列表中,等待遍歷
2.根據關鍵詞開啟執行緒
3.將關鍵詞傳入 img 函式,開始獲取圖片
4.將圖片儲存在指定目錄

上程式碼

#__author__ = 'chubbysuperman'
#_*_coding=utf-8 _*
_ import requests from fake_useragent import UserAgent import xlrd from compiler.ast import flatten import os import time import threading def imgUrls(keyWord, userAgent, pn): url = 'https://image.baidu.com/search/index' params = {'tn': 'resultjson_com', 'ipn': 'rj', 'ct': '201326592', 'is': '', 'fp'
: 'result', 'queryWord': keyWord, 'cl': '2', 'lm': '-1', 'ie': 'utf-8', 'oe': 'utf-8', 'adpicid': '', 'st': '-1', 'z': '', 'ic': '0', 'word': keyWord, 's': '', 'se': '', 'tab': '', 'width': '', 'height': '', 'face': '0', 'istype': '2', 'qc': '', 'nc': '1', 'fr': '', 'pn': pn, 'rn': 200, 'gsm': '1e', '1491808945838'
: '' } rep = requests.get(url, headers={'user-Agent': userAgent}, params=params,timeout=(4,7)) if(int(rep.status_code) == 200): try: time.sleep(1) imgs = rep.json() def decodeUrl(imgUrl): longDic={'_z2C$q': ":",'_z&e3B': ".",'AzdH3F': "/"} mapDic={'w': "a",'k': "b",'v': "c",'1': "d",'j': "e",'u': "f",'2': "g",'i': "h",'t': "i",'3': "j",'h': "k",'s': "l",'4': "m",'g': "n","5": "o",'r': "p",'q': "q","6": "r",'f': "s",'p': "t","7": "u",'e': "v",'o': "w","8": "1",'d': "2",'n': "3","9": "4",'c': "5",'m': "6","0": "7",'b': "8",'l': "9",'a': "0"} for k in longDic: imgUrl=imgUrl.replace(k,longDic[k]) imgUrl=list(imgUrl) tmp=[] for i in imgUrl: if i in mapDic: tmp.append(mapDic[i]) else: tmp.append(i) return ''.join(tmp) imgUrls = [decodeUrl(imgs['data'][sec]['objURL']) for sec in range(len(imgs['data']) - 1)] result = imgUrls status = True except Exception as e: result = 'wuyunlunbi' status = False finally: return {'result': result, 'status': status} def img(keyWord, userAgent): add1=[] for i in range(100): #print(imgUrls(keyWord,userAgent,pn=i*20)['result']) add1.append(imgUrls(keyWord,userAgent,pn=i*20)['result']) add1 = flatten(add1) #return {keyWord:add1} x = keyWord print(len(add1)) print(add1) os.makedirs(r'D:\yyyyy5\%s'%x) #建立儲存目錄 for iii in range(len(add1)): print(iii) iii = add1[iii] iii = iii.replace(" ","") time.sleep(0.15) if 'wuyunlunbi' in iii: print('error_%s'%iii) elif "yuan_" in iii: print("error001_%s"%iii) else: try: a = requests.get('%s'%iii,timeout=(3,4)) img = a.content ccc =time.time() time.sleep(0.15) asdf = 'D:/yyyyy5/%s/%s.jpg'%(x,ccc) #將圖片寫入指定目錄 with open( asdf,'wb' ) as f: f.write(img) except Exception as e: pass if __name__ == '__main__': workbook = xlrd.open_workbook(r'C:\Users\Administrator\Desktop\Ashicai (2).xlsx') #這是關鍵詞儲存的excel,請將關鍵詞放在第一個sheet中的第一列 a = workbook.sheet_by_index(0).col_values(0) ua = UserAgent() urls = [] for x in range(len(a)): aa = time.time() threading.Thread(target=img,args=(a[x],ua.random)).start() time.sleep(0.05) 
#urls.append(img(keyWord=a[x], userAgent=ua.random)) print(aa)