Python 爬蟲 爬取京東 商品評論 資料,並存入CSV檔案
阿新 • • 發佈:2019-01-10
利用閒暇時間寫了一個抓取京東商品評論資料的爬蟲。之前寫了抓取拉勾網資料的爬蟲,請參考1、參考2。
我的開發環境是Windows + Anaconda3(Python 3.6),家用電腦沒安裝Linux(Linux下也是可以的)。
京東的評論資料是通過介面提供的,所以先找到這個介面。
用Firefox開啟京東網站,隨便找到一個商品頁面,點選評論,然後點選頁面右上角‘三橫’標籤,找到‘WEB開發者’,再開啟‘WEB控制檯’,點選‘網路’標籤,然後點選‘下一頁’,找到‘productPageComments.action’這行,顯示如下。
然後在右邊方框裡找到‘訊息頭’、‘引數’、‘響應’,這裡能查到偽裝瀏覽器、傳遞的引數及返回的資料資訊,這些資料資訊在程式裡都能用到。
紅框裡是回撥函式,在程式裡的變數設定裡會用到,請注意一下。完整程式如下:
# -*- coding: utf-8 -*-
"""Crawler for JD.com product comments.

Fetches paged comment data from JD's ``productPageComments.action``
JSONP endpoint, saves each page to its own CSV file and finally writes
one combined CSV with all pages.
"""
import pandas as pd
import urllib.request as req
import json
import sys
import time
import random


class JDCommentsCrawler:
    """Download comment pages for one JD product and dump them to CSV."""

    def __init__(self, productId=None, callback=None, page=1, score=0,
                 sortType=5, pageSize=10):
        self.productId = productId  # product id (taken from the item page URL)
        self.score = score          # comment type: 3 good, 2 medium, 1 bad, 0 all
        self.sortType = sortType    # sort order: 5 recommended, 6 by time
        self.pageSize = pageSize    # records per page (default 10)
        self.callback = callback    # JSONP callback name; differs per product
        self.page = page            # number of pages to crawl
        self.locationLink = 'https://sclub.jd.com/comment/productPageComments.action'
        self.paramValue = {
            'callback': self.callback,
            'productId': self.productId,
            'score': self.score,
            'sortType': self.sortType,
            'pageSize': self.pageSize,
        }
        self.locationUrl = None

    def paramDict2Str(self, params):
        """Serialize *params* as 'k=v&k=v&' (the trailing '&' is intentional:
        concatLinkParam appends further fixed parameters after it)."""
        return ''.join('%s=%s&' % (k, v) for k, v in params.items())

    def concatLinkParam(self):
        """Build the full request URL (page 0); must be called before crawling."""
        self.locationUrl = (self.locationLink + '?' +
                            self.paramDict2Str(self.paramValue) +
                            'isShadowSku=0&fold=1&page=0')

    def _headers(self):
        """Browser-like request headers shared by all requests.

        Referer and Host mimic a real item-page visit — presumably required
        by the endpoint's anti-scraping checks (TODO confirm).
        """
        return {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Referer': 'https://item.jd.com/%d.html' % (self.productId),
            'Host': 'sclub.jd.com',
        }

    def requestMethod(self):
        """Return a Request object for page 0 (kept for backward compatibility)."""
        reqs = req.Request(self.locationUrl, headers=self._headers())
        print('reqs : ', reqs)
        return reqs

    def _parse_jsonp(self, raw):
        """Strip the JSONP wrapper 'callback(...);' and parse the JSON body."""
        return json.loads(raw[len(self.callback) + 1:-2])

    def showList(self):
        """Fetch and parse page 0 of the comment list."""
        conn = req.urlopen(self.requestMethod())
        return self._parse_jsonp(conn.read().decode('gbk'))

    def requestMethodPage(self, p):
        """Return a Request object for comment page *p*."""
        # Replace the trailing page number explicitly. The original code used
        # the fragile slice `self.locationUrl[:-1] + str(p)`, which depended on
        # the URL ending in exactly one digit.
        url = self.locationUrl.rsplit('page=', 1)[0] + 'page=%d' % p
        print('url : ', url)
        return req.Request(url, headers=self._headers())

    def showListPage(self, p):
        """Fetch and parse comment page *p*."""
        conn = req.urlopen(self.requestMethodPage(p))
        return self._parse_jsonp(conn.read().decode('gbk'))

    def save_csv(self, df, p, out_dir='d:\\xxx\\jd'):
        """Write *df* to <out_dir>\\jd_<p>.csv, GBK-encoded.

        NOTE(review): the default out_dir is the article's placeholder path —
        pass a directory that actually exists on your machine.
        """
        df.to_csv(path_or_buf='%s\\jd_%d.csv' % (out_dir, p), encoding='gbk')

    def crawler(self):
        """Crawl ``self.page`` pages, saving each page and a combined CSV.

        Sleeps a random 31–52 s *between* pages to avoid being blocked
        (the original also slept after the final page, which was pointless).
        """
        dfs = []
        cols = ['comment_id', 'product_id', 'guid', 'content', 'create_time',
                'reference_id', 'reference_time', 'score',
                'nickname', 'user_level', 'is_mobile', 'user_client']
        for p in range(self.page):
            json_info = self.showListPage(p)
            productId = json_info['productCommentSummary']['productId']
            rows = [[com['id'], productId, com['guid'], com['content'],
                     com['creationTime'], com['referenceId'],
                     com['referenceTime'], com['score'], com['nickname'],
                     com['userLevelName'], com['isMobile'],
                     com['userClientShow']]
                    for com in json_info['comments']]
            df = pd.DataFrame(rows, columns=cols)
            self.save_csv(df, p)
            dfs.append(df)
            if p < self.page - 1:
                time.sleep(random.randint(31, 52))
        # Guard against page=0: pd.concat raises on an empty list.
        if dfs:
            final_df = pd.concat(dfs, ignore_index=True)
            self.save_csv(final_df, self.page)


def jdComment():
    """Entry point: set the key variables and run the crawl."""
    page = 3                               # number of pages to fetch
    productId = 6474492                    # product id
    callback = 'fetchJSON_comment98vv782'  # JSONP callback name for this product
    JDC = JDCommentsCrawler(productId, callback, page)
    JDC.concatLinkParam()
    JDC.crawler()


if __name__ == '__main__':
    # Moved here from module level so importing this file has no side effects.
    print(sys.getdefaultencoding())
    jdComment()
開發環境搭建好,檔案路徑設定正確,直接複製程式碼就可以了。