
Python Crawler: Scraping JD.com Product Comment Data into a CSV File

In my spare time I wrote a crawler that scrapes product comment data from JD.com. I previously wrote a crawler for Lagou data; see Reference 1 and Reference 2.

My development environment is Windows + Anaconda3 (Python 3.6); my home machine does not run Linux, but the code works on Linux as well.

JD.com serves its comment data through a web API, so the first step is to locate that endpoint.

Open the JD.com site in Firefox, go to any product page, and click the reviews tab. Then click the 'hamburger' icon at the top right of the browser, choose 'Web Developer', open the 'Web Console', and switch to the 'Network' tab. Click 'Next Page' in the review list and look for the request named 'productPageComments.action'.
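The screenshot from the original post is not reproduced here, but judging from the parameters the program below assembles, the request URL looks roughly like this (with the productId and callback used later in this post):

https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv782&productId=6474492&score=0&sortType=5&pageSize=10&isShadowSku=0&fold=1&page=0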

In the panel on the right, check 'Headers', 'Params', and 'Response': these show the browser identity to spoof, the parameters being passed, and the data returned, all of which the program below makes use of.
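Note that this endpoint returns JSONP rather than plain JSON: the payload arrives wrapped in a call to the callback function. A minimal sketch of the unwrapping, using a made-up stand-in for the real response body:

import json

callback = 'fetchJSON_comment98vv782'
raw = callback + '({"comments": [], "productCommentSummary": {}});' # made-up stand-in

payload = raw[len(callback)+1:-2] # drop the 'callback(' prefix and the ');' suffix
data = json.loads(payload)
print(list(data.keys())) # ['comments', 'productCommentSummary']

The crawler below does exactly this slicing in showList and showListPage.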

The callback function (highlighted in a red box in the original screenshot) is used when setting the program's variables, so take note of it. The complete program is as follows:

 # -*- coding: utf-8 -*- 
import pandas as pd
import urllib.request as req
import json
import sys
import time
import random

print(sys.getdefaultencoding())

class JDCommentsCrawler:
    
    def __init__(self,productId=None,callback=None,page=1,score=0,sortType=5,pageSize=10):
        self.productId = productId # product ID
        self.score = score # comment type (good: 3, neutral: 2, bad: 1, all: 0)
        self.sortType = sortType # sort order (recommended: 5, by time: 6)
        self.pageSize = pageSize # records per page (default 10)
        self.callback = callback # JSONP callback name; differs for each product
        self.page = page # number of pages to fetch
        self.locationLink = 'https://sclub.jd.com/comment/productPageComments.action'
        self.paramValue = {
            'callback':self.callback,
            'productId':self.productId,
            'score':self.score,
            'sortType':self.sortType,
            'pageSize':self.pageSize,
        }        
        self.locationUrl = None
    def paramDict2Str(self,params):
        # serialize the parameter dict into a 'key=value&' query string
        str1 = ''
        for p,v in params.items():
            str1 = str1+ p+'='+str(v)+'&'
        return str1
    def concatLinkParam(self):
        # build the full request URL; keeping 'page=0' at the very end lets
        # requestMethodPage swap in the page number by replacing the last character
        self.locationUrl = self.locationLink+'?'+self.paramDict2Str(self.paramValue)+'isShadowSku=0&fold=1&page=0'
        #print(self.locationUrl)
        
    def requestMethod(self):
        # spoof a browser request for the first page (page=0)
        headers = {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',            
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Referer':'https://item.jd.com/%d.html'%(self.productId),
            'Host':'sclub.jd.com'          
        }
        reqs = req.Request(self.locationUrl,headers=headers)
        print('reqs : ',reqs)
        return reqs       
    def showList(self):
        # fetch the first page and strip the JSONP wrapper: callback(...);
        request_m = self.requestMethod()
        conn = req.urlopen(request_m)
        return_str = conn.read().decode('gbk')
        return_str = return_str[len(self.callback)+1:-2] # drop 'callback(' and ');'
        return json.loads(return_str)
    def requestMethodPage(self,p):
        # spoof a browser and request page p
        headers = {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',            
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Referer':'https://item.jd.com/%d.html'%(self.productId),
            'Host':'sclub.jd.com'          
        }
        url = self.locationUrl[:-1]+str(p) # swap the trailing '0' of 'page=0' for the page number
        print('url : ',url)
        reqs = req.Request(url,headers=headers)
        return reqs
    def showListPage(self,p):
        # fetch page p and strip the JSONP wrapper
        request_m = self.requestMethodPage(p)
        conn = req.urlopen(request_m)
        return_str = conn.read().decode('gbk')
        return_str = return_str[len(self.callback)+1:-2] # drop 'callback(' and ');'
        return json.loads(return_str)
    def save_csv(self,df,p):
        # write the DataFrame to a CSV file (the path is a placeholder; adjust it to your machine)
        df.to_csv(path_or_buf = 'd:\\xxx\\jd\\jd_%d.csv'%p,encoding='gbk')
 
    def crawler(self):
        # scrape each page into its own CSV, sleeping between requests to avoid being blocked
        dfs = []
        for p in range(self.page):
            json_info = self.showListPage(p)
            tmp_list = []
            #print(json_info)
            productCommentSummary = json_info['productCommentSummary']
            productId = productCommentSummary['productId']
            comments = json_info['comments']
            for com in comments:
                tmp_list.append([com['id'],productId,com['guid'],com['content'],com['creationTime'],com['referenceId'],com['referenceTime'],com['score'],\
                                 com['nickname'],com['userLevelName'],com['isMobile'],com['userClientShow']])
            df = pd.DataFrame(tmp_list,columns=['comment_id','product_id','guid','content','create_time','reference_id','reference_time','score',\
                                                'nickname','user_level','is_mobile','user_client'])
            self.save_csv(df,p)
            dfs.append(df)
            time.sleep(random.randint(31,52))
        final_df = pd.concat(dfs,ignore_index=True)
        self.save_csv(final_df,self.page) # merge all pages into a single file
 
def jdComment():
    # set the key variables
    page = 3 # number of pages to fetch
    productId = 6474492 # product ID
    callback = 'fetchJSON_comment98vv782' # JSONP callback name (differs per product)
    JDC = JDCommentsCrawler(productId,callback,page)
    JDC.concatLinkParam()
    JDC.crawler()

if __name__ == '__main__':
    jdComment()

Once the development environment is set up and the file path is adjusted, you can copy the code and run it directly.
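As a quick sanity check, you can load one of the generated files back with pandas (the path below is the same placeholder used in save_csv; replace it with your own):

import pandas as pd

# jd_0.csv is the first page written by save_csv; jd_3.csv is the merged file
df = pd.read_csv('d:\\xxx\\jd\\jd_0.csv', encoding='gbk', index_col=0)
print(df[['comment_id', 'score', 'content']].head())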