python 爬蟲 爬取網易嚴選全網商品價格評論資料
阿新 • • 發佈:2018-12-01
1.獲取商品目錄
在Chrome瀏覽器開發者工具中,可以找到目錄的JS地址:
http://you.163.com/xhr/globalinfo//queryTop.json
得到商品資料
def get_categoryList(): url='http://you.163.com/xhr/globalinfo//queryTop.json' headers={ 'Accept':'application/json, text/javascript, */*; q=0.01', 'Accept-Encoding':'gzip, deflate', 'Accept-Language':'zh-CN,zh;q=0.9', 'Connection':'keep-alive', 'Host':'you.163.com', 'Referer':'http://you.163.com/?from=web_out_pz_baidu_1', 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36', 'X-Requested-With':'XMLHttpRequest', } req=requests.get(url=url,headers=headers,verify=False).json() df = pd.DataFrame(columns=('一級分類ID', '一級分類', '二級分類ID', '二級分類', '三級分類ID', '三級分類')) x = 0 cateList=req['data']['cateList'] ##一級分類 for i in cateList: id1 = i['id'] name1 = i['name'] subCateGroupList=i['subCateGroupList'] ##二級分類 for j in subCateGroupList: id2 = j['id'] name2 = j['name'] categoryList=j['categoryList'] ##三級分類 for k in categoryList: id3=k['id'] name3=k['name'] df.loc[x] = [id1,name1,id2,name2,id3,name3] x=x+1 #df.to_csv('list.csv') return df
2.獲取商品ID資料
在移動端的一級目錄網址http://m.you.163.com/item/list?categoryId=1005000 的88行程式碼裡可以找到整個目錄下所有商品的資料;
把88行的文字複製出來,去掉前面的var jsonData=,和後面的;符號,貼上到json視覺化工具裡做分析(本人用的是hijson),可以找到'商品ID', '商品名稱', '商品簡介', '商品單位', '上架時間', '更新時間', '櫃檯價', '零售價', '商品URL', '商品圖片'等資料
##獲取商品ID資料 def get_items_ID(): ##移動端網站 s = requests.session() df=get_categoryList() cateList=df['一級分類ID'].values.tolist() cateList = list(set(cateList)) df_item=pd.DataFrame(columns=('三級分類ID', '商品ID', '商品名稱', '商品簡介', '商品圖片', '商品單位', '上架時間', '更新時間', '櫃檯價', '零售價','商品URL')) x=0 for i in cateList: ##一級分類目錄商品 headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Connection': 'keep-alive', 'Host': 'm.you.163.com', 'Referer': 'http://m.you.163.com/item/list?categoryId='+str(i), 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Linux; U; Android 8.0.0; zh-CN; MHA-AL00 Build/HUAWEIMHA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 UCBrowser/12.1.4.994 Mobile Safari/537.36', } s.headers.update(headers) time.sleep(1.024) ##延時 url='http://m.you.163.com/item/list?categoryId='+str(i) req=s.get(url=url,verify=False).text js=re.search('jsonData=(.*)',req).group(1).strip(';') js=json.loads(js) categoryItemList=js['categoryItemList'] ##商品清單列表 for i in categoryItemList: category_id=i['category']['id'] ##商品三級分類ID itemList=i['itemList'] ###三級分類下商品 for j in itemList: id=j['id'] ##商品ID name=j['name'] ## 商品名稱 simpleDesc=j['simpleDesc'] ##商品簡介 primaryPicUrl=j['primaryPicUrl'] ##商品圖片 pieceUnitDesc=j['pieceUnitDesc'] ##商品單位 onSaleTime=timestamp_to_date(j['onSaleTime']) ##上架時間 updateTime=timestamp_to_date(j['updateTime']) ##更新時間 counterPrice=j['counterPrice'] ##櫃檯價 retailPrice=j['retailPrice'] ##零售價 itemUrl='http://you.163.com/item/detail?id='+str(id) ##商品URL df_item.loc[x] = [category_id, id, name, simpleDesc, primaryPicUrl, pieceUnitDesc, onSaleTime, updateTime, counterPrice, retailPrice,itemUrl] x=x+1 #print(category_id, id, name, simpleDesc, primaryPicUrl, pieceUnitDesc, onSaleTime, updateTime, counterPrice, retailPrice) df['三級分類ID'] = df['三級分類ID'].apply(str) ##設定列格式 #df_item.to_csv('df_item.csv', index=False, encoding="GB18030") df_item['三級分類ID'] = df_item['三級分類ID'].apply(str) #設定列格式 items=pd.merge(df_item,df,how='left',on=['三級分類ID']) ##調整列順序 items=items[['一級分類ID', '一級分類', '二級分類ID', '二級分類', '三級分類ID', '三級分類' , '商品ID', '商品名稱', '商品簡介', '商品單位', '上架時間', '更新時間', '櫃檯價', '零售價', '商品URL','商品圖片']] print(items) items.to_csv('items.csv',index=False, encoding="GB18030") return items
3.獲取單個商品ID評論數及評論觀點資料
在電腦端的商品頁面上,開啟評論,就可以在NETWORK上刷出新的JS頁面http://you.163.com/xhr/comment/tags.json?__timestamp=1543216560605&itemId=1615007,
def get_comment(self,id): ##電腦端網站 #time.sleep(1.22) now=str(int(time.time()*1000)) url='http://you.163.com/xhr/comment/tags.json?__timestamp={}&itemId={}'.format(now,id) UserAgentlist = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.104', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36 Maxthon/5.2.5.4000', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0', 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36', ] ran = random.randint(0, len(UserAgentlist)-1) UserAgen = UserAgentlist[ran] headers={ 'Accept':'application/json, text/javascript, */*; q=0.01', 'Accept-Encoding':'gzip, deflate', 'Accept-Language':'zh-CN,zh;q=0.9', 'Connection':'keep-alive', 'Host':'you.163.com', 'Referer':'http://you.163.com/item/detail?id={}&_stat_referer=index&_stat_area=mod_popularItem_item_1'.format(id), 'User-Agent':UserAgen, 'X-Requested-With':'XMLHttpRequest' } req = requests.get(url=url, headers=headers, verify=False).text js=json.loads(req) data=js['data'] comment='' goodCmtRate='0' commentcount='0' if data!=[]: commentcount=data[0]['strCount'] ##評論數 url='http://you.163.com/xhr/comment/itemGoodRates.json' postdata={'itemId': id} itemGoodRates=requests.post(url=url, headers=headers, data=postdata,verify=False).json() goodCmtRate=itemGoodRates['data']['goodCmtRate'] ##好評率 for i in data: comment=str(comment)+str(i['name'])+'('+str(i['strCount'])+ ')' ##評論觀點 commentdata=[commentcount,goodCmtRate,comment] print(commentdata) return commentdata
4.獲取單個商品ID的SKU資料
在移動端的商品主頁面上http://m.you.163.com/item/detail?id=1516008 88行程式碼裡可以找到SKU的資料
def get_items_data(id): ##移動端網站
url='http://m.you.163.com/item/detail?id='+str(id)
UserAgentlist = [
'Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36',
'Mozilla/5.0 (Linux; U; Android 8.0.0; zh-CN; MHA-AL00 Build/HUAWEIMHA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 UCBrowser/12.1.4.994 Mobile Safari/537.36',
'Mozilla/5.0 (Linux; Android 6.0.1; OPPO A57 Build/MMB29M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/63.0.3239.83 Mobile Safari/537.36 T7/10.13 baiduboxapp/10.13.0.10 (Baidu; P1 6.0.1)',
'Mozilla/5.0 (iPhone 6s; CPU iPhone OS 11_4_1 like Mac OS X) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0 MQQBrowser/8.3.0 Mobile/15B87 Safari/604.1 MttCustomUA/2 QBWebViewType/1 WKType/1',
'Mozilla/5.0 (Linux; U; Android 8.1.0; zh-CN; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 UCBrowser/11.9.4.974 UWS/2.13.1.48 Mobile Safari/537.36 AliApp(DingTalk/4.5.11) com.alibaba.android.rimet/10487439 Channel/227200 language/zh-CN',
'Mozilla/5.0 (Linux; U; Android 8.0.0; zh-CN; BAC-AL00 Build/HUAWEIBAC-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 UCBrowser/11.9.4.974 UWS/2.13.1.48 Mobile Safari/537.36 AliApp(DingTalk/4.5.11) com.alibaba.android.rimet/10487439 Channel/227200 language/zh-CN',
'Mozilla/5.0 (iPhone; CPU iPhone OS 10_2 like Mac OS X) AppleWebKit/602.3.12 (KHTML, like Gecko) Mobile/14C92 MicroMessenger/6.5.16 NetType/WIFI Language/zh_CN'
]
ran = random.randint(0, len(UserAgentlist)-1)
UserAgen = UserAgentlist[ran]
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Connection':'keep-alive',
'Host':'m.you.163.com',
'Upgrade-Insecure-Requests':'1',
'User-Agent':UserAgen,
}
req=requests.get(url=url,headers=headers,verify=False).text
js=re.search('var jsonData=(.*)',req).group(1).strip(',')
js=json.loads(js)
skuList=js['skuList'] ###顏色分類列表
df = pd.DataFrame(
columns=('商品ID', '顏色分類', 'SKU櫃檯價', 'SKU零售價', 'SKU圖片'))
x = 0
for i in skuList:
itemSkuSpecValueList=i['itemSkuSpecValueList']
try:
pic=i['pic']
except:
pic=''
try:
counterPrice = i['counterPrice']
except:
counterPrice=''
try:
retailPrice=i['retailPrice']
except:
retailPrice=''
skuvalue=''
for j in itemSkuSpecValueList:
try:
skuvalue=skuvalue+j['skuSpecValue']['value']+' '
except:
skuvalue=''
skuvalue=skuvalue.strip(' ')
df.loc[x] = [id,skuvalue,counterPrice,retailPrice,pic]
x=x+1
print(id,skuvalue,counterPrice,retailPrice,pic)
#print(df)
#df.to_csv('get_items_data.csv',index=False, encoding="GB18030")
return df
綜合上述,將所有資料結合在一起,程式碼如下:
#!coding=utf-8
import requests
import re
import random
import time
import json
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import pandas as pd
requests.packages.urllib3.disable_warnings(InsecureRequestWarning) ###禁止提醒SSL警告
###格式化時間戳
def timestamp_to_date(time_stamp, format_string="%Y-%m-%d %H:%M:%S"):
time_array = time.localtime(int(time_stamp)/1000)
str_date = time.strftime(format_string, time_array)
return str_date
class wyyx(object):
### 獲取分類
def get_categoryList(self):
url='http://you.163.com/xhr/globalinfo//queryTop.json'
headers={
'Accept':'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Connection':'keep-alive',
'Host':'you.163.com',
'Referer':'http://you.163.com/?from=web_out_pz_baidu_1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36',
'X-Requested-With':'XMLHttpRequest',
}
req=requests.get(url=url,headers=headers,verify=False).json()
df = pd.DataFrame(columns=('一級分類ID', '一級分類', '二級分類ID', '二級分類', '三級分類ID', '三級分類'))
x = 0
cateList=req['data']['cateList'] ##一級分類
for i in cateList:
id1 = i['id']
name1 = i['name']
subCateGroupList=i['subCateGroupList'] ##二級分類
for j in subCateGroupList:
id2 = j['id']
name2 = j['name']
categoryList=j['categoryList'] ##三級分類
for k in categoryList:
id3=k['id']
name3=k['name']
df.loc[x] = [id1,name1,id2,name2,id3,name3]
x=x+1
#df.to_csv('list.csv')
return df
##獲取商品ID資料
def get_items_ID(self): ##移動端網站
s = requests.session()
df=self.get_categoryList()
cateList=df['一級分類ID'].values.tolist()
cateList = list(set(cateList))
df_item=pd.DataFrame(columns=('三級分類ID', '商品ID', '商品名稱', '商品簡介', '商品圖片', '商品單位', '上架時間',
'更新時間', '櫃檯價', '零售價','商品URL','評論數','好評率','評論觀點'))
x=0
for i in cateList: ##一級分類目錄商品
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Host': 'm.you.163.com',
'Referer': 'http://m.you.163.com/item/list?categoryId='+str(i),
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Linux; U; Android 8.0.0; zh-CN; MHA-AL00 Build/HUAWEIMHA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 UCBrowser/12.1.4.994 Mobile Safari/537.36',
}
s.headers.update(headers)
time.sleep(1.024) ##延時
url='http://m.you.163.com/item/list?categoryId='+str(i)
req=s.get(url=url,verify=False).text
js=re.search('jsonData=(.*)',req).group(1).strip(';')
js=json.loads(js)
categoryItemList=js['categoryItemList'] ##商品清單列表
for i in categoryItemList:
category_id=i['category']['id'] ##商品三級分類ID
itemList=i['itemList'] ###三級分類下商品
for j in itemList:
id=j['id'] ##商品ID
name=j['name'] ## 商品名稱
simpleDesc=j['simpleDesc'] ##商品簡介
primaryPicUrl=j['primaryPicUrl'] ##商品圖片
pieceUnitDesc=j['pieceUnitDesc'] ##商品單位
onSaleTime=timestamp_to_date(j['onSaleTime']) ##上架時間
updateTime=timestamp_to_date(j['updateTime']) ##更新時間
counterPrice=j['counterPrice'] ##櫃檯價
retailPrice=j['retailPrice'] ##零售價
itemUrl='http://you.163.com/item/detail?id='+str(id) ##商品URL
commentdata=self.get_comment(str(id))
commentcount=commentdata[0] ##評論數
goodCmtRate=commentdata[1] ##好評率
comment=commentdata[2] ##評論觀點
df_item.loc[x] = [category_id, id, name, simpleDesc, primaryPicUrl, pieceUnitDesc, onSaleTime,
updateTime, counterPrice, retailPrice,itemUrl,commentcount,goodCmtRate,comment]
x=x+1
#print(category_id, id, name, simpleDesc, primaryPicUrl, pieceUnitDesc, onSaleTime, updateTime, counterPrice, retailPrice)
df['三級分類ID'] = df['三級分類ID'].apply(str) ##設定列格式
#df_item.to_csv('df_item.csv', index=False, encoding="GB18030")
df_item['三級分類ID'] = df_item['三級分類ID'].apply(str) #設定列格式
items=pd.merge(df_item,df,how='left',on=['三級分類ID'])
##調整列順序
items=items[['一級分類ID', '一級分類', '二級分類ID', '二級分類', '三級分類ID', '三級分類'
, '商品ID', '商品名稱', '商品簡介', '商品單位', '上架時間', '更新時間', '櫃檯價', '零售價',
'商品URL','商品圖片','評論數','好評率','評論觀點']]
print(items)
items.to_csv('items.csv',index=False, encoding="GB18030")
return items
###獲取所有資料(商品ID+SKU資料)
def all_data(self,path=''): ##path預設空值,直接呼叫self.get_items_ID()獲取資料,否則讀取path檔案的資料
if path=='':
iddata=self.get_items_ID()
else:
iddata=pd.read_csv(path, encoding="GB18030")
idlist=iddata['商品ID'].values.tolist()
skudata = pd.DataFrame( columns=('商品ID', '顏色分類', 'SKU櫃檯價', 'SKU零售價', 'SKU圖片'))
for id in idlist:
df=self.get_items_data(id)
skudata=skudata.append(df)
print(skudata)
skudata['商品ID'] = skudata['商品ID'].apply(str) ##設定列格式
iddata['商品ID'] = iddata['商品ID'].apply(str) # 設定列格式
alldata = pd.merge(skudata, iddata, how='left', on=['商品ID'])
##調整列順序
alldata = alldata[['一級分類ID', '一級分類', '二級分類ID', '二級分類', '三級分類ID', '三級分類'
, '商品ID', '商品名稱', '商品簡介', '商品單位', '上架時間', '更新時間', '櫃檯價', '零售價',
'商品URL', '商品圖片', '評論數', '好評率', '評論觀點', '顏色分類', 'SKU櫃檯價', 'SKU零售價', 'SKU圖片']]
print(alldata)
alldata.to_csv('alldata.csv', index=False, encoding="GB18030")
##獲取單個商品ID的SKU資料
def get_items_data(self,id): ##移動端網站
url='http://m.you.163.com/item/detail?id='+str(id)
UserAgentlist = [
'Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36',
'Mozilla/5.0 (Linux; U; Android 8.0.0; zh-CN; MHA-AL00 Build/HUAWEIMHA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 UCBrowser/12.1.4.994 Mobile Safari/537.36',
'Mozilla/5.0 (Linux; Android 6.0.1; OPPO A57 Build/MMB29M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/63.0.3239.83 Mobile Safari/537.36 T7/10.13 baiduboxapp/10.13.0.10 (Baidu; P1 6.0.1)',
'Mozilla/5.0 (iPhone 6s; CPU iPhone OS 11_4_1 like Mac OS X) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0 MQQBrowser/8.3.0 Mobile/15B87 Safari/604.1 MttCustomUA/2 QBWebViewType/1 WKType/1',
'Mozilla/5.0 (Linux; U; Android 8.1.0; zh-CN; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 UCBrowser/11.9.4.974 UWS/2.13.1.48 Mobile Safari/537.36 AliApp(DingTalk/4.5.11) com.alibaba.android.rimet/10487439 Channel/227200 language/zh-CN',
'Mozilla/5.0 (Linux; U; Android 8.0.0; zh-CN; BAC-AL00 Build/HUAWEIBAC-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 UCBrowser/11.9.4.974 UWS/2.13.1.48 Mobile Safari/537.36 AliApp(DingTalk/4.5.11) com.alibaba.android.rimet/10487439 Channel/227200 language/zh-CN',
'Mozilla/5.0 (iPhone; CPU iPhone OS 10_2 like Mac OS X) AppleWebKit/602.3.12 (KHTML, like Gecko) Mobile/14C92 MicroMessenger/6.5.16 NetType/WIFI Language/zh_CN'
]
ran = random.randint(0, len(UserAgentlist)-1)
UserAgen = UserAgentlist[ran]
headers={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Connection':'keep-alive',
'Host':'m.you.163.com',
'Upgrade-Insecure-Requests':'1',
'User-Agent':UserAgen,
}
req=requests.get(url=url,headers=headers,verify=False).text
js=re.search('var jsonData=(.*)',req).group(1).strip(',')
js=json.loads(js)
skuList=js['skuList'] ###顏色分類列表
df = pd.DataFrame(
columns=('商品ID', '顏色分類', 'SKU櫃檯價', 'SKU零售價', 'SKU圖片'))
x = 0
for i in skuList:
itemSkuSpecValueList=i['itemSkuSpecValueList']
try:
pic=i['pic']
except:
pic=''
try:
counterPrice = i['counterPrice']
except:
counterPrice=''
try:
retailPrice=i['retailPrice']
except:
retailPrice=''
skuvalue=''
for j in itemSkuSpecValueList:
try:
skuvalue=skuvalue+j['skuSpecValue']['value']+' '
except:
skuvalue=''
skuvalue=skuvalue.strip(' ')
df.loc[x] = [id,skuvalue,counterPrice,retailPrice,pic]
x=x+1
print(id,skuvalue,counterPrice,retailPrice,pic)
#print(df)
#df.to_csv('get_items_data.csv',index=False, encoding="GB18030")
return df
##獲取單個商品ID評論數及評論觀點資料
def get_comment(self,id): ##電腦端網站
#time.sleep(1.22)
now=str(int(time.time()*1000))
url='http://you.163.com/xhr/comment/tags.json?__timestamp={}&itemId={}'.format(now,id)
UserAgentlist = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.104',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36 Maxthon/5.2.5.4000',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
]
ran = random.randint(0, len(UserAgentlist)-1)
UserAgen = UserAgentlist[ran]
headers={
'Accept':'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Connection':'keep-alive',
'Host':'you.163.com',
'Referer':'http://you.163.com/item/detail?id={}&_stat_referer=index&_stat_area=mod_popularItem_item_1'.format(id),
'User-Agent':UserAgen,
'X-Requested-With':'XMLHttpRequest'
}
req = requests.get(url=url, headers=headers, verify=False).text
js=json.loads(req)
data=js['data']
comment=''
goodCmtRate='0'
commentcount='0'
if data!=[]:
commentcount=data[0]['strCount'] ##評論數
url='http://you.163.com/xhr/comment/itemGoodRates.json'
postdata={'itemId': id}
itemGoodRates=requests.post(url=url, headers=headers, data=postdata,verify=False).json()
goodCmtRate=itemGoodRates['data']['goodCmtRate'] ##好評率
for i in data:
comment=str(comment)+str(i['name'])+'('+str(i['strCount'])+ ')' ##評論觀點
commentdata=[commentcount,goodCmtRate,comment]
print(commentdata)
return commentdata
if __name__ == '__main__':
wy=wyyx()
wy.all_data() ##
得到資料的樣本如下:
(PS:由於電腦端和手機端的爬取資料難度及資料清洗不一樣,所以使用了兩種方式結合。)