【Python】打響2019年第二炮-Python爬蟲入門(二)
阿新 • • 發佈:2019-01-03
打響2019第二炮-Python爬蟲入門
在2019年第一炮文章中獲取到了京東商城某一臺電腦的列表資訊,並儲存到CSV能夠更方便的檢視如下:
本章內容主要解決,如何多頁獲取手機&電腦資料,獲取評價以及好評率等資訊,實現效果如下:
如何獲取評論資訊?
在京東頁面搜尋手機或者電腦,隨後按 F12(或者 Ctrl+Shift+I)呼叫開發者工具。評論的英文為 comment,所以我們可以嘗試在開發者工具頁面搜尋 comment
開啟此頁面
通過搜尋 comment 的 Response 返回結果來看,可以看出是 {} 包裹的 JSON 格式,這時候就可以嘗試獲取 JSON 資料
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import json
import requests
from bs4 import BeautifulSoup
def download(url, headers, num_retries=3):
    """Fetch *url* with a GET request and return the raw body.

    Returns the response content (bytes) on HTTP 200, ``None`` on any
    other status code, and ``""`` when a request-level error occurs
    that cannot be recovered by retrying.

    Retries up to *num_retries* more times, but only for 5xx errors.
    """
    print("download", url)
    try:
        response = requests.get(url, headers=headers)
        print(response.status_code)
        if response.status_code == 200:
            return response.content
        return None
    # NOTE(fix): the original caught a bare `RequestException`, which is
    # a NameError in this snippet (it is never imported here); qualify it
    # through the already-imported `requests` package instead.
    except requests.exceptions.RequestException as e:
        print(e.response)
        html = ""
        # e.response is None for connection-level failures (no HTTP
        # status to inspect), so guard with hasattr.
        if hasattr(e.response, 'status_code'):
            code = e.response.status_code
            print('error code', code)
            # Retry only on server-side (5xx) failures.
            if num_retries > 0 and 500 <= code < 600:
                html = download(url, headers, num_retries - 1)
        return html
def get_json():
    """Fetch the JD comment-summary (JSONP) endpoint for a fixed list of
    product IDs and print the raw response body."""
    endpoint = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=8674557,100000769466,8443496,100000769432,100000117782,100000679465,100000863175,100000612187,8461498,8461490,8461496,7765111,100001045546,7999189,100000667974,100001045648,6072622,100000644947,100002470752,8484118,7690501,7621213,8596169,100000863245,100001045514,100001269968,100001692089,100000863247,100000400472,100001521818&callback=jQuery9848036&_=1546399791459"
    request_headers = {
        'User-agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
        "referer": "https://www.jd.com",
    }
    body = download(endpoint, headers=request_headers)
    print(body)


if __name__ == "__main__":
    get_json()
以下為輸出資料
獲取json資料時的url部分如下:
從獲取到的json來看,每一段都代表著某一臺電腦的評價及好評率等資訊,也就是每臺商品的ID 如下:
如果需要取每一臺商品的評論及好評率,目前得知可以在以下連結末尾加上某個商品的ID號,獲取json資料,進行評價及好評率分析從而獲得想要的內容
def find_Computer(url, headers):
    """Scrape a JD search-result page and write one CSV row per product.

    For every ``li.gl-item`` element this extracts the product ID, name
    and price, queries the per-product comment-summary endpoint for the
    review count and good-rate, and appends a row to ``Computer.csv``.
    """
    r = download(url, headers=headers)
    page = BeautifulSoup(r, "lxml")
    all_items = page.find_all('li', attrs={'class': 'gl-item'})
    # encoding is spelled out so the Chinese header row cannot raise
    # UnicodeEncodeError on platforms whose default encoding is not UTF-8.
    with open("Computer.csv", 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        fields = ('ID', '名稱', '價格', '評論數', '好評率')
        writer.writerow(fields)
        # NOTE(fix): loop variable was named `all`, shadowing the builtin;
        # renamed to `item`.
        for item in all_items:
            # The JD SKU is carried on the list element itself.
            Computer_id = item["data-sku"]
            print(f"電腦ID為:{Computer_id}")
            # Product name.
            Computer_name = item.find('div', attrs={'class': 'p-name p-name-type-2'}).find('em').text
            print(f"電腦的名稱為:{Computer_name}")
            # Product price.
            Computer_price = item.find('div', attrs={'class': 'p-price'}).find('i').text
            print(f"電腦的價格為:{Computer_price}元")
            # Per-product JSON with review statistics.
            Comment = f"https://club.jd.com/comment/productCommentSummaries.action?referenceIds={Computer_id}"
            comment_count, good_rate = get_json(Comment)
            print('評價人數:', comment_count)
            print('好評率:', good_rate)
            writer.writerow([
                Computer_id,
                Computer_name,
                str(Computer_price) + "元",
                comment_count,
                good_rate,
            ])
獲取每臺電腦商品的json資料
def get_json(url):
    """Return ``(comment-count string, good-rate)`` for the first product
    in the comment-summary response, or ``None`` when the list is empty."""
    payload = requests.get(url).json()
    # Only the first summary entry is of interest; an empty list falls
    # through to the implicit None return.
    for summary in payload['CommentsCount']:
        return summary["CommentCountStr"], summary["GoodRateShow"]
程式碼如下:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import csv
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
def download(url, headers, num_retries=3):
    """GET *url*: bytes on HTTP 200, ``None`` on other statuses, and
    ``""`` when the request itself fails and cannot be retried.

    5xx error responses are retried up to *num_retries* more times.
    """
    print("download", url)
    try:
        resp = requests.get(url, headers=headers)
    except RequestException as err:
        print(err.response)
        fallback = ""
        # Connection-level failures carry no response object, hence the
        # hasattr guard before reading the status code.
        if hasattr(err.response, 'status_code'):
            status = err.response.status_code
            print('error code', status)
            # Only server-side (5xx) errors are worth retrying.
            if num_retries > 0 and 500 <= status < 600:
                fallback = download(url, headers, num_retries - 1)
        return fallback
    print(resp.status_code)
    return resp.content if resp.status_code == 200 else None
def find_Computer(url, headers):
    """Scrape a JD search-result page and write one CSV row per product.

    For every ``li.gl-item`` element this extracts the product ID, name
    and price, queries the per-product comment-summary endpoint for the
    review count and good-rate, and appends a row to ``Computer.csv``.
    """
    r = download(url, headers=headers)
    page = BeautifulSoup(r, "lxml")
    all_items = page.find_all('li', attrs={'class': 'gl-item'})
    # encoding is spelled out so the Chinese header row cannot raise
    # UnicodeEncodeError on platforms whose default encoding is not UTF-8.
    with open("Computer.csv", 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        fields = ('ID', '名稱', '價格', '評論數', '好評率')
        writer.writerow(fields)
        # NOTE(fix): loop variable was named `all`, shadowing the builtin;
        # renamed to `item`.
        for item in all_items:
            # The JD SKU is carried on the list element itself.
            Computer_id = item["data-sku"]
            print(f"電腦ID為:{Computer_id}")
            # Product name.
            Computer_name = item.find('div', attrs={'class': 'p-name p-name-type-2'}).find('em').text
            print(f"電腦的名稱為:{Computer_name}")
            # Product price.
            Computer_price = item.find('div', attrs={'class': 'p-price'}).find('i').text
            print(f"電腦的價格為:{Computer_price}元")
            # Per-product JSON with review statistics.
            Comment = f"https://club.jd.com/comment/productCommentSummaries.action?referenceIds={Computer_id}"
            comment_count, good_rate = get_json(Comment)
            print('評價人數:', comment_count)
            print('好評率:', good_rate)
            writer.writerow([
                Computer_id,
                Computer_name,
                str(Computer_price) + "元",
                comment_count,
                good_rate,
            ])
def get_json(url):
    """Return ``(comment-count string, good-rate)`` for the first product
    in the comment-summary response, or ``None`` when the list is empty."""
    payload = requests.get(url).json()
    # Only the first summary entry is of interest; an empty list falls
    # through to the implicit None return.
    for summary in payload['CommentsCount']:
        return summary["CommentCountStr"], summary["GoodRateShow"]
def main():
    """Entry point: scrape the JD search page for "電腦" (computers)."""
    request_headers = {
        'User-agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
        "referer": "https://passport.jd.com",
    }
    # Search URL; the keyword parameter is URL-encoded "電腦".
    search_url = "https://search.jd.com/Search?keyword=%E7%94%B5%E8%84%91&enc=utf-8&wq=%E7%94%B5%E8%84%91&pvid=1ff18312e8ef48febe71a66631674848"
    find_Computer(search_url, headers=request_headers)


if __name__ == '__main__':
    main()
執行如下:
在好評率方面應該都是%格式,如下:
若想獲取其他商品可以直接修改主頁面URL地址即可例如:(服裝:女)如下:
但是如何分頁獲取更多的資料內容呢?你沒有聽錯,關注 2019 年第三炮即可!
希望對您有所幫助,再見~~~