Python3爬蟲:爬取大眾點評網北京所有酒店評分資訊
阿新 • 發佈:2019-02-14
學習Python3爬蟲實戰:爬取大眾點評網某地區所有酒店相關資訊,我爬取的北京地區的酒店,由於網站更新,原文中的一些方法已經不再適用,我的工作是在該文指導下重寫了一個爬蟲。
爬蟲無非分為這幾塊:分析目標、下載頁面、解析頁面、儲存內容,其中下載頁面不提。
- 解析頁面:使用正則表示式和BeautifulSoup兩種方式,一般情況都可以使用正則表示式,除非需要分辨特定使用者的評論。
- 儲存內容:酒店資訊(id和名稱)儲存在“hotel_dianping.txt”中,酒店的評分資訊儲存在“id_name+comments.txt”中
Talk is cheap, show me the code.
#coding=utf-8
import re
import requests
from bs4 import BeautifulSoup
# Entry page listing Beijing hotels on dianping.com.
aim_url = "http://www.dianping.com/beijing/hotel"
# Site root, prepended to the relative shop paths scraped from listings.
basic_url = "http://www.dianping.com"
# Output file: one "<shop path> <hotel name>" line per hotel.
hotel_file = 'hotel_dianping.txt'
def download_page(url, timeout=30):
    """Download *url* and return its body decoded as UTF-8 text.

    Browser-like headers (including a session Cookie) are sent because
    dianping.com refuses obviously automated requests.

    :param url: absolute URL to fetch
    :param timeout: seconds to wait for the server before giving up
        (new optional parameter with a default, so existing callers
        are unaffected)
    :returns: the response body as a str
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    :raises requests.Timeout: if the request exceeds *timeout* seconds
    """
    # Spoofed request headers; the Cookie keeps the site from blocking the crawl.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        'Cookie': '_lxsdk_cuid=15eea339434c8-0d2cff6b34e61c-c313760-100200-15eea339434c8; _lxsdk=15eea339434c8-0d2cff6b34e61c-c313760-100200-15eea339434c8; _hc.v=cec4c6d7-039d-1717-70c0-4234813c6e90.1507167802;s_ViewType=1; __mta=218584358.1507168277959.1507176075960.1507176126471.5; JSESSIONID=48C46DCEFE3A390F647F52FED889020D; aburl=1; cy=2; cye=beijing; _lxsdk_s=15eea9307ab-17c-f87-123%7C%7C48',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Host': 'www.dianping.com'
    }
    # Without a timeout, one stalled connection would hang the crawler forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of silently parsing an error page as hotel data.
    response.raise_for_status()
    # The site serves UTF-8; decode once at the I/O boundary.
    return response.content.decode('utf-8')
# Scrape and store hotel info (id, name).
def getHotelInfo(hotel_file):
    """Walk the hotel listing pages and append one hotel per line to *hotel_file*.

    Each output line has the form "/shop/<id> <name>".

    The site advertises 50 listing pages but in practice only the first
    ~13 contain data; the empty tail pages simply contribute nothing.

    :param hotel_file: path of the text file the entries are appended to
        via writeToFile()
    """
    # Hoisted out of the loop: the pattern is loop-invariant.
    # Matches e.g.: "action": "click","content":"/shop/8025450","title":"速8酒店"
    shop_re = re.compile(r'"action": "click","content":"(.*?)","title":"(.*?)"')
    for i in range(1, 51):
        # Page 1 lives at the bare listing URL; page n at .../hotel/p<n>.
        # (The original reassigned the module-level aim_url inside this
        # function, which made it an unbound local on the first iteration.)
        page_url = aim_url if i == 1 else "%s/p%d" % (aim_url, i)
        page = download_page(page_url)
        # Each match is (shop_path, hotel_name).
        lines = ["%s %s\n" % (path, name) for path, name in shop_re.findall(page)]
        writeToFile(hotel_file, "".join(lines))
        print("第%d頁OK....." % i)
# Append text content to a file.
def writeToFile(file_name, content):
    """Append *content* to *file_name* as UTF-8 text, creating the file if absent."""
    with open(file_name, 'a+', encoding='utf-8') as out:
        out.write(content)
# Extract every user's rating block from one review page.
def getScore(page):
    """Parse per-user rating blocks out of a review page.

    BeautifulSoup is required here: a flat regex cannot tell which user
    a given rating span belongs to.

    :param page: review-page HTML as a str
    :returns: one dict per user mapping index -> score character, where
        indices 0-4 are room / location / service / hygiene / facilities
        and 5 is a catch-all for any unrecognised category; unseen
        categories stay 0
    """
    # Maps the 2-character category prefix to its slot; anything else -> 5.
    category_index = {"房間": 0, "位置": 1, "服務": 2, "衛生": 3, "設施": 4}
    all_scores = []
    soup = BeautifulSoup(page, 'html.parser')
    # One <div class="comment-rst"> per user.
    for block in soup.find_all('div', attrs={'class': 'comment-rst'}):
        scores = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
        # One <span class="rst"> per rated category.
        for span in block.find_all('span', attrs={'class': 'rst'}):
            text = span.getText()
            # First two chars name the category, the third is the score digit.
            scores[category_index.get(text[:2], 5)] = text[2]
        all_scores.append(scores)
    return all_scores
# CJK characters render twice as wide as ASCII, so a plain %30s pad would
# misalign columns; compute the visual width by hand instead.
def setProperFormat(user_name):
    """Right-align *user_name* in a visual width of 30 columns.

    ASCII letters, digits and underscores count as width 1; every other
    character (assumed CJK) counts as width 2. Names wider than 30
    columns get no padding at all.

    :param user_name: the display name to pad
    :returns: the padded string
    """
    narrow = len(re.findall(r'(\d|[A-Z]|[a-z]|\_)', user_name))
    wide = len(user_name) - narrow
    display_width = narrow + wide * 2
    return " " * (30 - display_width) + user_name
# Fetch every review row for every hotel listed in hotel_file.
def getEveryComment(hotel_file):
    """For each hotel in *hotel_file*, scrape its review pages and append
    per-user rating rows to "<id>_<name>comments.txt".

    NOTE(review): the bare `break` statements below stop processing after
    the first review page and the first hotel — they look like debugging
    leftovers; confirm intent before removing them.
    """
    # Iterate the hotel list produced by getHotelInfo().
    with open(hotel_file, 'r', encoding='utf-8') as fp:
        num_hotel = 1
        # One line per hotel: "<shop path> <name>\n".
        for line in fp:
            # Split out the hotel's relative url, display name and numeric id.
            hotel_url = line.split(' ')[0]
            hotel_name = line.split(' ')[1][:-1]  # strip the trailing '\n'
            hotel_id = hotel_url.split('/')[2]    # "/shop/<id>" -> "<id>"
            # Per-hotel output file for the review rows.
            store_file = "%s_%scomments.txt" % (hotel_id, hotel_name)
            # Write the column header first.
            txt = "%12s%12s%30s%15s%15s%15s%15s%15s%15s\n" % ("hotel_id", "user_id", "user_name", "rate_room", "rate_position", "rate_service", "rate_health", "rate_facility", "rate_others")
            writeToFile(store_file, txt)
            # The full review listing lives under <hotel url>/review_more.
            business_url = basic_url + hotel_url + '/review_more'
            page = download_page(business_url)
            # Total review count, e.g.: 全部</a><em class="col-exp">(123)</em>
            total_comments = re.compile(r'全部</a><em class="col-exp">\((\d+)\)</em>', re.DOTALL).findall(page)
            print(total_comments)
            # 20 reviews per page -> number of pages to walk.
            pages = int(int(total_comments[0]) / 20) + 1
            # Visit each review page in turn.
            for n in range(1, pages+1):
                comment_url = business_url + '?pageno=%s' % n
                print(comment_url)
                page = download_page(comment_url)
                # e.g.: <a target="_blank" title="" href="/member/1158824000">HpointK</a>
                # Each match is (user id, user name).
                user_info = re.compile(r'<a target="_blank" title="" href="/member/(\d+)">(.*?)</a>', re.DOTALL).findall(page)
                score_list = getScore(page)
                txt = ""
                try:
                    # One fixed-width row per user on this page.
                    for i, info in enumerate(user_info):
                        txt += "%12s%12s" % (hotel_id, info[0])
                        txt += setProperFormat(info[1])
                        txt += "%15s%15s%15s%15s%15s%15s\n" % (score_list[i][0], score_list[i][1], score_list[i][2], score_list[i][3], score_list[i][4], score_list[i][5])
                except Exception as e:
                    # Presumably guards against score_list being shorter than
                    # user_info when a rating block fails to parse — log and
                    # abandon the rest of this page.
                    print(e)
                    print(len(user_info))
                    break
                # Persist whatever rows were built before any failure.
                writeToFile(store_file, txt)
                print("第%d頁已儲存,共%d頁" % (n, pages))
                break  # NOTE(review): stops after page 1 — debug leftover?
            # NOTE(review): '%s' with a comma prints the tuple, not the
            # interpolated string; likely meant `% num_hotel`.
            print("第%s家酒店的評論已儲存", num_hotel)
            num_hotel += 1
            break  # NOTE(review): stops after the first hotel — debug leftover?
# Guarding with __main__ keeps the crawl from firing if this module is
# ever imported; running the script directly behaves exactly as before.
if __name__ == "__main__":
    # Stage 1: build the hotel list; Stage 2: scrape each hotel's reviews.
    getHotelInfo(hotel_file)
    getEveryComment(hotel_file)