Python爬取豆瓣TOP250圖書排行榜
阿新 • • 發佈:2018-12-15
# -*- coding: utf-8 -*-
"""Scrape the Douban "Top 250 books" chart into a plain-text file.

The script was adapted from a movie-chart scraper, which is why some
identifiers still say "moive"; the public names are kept unchanged.
"""
import bs4
import requests


def open_url(url):
    """Fetch *url* with a desktop-browser User-Agent and return the response.

    A custom User-Agent is sent instead of the default one from requests;
    the request is abandoned after a 10 second timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
    }
    return requests.get(url, headers=headers, timeout=10)


def find_moive(r):
    """Extract one formatted line per book from a chart page response.

    Args:
        r: Response object whose ``text`` attribute holds the page HTML.

    Returns:
        A list of strings, each "<title> 評分: <score> <details>\\n".
    """
    soup = bs4.BeautifulSoup(r.text, 'html.parser')

    # Book titles: the link text inside each <div class="pl2">. The text
    # starts with a newline, so the title is the second '\n'-separated piece.
    titles = [
        node.a.text.split('\n')[1].strip()
        for node in soup.find_all('div', class_="pl2")
    ]

    # Ratings.
    ranks = [
        '評分: %s' % node.text
        for node in soup.find_all('span', class_='rating_nums')
    ]

    # Publication details (the text of each <p class="pl">).
    messages = [node.text for node in soup.find_all('p', class_="pl")]

    # The three lists are collected independently; zip() stops at the
    # shortest one, so a page with a missing field yields fewer lines
    # instead of aborting the whole run with an IndexError.
    return [
        title + ' ' + rank + ' ' + message + '\n'
        for title, rank, message in zip(titles, ranks, messages)
    ]


def main(pages=10, page_size=25):
    """Download every chart page and write the result to booktop250.txt.

    Args:
        pages: Number of chart pages to fetch (default 10).
        page_size: Offset step between consecutive pages (default 25).

    NOTE: the page count is fixed by the caller rather than detected from
    the site's pagination links.
    """
    host = 'https://book.douban.com/top250'

    result = []
    for i in range(pages):
        # The offset must be computed; the earlier version sent the literal
        # text "25 * i" as the start parameter, so pagination never advanced.
        url = host + '?start=' + str(page_size * i)
        result.extend(find_moive(open_url(url)))

    with open('booktop250.txt', 'w', encoding='utf-8') as f:
        f.writelines(result)


if __name__ == "__main__":
    main()
照葫蘆畫瓢,根據爬取電影排行榜的程式碼改的,裡面的命名也沒有改,定位元素花了一些時間