python3爬蟲豆瓣top250圖書(並儲存到mysql資料庫)
阿新 • • 發佈:2018-12-10
參考上篇文章
附上程式碼:
import requests
from bs4 import BeautifulSoup
import mysql.connector


def get_pages_link():
    """Scrape the Douban Top 250 book list and store each entry in MySQL.

    Requests the ten listing pages (offsets 0, 25, ..., 225) of
    https://book.douban.com/top250, extracts the score, title, number of
    ratings and the optional one-line quote from every ``.item`` element,
    inserts one row per entry into the ``bookdoubantop250`` table of the
    ``test`` database, and prints each scraped record.

    The cursor and the connection are always closed, even when a request
    or an insert fails part-way through.

    Raises:
        requests.RequestException: if a page cannot be fetched in time.
        mysql.connector.Error: if connecting or inserting fails.
        IndexError, ValueError: if an entry lacks the expected rating
            markup or its score is not a number.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
        'Connection': 'keep-alive'
    }
    # Placeholders let the driver escape the scraped text; building the
    # statement with string formatting broke on titles/quotes containing
    # double quotes and allowed SQL injection.
    insert_sql = (
        'insert into bookdoubantop250(score,name,quote,people) '
        'values (%s,%s,%s,%s)'
    )

    # Open the database connection used for all inserts.
    conn = mysql.connector.connect(user='root', password='root', database='test')
    try:
        cursor = conn.cursor()
        try:
            for offset in range(0, 250, 25):
                url = "https://book.douban.com/top250?start={}".format(offset)
                # A timeout keeps a stalled connection from hanging the run.
                web_data = requests.get(url, headers=header, timeout=10)
                soup = BeautifulSoup(web_data.content, 'lxml')

                for book in soup.select('.item'):
                    # Title: the first 20 characters of the entry's text.
                    name = book.get_text().strip()[:20].strip()
                    # Score.
                    star = book.select('.rating_nums')[0].text
                    # Number of ratings, with the surrounding parentheses removed.
                    people = (
                        book.select('.star > span')[2].text
                        .replace("(\n", "")
                        .replace(")", "")
                        .strip()
                    )
                    # The quote is optional; entries without one have no '.inq' node.
                    try:
                        quote = book.select('.inq')[0].text
                    except IndexError:
                        print('沒有quote哦')
                        quote = None

                    data = {
                        '評分': star,
                        '評價人數': people,
                        '片名': name,
                        '名言': quote
                    }

                    # A missing quote is bound as SQL NULL rather than the
                    # literal text "None".
                    # NOTE(review): assumes the quote column is nullable - confirm.
                    cursor.execute(insert_sql, (float(star), name, quote, people))
                    conn.commit()

                    print(data)
                    print('\n' + '-' * 50 + '\n')
        finally:
            cursor.close()
    finally:
        # Close the database connection (the original referenced
        # conn.close without calling it).
        conn.close()


if __name__ == '__main__':
    get_pages_link()