爬取豆瓣網評論最多的書籍
阿新 • • 發佈:2018-04-19
ups info 程序 不容易 ima nta 單元 bs4 很多
相信很多人都有書荒的時候,想要找到一本合適的書籍確實不容易,所以這次利用剛學習到的知識爬取豆瓣網的各類書籍,傳送門https://book.douban.com/tag/?view=cloud。
首先是這個程序的結構,html_downloader是html下載器,html_outputer是導出到Excel表,html_parser是解析頁面,make_wordcloud是制作詞雲,spided_main是程序入口,url_manager是URL管理器
主要實現思路是先請求下載需要的html,解析得到目標URL並存儲到URL管理器中,再從URL管理器中獲取得到URL,發送請求,解析得到需要的信息內容,導出到Excel表格,再從Excel表中獲取數據進行分析得到詞雲。
html_downloader:
# -*- coding:utf8 -*-
import string
import urllib.request
from urllib.parse import quote


class HtmlDownloader(object):
    """Download the raw body of a page over HTTP."""

    def download(self, url):
        """Return the response body of *url* as bytes.

        Returns None when *url* is None or when the server answers with a
        status code other than 200. Errors raised by ``urlopen`` itself are
        not caught here and propagate to the caller.
        """
        if url is None:
            return None

        # The URL may contain Chinese characters; percent-encode them while
        # leaving every printable ASCII character untouched.
        encoded_url = quote(url, safe=string.printable)

        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(encoded_url) as response:
            if response.getcode() != 200:
                return None
            return response.read()
通過分析豆瓣網的結構,可以看到,我們首先傳進去的是總的圖書分類,但是我們需要的是每一個分類裏面的圖書信息。所以我們需要得到每一個分類的url,再通過這個url去獲取圖書url,所以就有base_url和detail_url。
url_manager:
# -*- coding:utf8 -*-
class UrlManage(object):
    """Track pending and already-handed-out URLs for the crawler.

    Two independent queues are kept: "base" URLs (one per book category)
    and "detail" URLs (the paginated listing pages of a category). A URL
    that has been handed out once is remembered and never queued again.
    """

    def __init__(self):
        self.base_urls = set()        # category URLs waiting to be crawled
        self.detail_urls = set()      # listing-page URLs waiting to be crawled
        self.old_base_urls = set()    # category URLs already handed out
        self.old_detail_urls = set()  # listing-page URLs already handed out

    # --- add a single URL -------------------------------------------------
    def add_base_url(self, url):
        """Queue *url* as a base URL unless it is None or already known."""
        if url is None:
            return
        if url not in self.base_urls and url not in self.old_base_urls:
            self.base_urls.add(url)

    def add_detail_url(self, url):
        """Queue *url* as a detail URL unless it is None or already known."""
        if url is None:
            return
        if url not in self.detail_urls and url not in self.old_detail_urls:
            self.detail_urls.add(url)

    # --- add several URLs -------------------------------------------------
    def add_new_detail_urls(self, urls):
        """Queue every URL in *urls* as a detail URL; None/empty is a no-op."""
        if not urls:
            return
        for url in urls:
            self.add_detail_url(url)

    def add_new_base_urls(self, urls):
        """Queue every URL in *urls* as a base URL; None/empty is a no-op."""
        if not urls:
            return
        for url in urls:
            self.add_base_url(url)

    # --- check whether URLs remain ----------------------------------------
    def has_new_detail_url(self):
        """Return True while at least one detail URL is still pending."""
        return bool(self.detail_urls)

    def has_new_base_url(self):
        """Return True while at least one base URL is still pending."""
        return bool(self.base_urls)

    # --- take one pending URL ---------------------------------------------
    def get_base_url(self):
        """Remove and return an arbitrary pending base URL.

        The URL is recorded as handed out. Raises KeyError when no base URL
        is pending, so check ``has_new_base_url()`` first.
        """
        new_base_url = self.base_urls.pop()
        self.old_base_urls.add(new_base_url)
        return new_base_url

    def get_detail_url(self):
        """Remove and return an arbitrary pending detail URL.

        The URL is recorded as handed out. Raises KeyError when no detail
        URL is pending, so check ``has_new_detail_url()`` first.
        """
        new_detail_url = self.detail_urls.pop()
        self.old_detail_urls.add(new_detail_url)
        return new_detail_url
解析器 html_parser:
# -*- coding:utf8 -*-
import re
from urllib.parse import urlparse

from bs4 import BeautifulSoup


class HtmlParser(object):
    """Stateless helpers that parse Douban book pages.

    Every method is a static method, so it can be called on the class
    itself (``HtmlParser.parser(cont)``) or on an instance.
    """

    @staticmethod
    def soup(cont):
        """Parse the downloaded page *cont* and return the BeautifulSoup tree."""
        return BeautifulSoup(cont, 'html.parser', from_encoding='utf-8')

    @staticmethod
    def get_new_data(soup):
        """Extract ``{book title: count}`` from one listing page.

        For each ``.subject-item`` inside the first ``.subject-list``
        element, the title is the ``title`` attribute of the first link in
        ``.info``; the count is the first run of digits found in the
        ``.pl`` text of the first ``.clearfix`` element, kept as a string
        (presumably the number of ratings -- verify against the page).

        Items missing any of these pieces are skipped instead of aborting
        the whole page. Returns an empty dict when the page has no
        ``.subject-list`` element.
        """
        books = {}
        containers = soup.select('.subject-list')
        if not containers:
            return books

        for item in containers[0].select('.subject-item'):
            info = item.select_one('.info')
            link = info.select_one('a') if info is not None else None
            row = item.select_one('.clearfix')
            count_node = row.select_one('.pl') if row is not None else None
            if link is None or count_node is None:
                continue

            title = link.attrs.get('title')  # book title
            digits = re.findall(r'\d+', count_node.text)
            if title is None or not digits:
                continue
            books[title] = digits[0]
        return books

    @staticmethod
    def get_detail_url(base_url, max_start=500, page_size=20):
        """Return the set of paginated listing URLs for one category.

        The first page is *base_url* itself; further pages append
        ``?start=K&type=T`` for K = page_size, 2 * page_size, ... up to and
        including *max_start*. The defaults yield offsets 0 through 500 in
        steps of 20.
        """
        detail_urls = {base_url}
        for start in range(page_size, max_start + 1, page_size):
            detail_urls.add(base_url + '?start={}&type=T'.format(start))
        return detail_urls

    @staticmethod
    def get_all_base_urls(soup):
        """Return the absolute category URLs linked from the first ``.tagCol``."""
        links = soup.select('.tagCol')[0].select('a')
        return {
            'https://book.douban.com{}'.format(link.attrs['href'])
            for link in links
        }

    @staticmethod
    def parser(cont):
        """Parse the tag overview page *cont* and return its category URLs."""
        soup = BeautifulSoup(cont, 'html.parser', from_encoding='utf-8')
        return HtmlParser.get_all_base_urls(soup)
spided_main:
# -*- coding:utf8 -*-
from douban_spider2 import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    """Crawl Douban book categories and export the collected data to Excel."""

    def __init__(self):
        self.urls = url_manager.UrlManage()
        self.downloader = html_downloader.HtmlDownloader()
        # The parser class itself is stored; its methods are called on the class.
        self.htmlparser = html_parser.HtmlParser
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url, max_pages=1000):
        """Crawl starting from *root_url* and write the result to Excel.

        Steps: download the root page, collect the category URLs from it,
        expand each category into its paginated listing URLs, then download
        and parse listing pages until none remain or *max_pages* pages have
        been parsed successfully. Pass ``max_pages=None`` for no limit.

        A listing page that fails to download or parse is reported and
        skipped; it does not count towards *max_pages*.
        """
        collected = {}

        root_html = self.downloader.download(root_url)
        self.urls.add_new_base_urls(self.htmlparser.parser(root_html))

        # Expand every category URL into its listing-page URLs.
        while self.urls.has_new_base_url():
            try:
                base_url = self.urls.get_base_url()
                self.urls.add_new_detail_urls(
                    self.htmlparser.get_detail_url(base_url))
            except Exception as exc:
                print('craw failed: %s' % exc)

        parsed_pages = 0
        while self.urls.has_new_detail_url():
            detail_url = self.urls.get_detail_url()
            print('crow %d : %s' % (parsed_pages + 1, detail_url))
            try:
                html_cont = self.downloader.download(detail_url)
                page_soup = self.htmlparser.soup(html_cont)
                page_data = self.htmlparser.get_new_data(page_soup)
            except Exception as exc:
                # Catch Exception, not everything, so Ctrl-C still stops the crawl.
                print('craw failed: %s' % exc)
                continue

            collected.update(page_data)
            parsed_pages += 1
            if max_pages is not None and parsed_pages >= max_pages:
                break

        self.outputer.output_excel(collected)
# Program entry point: crawl starting from the Douban tag-cloud page.
if __name__ == "__main__":
    url = 'https://book.douban.com/tag/?view=cloud'
    obj_spider = SpiderMain()
    obj_spider.craw(url)
html_outputer:
# -*- coding:utf8 -*-
import xlwt  # library for writing Excel .xls files


class HtmlOutputer(object):
    """Export the crawled data to an Excel workbook."""

    def __init__(self):
        self.datas = []

    def output_excel(self, dict, path='wordCount.xls'):
        """Write the mapping *dict* to the sheet "wordCount" and save it.

        Each key/value pair becomes one row: the key in column 0 and the
        value in column 1. The workbook is saved to *path*, which defaults
        to ``wordCount.xls``.

        The parameter keeps its original name ``dict`` so that existing
        keyword callers continue to work.
        """
        wbk = xlwt.Workbook(encoding='utf-8')
        sheet = wbk.add_sheet("wordCount")  # name of the Excel sheet
        for row, (key, value) in enumerate(dict.items()):
            sheet.write(row, 0, label=key)
            sheet.write(row, 1, label=value)
        wbk.save(path)
導出的Excel表格格式為,一共導出15261條記錄
make_wordcloud:
# -*- coding:utf8 -*-
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import xlrd
from PIL import Image,ImageSequence
import numpy as np

# Read the exported sheet: column 0 holds the word, column 1 its count.
workbook = xlrd.open_workbook('wordCount.xls')
sheet = workbook.sheet_by_name('wordCount')

# Build the {word: frequency} mapping. A word appearing in several rows
# keeps the count of its last row.
frequencies = {
    row[0]: int(row[1])
    for row in (sheet.row_values(i) for i in range(sheet.nrows))
}
print(frequencies)

# The background picture is passed to WordCloud as the mask.
image = Image.open('./08.png')
graph = np.array(image)

wc = WordCloud(
    font_path='./fonts/simhei.ttf',
    background_color='white',
    max_words=20000,
    max_font_size=50,
    min_font_size=1,
    mask=graph,
    random_state=100,
)
wc.generate_from_frequencies(frequencies)

# Display the generated picture.
plt.figure()
plt.imshow(wc)
plt.axis("off")
plt.show()
背景圖片我選用的是
最後做出由15261本書形成的詞雲
爬取豆瓣網評論最多的書籍