Python crawler: scrape short reviews of the Douban movie 《芳華》 (Youth), segment the text, and generate a word cloud.
Posted by 阿新 on 2018-12-16
Project GitHub repo: https://github.com/kocor01/spider_cloub/
Python version: 3.6
I recently got the itch to play with word clouds, so I wrote a simple crawler and set up a minimal crawler framework.
The crawler scrapes short reviews of the recently popular movie 《芳華》 (Youth), segments them, and generates a word cloud.
Word segmentation is done with jieba, and the cloud is rendered with wordcloud.
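Before the framework itself, here is a minimal sketch of how the two libraries fit together (assuming both are pip-installed; the text and file names are placeholders, not part of the project):

# minimal sketch: jieba segments the Chinese text, wordcloud renders it
import jieba
from wordcloud import WordCloud

text = "一代人的芳華已逝"                  # placeholder review text
seg = " ".join(jieba.cut(text))           # wordcloud expects space-separated tokens
wc = WordCloud(font_path="simhei.ttf")    # a CJK font is required to render Chinese
wc.generate(seg)
wc.to_file("demo.png")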
For the mask, I used a friend's goofy selfie, lightly photoshopped (blacked out to a full silhouette to keep them anonymous), as the base template for the word cloud.
Word-cloud mask template:
The generated word cloud:
Basic crawler framework:
spider_main.py: crawler entry point
url_manager.py: URL manager
html_downloader.py: page downloader
html_parser.py: data extractor
html_outputer.py: data processor
word_cloud.py: word-cloud generator
Files in the extra_dict folder:
li.png: word-cloud mask template
simhei.ttf: font used to render the cloud
str.txt: the scraped movie reviews
stop_words.txt: stop words excluded during segmentation
cut_str.txt: jieba segmentation output
yun.png: the final generated word cloud
The code is as follows:
spider_main.py: crawler entry point
#coding:utf-8
import url_manager, html_parser, html_outputer, html_downloader, word_cloud

class SpiderMain(object):
    def __init__(self):
        # URL manager
        self.urls = url_manager.UrlManager()
        # page downloader
        self.downloader = html_downloader.HtmlDownloader()
        # data extractor
        self.parser = html_parser.HtmlParser()
        # data processor
        self.outputer = html_outputer.HtmlOutputer()
        # word-cloud generator
        self.cloud = word_cloud.Wordcloud()

    def craw(self, root_url):
        count = 1
        # seed the queue with the entry URL
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                # take one pending URL
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                # download the page
                html_cont = self.downloader.download(new_url)
                # extract the next-page URL and the review texts
                new_url, new_datas = self.parser.parser(new_url, html_cont)
                # queue the next page
                self.urls.add_new_url(new_url)
                # collect the extracted reviews
                self.outputer.collect_data(new_datas)
                # stop after 10 pages
                if count == 10:
                    break
                count = count + 1
            except Exception:
                print("craw failed")
        # write the collected reviews to str.txt
        self.outputer.process_data()
        # word segmentation
        self.outputer.cut_str()
        # generate the word cloud
        self.cloud.make()
        print("finish")

if __name__ == "__main__":
    # crawler entry URL
    root_url = "https://movie.douban.com/subject/26862829/comments?status=P"
    obj_spider = SpiderMain()
    # start the crawler
    obj_spider.craw(root_url)
url_manager.py: URL manager
#coding:utf-8
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
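A quick sanity check of the deduplication behavior (a hypothetical session, not part of the project):

# UrlManager drops duplicates and never re-queues visited URLs
manager = UrlManager()
manager.add_new_url("https://movie.douban.com/subject/26862829/comments?start=20")
manager.add_new_url("https://movie.douban.com/subject/26862829/comments?start=20")  # duplicate, ignored
print(manager.has_new_url())  # True (exactly one URL queued)
url = manager.get_new_url()   # pops the URL and records it in old_urls
manager.add_new_url(url)      # already visited, ignored
print(manager.has_new_url())  # False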
html_downloader.py: page downloader
#coding:utf-8
import urllib.request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        request = urllib.request.Request(url)
        request.add_header("user-agent", "Mozilla/5.0")
        # open the Request object, not the bare URL, so the user-agent header is actually sent
        response = urllib.request.urlopen(request)
        if response.getcode() != 200:
            return None
        return response.read()
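To try the downloader on its own (keep in mind Douban may throttle or redirect anonymous clients, so a None result or an error is possible):

# standalone test of HtmlDownloader with the seed URL from spider_main.py
downloader = HtmlDownloader()
html = downloader.download("https://movie.douban.com/subject/26862829/comments?status=P")
if html is not None:
    print(len(html))  # size of the raw HTML response in bytes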
html_parser.py: data extractor
#coding:utf-8
import urllib.parse
from bs4 import BeautifulSoup

class HtmlParser(object):
    def parser(self, page_url, content):
        if page_url is None or content is None:
            return
        soup = BeautifulSoup(content, "html.parser", from_encoding='utf-8')
        new_url = self._get_new_url(page_url, soup)
        new_datas = self._get_new_datas(page_url, soup)
        return new_url, new_datas

    def _get_new_url(self, page_url, soup):
        # the "next page" link in the pagination bar
        new_url = soup.find('div', id="paginator").find('a', class_="next").get('href')
        new_full_url = urllib.parse.urljoin(page_url, new_url)
        return new_full_url

    def _get_new_datas(self, page_url, soup):
        res_datas = set()
        # one comment-item div per review; the text sits in div.comment > p
        contents = soup.find_all('div', class_="comment-item")
        for content in contents:
            res_datas.add(content.find('div', class_="comment").find('p').get_text())
        return res_datas
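Both selectors assume the markup Douban used at the time of writing: a div#paginator holding the "next page" link, and one div.comment-item per review. A self-contained check against a minimal mock of that structure (the mock HTML is hypothetical):

# hypothetical mini-page mimicking the comment markup the parser expects
mock_html = b'''
<div class="comment-item">
  <div class="comment"><p>A short review.</p></div>
</div>
<div id="paginator"><a class="next" href="?start=20&status=P">next</a></div>
'''
parser = HtmlParser()
next_url, datas = parser.parser(
    "https://movie.douban.com/subject/26862829/comments?status=P", mock_html)
print(next_url)  # https://movie.douban.com/subject/26862829/comments?start=20&status=P
print(datas)     # {'A short review.'}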
html_outputer.py: data processor
#coding:utf-8
import jieba.analyse

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        for d in data:
            self.datas.append(d)

    def process_data(self):
        # write all collected reviews into str.txt
        file_object = open('./extra_dict/str.txt', 'w', encoding='utf-8', errors='ignore')
        for data in self.datas:
            file_object.write(data)
        file_object.close()

    def cut_str(self):
        content = open('./extra_dict/str.txt', encoding='utf-8', errors='ignore').read()
        jieba.analyse.set_stop_words("./extra_dict/stop_words.txt")
        tags = jieba.analyse.extract_tags(content, topK=1000, withWeight=True)
        file_object = open('./extra_dict/cut_str.txt', 'w', encoding='utf-8')
        for v, n in tags:
            # the weight is a decimal; multiply by 10000 to round it to an integer
            data_str = v + '\t' + str(int(n * 10000)) + '\n'
            file_object.write(data_str)
        file_object.close()
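With withWeight=True, jieba.analyse.extract_tags returns (keyword, TF-IDF weight) pairs, which is why the loop above unpacks two values per tag. A quick illustration of the same scaling on a throwaway string (output depends on jieba's dictionary, so no values are shown):

# extract_tags returns (word, weight) tuples when withWeight=True
import jieba.analyse
tags = jieba.analyse.extract_tags("一代人的芳華已逝,面目全非", topK=5, withWeight=True)
for word, weight in tags:
    print(word, int(weight * 10000))  # same x10000 rounding as cut_str.txt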
word_cloud.py: word-cloud generator
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
class Wordcloud(object):
    def make(self):
        d = path.dirname(__file__)
        # read the segmented text
        text = open(path.join(d, './extra_dict/cut_str.txt'), encoding='utf-8').read()
        # read the mask / color image
        alice_coloring = np.array(Image.open(path.join(d, "./extra_dict/li.png")))
        stopwords = set(STOPWORDS)
        stopwords.add("said")
        wc = WordCloud(font_path="./extra_dict/simhei.ttf", background_color="white",
                       max_words=2000, mask=alice_coloring,
                       stopwords=stopwords, max_font_size=40, random_state=42)
        # generate word cloud
        wc.generate(text)
        # create coloring from image
        image_colors = ImageColorGenerator(alice_coloring)
        # show
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.figure()
        # recolor wordcloud and show
        # we could also give color_func=image_colors directly in the constructor
        plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
        plt.axis("off")
        plt.figure()
        plt.imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear")
        plt.axis("off")
        wc.to_file(path.join(d, "./extra_dict/yun.png"))
        plt.show()
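One caveat: wc.generate() re-counts word frequencies from the raw text, so the integer weights written into cut_str.txt are treated as ordinary tokens (or dropped, depending on the wordcloud version) rather than used as weights. If you want jieba's TF-IDF weights to drive the font sizes directly, wordcloud offers generate_from_frequencies, which takes a word-to-weight dict. A sketch of that variant, reusing the names from make() above:

# alternative: feed jieba's weights straight into wordcloud
freqs = {}
with open('./extra_dict/cut_str.txt', encoding='utf-8') as f:
    for line in f:
        word, weight = line.rstrip('\n').split('\t')
        freqs[word] = int(weight)
wc = WordCloud(font_path="./extra_dict/simhei.ttf", background_color="white",
               max_words=2000, mask=alice_coloring)
wc.generate_from_frequencies(freqs)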