python爬蟲抓取新華網新聞並自動生成word文件
阿新 • 發佈:2019-01-22
認識一個在學校廣播電臺的學妹, 聽她說她們每天都要在網上找新聞, 國際, 國內, 和校內各五篇, 然後將其做成word文件列印, 個人感覺這種活非常浪費時間, 應該寫個程式幫我們自動完成. 後來沒事的時候就寫了這個python程式實現這個功能.
程式用python3.4編寫, 使用到了urllib, 加另外兩個三方庫, BeautifulSoup(解析網頁, 很好用的一個工具)和python-docx(生成word文件,這個庫功能還不是特別強大)
import urllib.request
import os
import shutil
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
# Output directory names (and .docx file-name prefixes) for the two news
# categories: "國內" = domestic news, "國際" = international news.
national = "國內"
international = "國際"
def get_html_soup(url):
    """Fetch *url* and return the decoded page as a BeautifulSoup tree.

    Returns None on any failure (network error, timeout, bad URL), after
    printing the exception so the script can keep going.
    """
    try:
        # Use the response as a context manager so the underlying HTTP
        # connection is always closed (the original leaked it).
        with urllib.request.urlopen(url, timeout=10) as response:
            html = response.read().decode(encoding="utf8", errors="ignore")
    except Exception as e:
        print(e, "please check your network situation")
        return None
    return BeautifulSoup(html, "lxml")
def page_url(url, page_num):
    """Build the URL for page *page_num* of a paginated article.

    Page 1 is the original URL itself; page N inserts "_N" just before
    the file extension, e.g. ".../c_123.htm" -> ".../c_123_2.htm".
    """
    if page_num == 1:
        return url
    dot = url.rfind(".")
    return f"{url[:dot]}_{page_num}{url[dot:]}"
def get_title_link(url, pattern):
    """Return a {title: href} dict of news links found on a listing page.

    *pattern* is a BeautifulSoup ``attrs`` dict identifying the <div>
    that contains the news list. Anchors with no visible text, no href
    attribute, or a non-absolute (no "http") href are skipped.
    Returns an empty dict when the page or the container is missing.
    """
    soup = get_html_soup(url)
    news_link = {}
    if soup is None:
        # Download failed upstream — return empty instead of crashing.
        return news_link
    container = soup.find("div", attrs=pattern)
    if container is None:
        return news_link
    for link in container.find_all("a"):
        href = link.get("href")  # None for anchor-only <a> tags — must guard
        if len(link.get_text().strip()) > 0 and href is not None and "http" in href:
            news_link[link.get_text()] = href
    return news_link
def get_news_body(url):
    """Fetch the article body at *url*, following "next page" links.

    Xinhua splits long articles across "xxx.htm", "xxx_2.htm", ... pages
    joined by a "下一頁" anchor; this loops until no such anchor remains.
    Returns a list of paragraph strings (each prefixed with a space), or
    None when any page failed to download or lacked the article <div>.
    """
    content_text = []
    page_num = 1
    first = True
    article_div = ""
    # Keep fetching while the previous page advertised a next page.
    while first or "下一頁</a>" in article_div:
        soup = get_html_soup(page_url(url, page_num))
        if soup is None:
            return None
        div = soup.find("div", attrs={"class": "article"})
        if div is None:
            # Article container missing — treat the whole article as a
            # failure (replaces the original's fragile " None" sentinel scan).
            return None
        article_div = str(div)
        for content in div.find_all("p"):
            text = content.get_text().strip()
            if text:
                content_text.append(" " + text)
        page_num += 1
        first = False
    return content_text
def clean_chinese_character(text):
    """Replace characters Windows forbids or mishandles in file names with '_'.

    Covers slashes, quotes, and full-width Chinese punctuation; returns
    the sanitized string. (The original docstring claimed '-' was used,
    but the replacement has always been '_'.)
    """
    unsafe = "/\"'·。?!,、;:‘’“”()…–.《》"
    # One C-level pass via str.translate instead of a quadratic += loop.
    return text.translate(str.maketrans(unsafe, "_" * len(unsafe)))
def create_docx(news_type, title, content):
    """Save one article as '<news_type>/<news_type>-<safe title>.docx'.

    *content* is the list of paragraph strings from get_news_body().
    The title is centered and bold; body text is 15pt Consolas.
    """
    document = Document()
    heading = document.add_paragraph()
    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
    # Bold must be set on a run — the original's paragraph.bold = True
    # is silently ignored by python-docx.
    heading.add_run(title).bold = True
    for paragraph_text in content:
        document.add_paragraph(paragraph_text)
    # All paragraphs share the default "Normal" style; configure its font
    # once instead of reassigning it on every loop iteration.
    font = document.styles["Normal"].font
    font.size = Pt(15)
    font.name = "consolas"
    name = news_type + "-" + clean_chinese_character(title) + ".docx"
    document.save(os.path.join(news_type, name))
########################################################################
# Listing pages and the attrs patterns that locate each page's news list.
national_news = "http://www.news.cn/politics/"
national_news_pattern = {"id": "hideData0"}
international_news = "http://www.news.cn/world/"
international_news_pattern = {"class": "partR domPC"}


def fetch_and_save(news_type, list_url, pattern):
    """Download every article linked from *list_url* and write each one
    as a .docx file into the *news_type* directory."""
    news_list = get_title_link(list_url, pattern)
    for title in news_list:
        paras = get_news_body(news_list[title])
        if paras:  # skip articles that failed to download or were empty
            print("writing:", clean_chinese_character(title), news_list[title])
            create_docx(news_type, title, paras)


# Recreate the output directories from scratch so stale files don't linger.
print("deleting old dir")
for directory in (international, national):
    if os.path.exists(directory):
        shutil.rmtree(directory)
    print("creating dir: ", directory)
    os.mkdir(directory)

print("\ngetting international news content")
fetch_and_save(international, international_news, international_news_pattern)
print("\ngetting national news content")
fetch_and_save(national, national_news, national_news_pattern)
print("All done, have a nice day")
程式碼的核心是beautifulsoup庫的使用, 這的確是個很好用的工具, 大家有興趣應該學學