
Scraping Xinhua News with a Python crawler and generating Word documents automatically

I know a girl from the campus radio station, a year below me. She told me that every day they have to find news online (five articles each from the international, national, and campus categories), turn them into Word documents, and print them. That kind of chore seemed like a real waste of time to me; a program ought to do it automatically. So, with some spare time later on, I wrote this Python program to do exactly that.

The program is written in Python 3.4. It uses urllib from the standard library plus two third-party libraries: BeautifulSoup (for parsing web pages; a very handy tool) and python-docx (for generating Word documents; its feature set is still not particularly rich).
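Both third-party libraries install with pip (beautifulsoup4, python-docx, plus lxml as the parser backend). If you want to try them in isolation before reading the full script, here is a minimal, self-contained sketch; the HTML fragment and the output filename demo.docx are made up for illustration:

from bs4 import BeautifulSoup
from docx import Document

# a made-up HTML fragment, just to exercise the parser
html = '<div class="article"><p>Hello, 新華網</p></div>'
soup = BeautifulSoup(html, "lxml")
text = soup.find("div", attrs={"class": "article"}).get_text().strip()

# write the extracted text into a one-paragraph Word file
document = Document()
document.add_paragraph(text)
document.save("demo.docx")  # hypothetical output name
print("saved demo.docx with text:", text)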

import urllib.request
import os
import shutil
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH

national = "國內"        # output directory for national news
international = "國際"   # output directory for international news

def get_html_soup(url):
    """Fetch a page and return its decoded HTML as a BeautifulSoup object."""
    try:
        response = urllib.request.urlopen(url, timeout=10)
        html = response.read().decode(encoding="utf8", errors="ignore")
    except Exception as e:
        print(e, "please check your network situation")
        return None
    return BeautifulSoup(html, "lxml")

def page_url(url, page_num):
    """Build the URL of a given page of a multi-page article."""
    if page_num == 1:
        return url
    index = url.rfind(".")
    return url[:index] + "_" + str(page_num) + url[index:]

def get_title_link(url, pattern):
    """Collect the titles and article links from a news list page."""
    news_link = {}
    soup = get_html_soup(url)
    if soup is None:  # network failure: return an empty mapping
        return news_link
    scroll_list = BeautifulSoup(str(soup.find("div", attrs=pattern)), "lxml")
    for link in scroll_list.find_all("a"):
        if len(link.get_text().strip()) > 0 and link.get("href", "").find("http") != -1:
            news_link[link.get_text()] = link.get("href")
    return news_link

def get_news_body(url):
    """Scrape the body text of an article, following its pagination."""
    first = True
    content_text = []
    page_num = 1
    article_div = ""
    # loop to handle articles split across several pages; the site marks
    # them with a "下一頁" (next page) link
    while first or article_div.find("下一頁</a>") != -1:
        soup = get_html_soup(page_url(url, page_num))
        if soup is None:
            return None
        article_div = str(soup.find("div", attrs={"class": "article"}))
        soup = BeautifulSoup(article_div, "lxml")
        for content in soup.find_all("p"):
            if len(content.get_text().strip()) > 0:
                content_text.append("    " + content.get_text().strip())
        page_num += 1
        first = False
    for x in content_text:
        if x == "    None":
            return None
    return content_text

def clean_chinese_character(text):
    """Replace special Chinese punctuation with '_';
    otherwise Windows cannot use some of these characters in file paths."""
    chars = ["/", "\"", "'", "·", "。", "?", "!", ",", "、", ";", ":",
             "‘", "’", "“", "”", "(", ")", "…", "–", ".", "《", "》"]
    new_text = ""
    for ch in text:
        new_text += ch if ch not in chars else "_"
    return new_text

def create_docx(news_type, title, content):
    """Write the article content to a Word file using python-docx."""
    document = Document()
    paragraph = document.add_paragraph(title)
    paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    paragraph.runs[0].bold = True  # bold is set on the run, not the paragraph
    for x in content:
        paragraph = document.add_paragraph(x)
        font = paragraph.style.font  # note: this edits the shared paragraph style
        font.size = Pt(15)
        font.name = "consolas"
    name = news_type + "-" + clean_chinese_character(title) + ".docx"
    document.save(news_type + "/" + name)

########################################################################
national_news = "http://www.news.cn/politics/"
national_news_pattern = {"id": "hideData0"}
international_news = "http://www.news.cn/world/"
international_news_pattern = {"class": "partR domPC"}

# delete the old directories
print("deleting old dir")
if os.path.exists(international):
    shutil.rmtree(international)
if os.path.exists(national):
    shutil.rmtree(national)

# create fresh directories
print("creating dir: ", international)
os.mkdir(international)
print("creating dir: ", national)
os.mkdir(national)

# fetch the titles and links of the news articles
international_news_list = get_title_link(international_news, international_news_pattern)
print("\ngetting international news content")
# fetch each article body and write it to a file
for x in international_news_list:
    paras = get_news_body(international_news_list[x])
    if paras is not None and len(paras) > 0:
        print("writing:", clean_chinese_character(x), international_news_list[x])
        create_docx(international, x, paras)

national_news_list = get_title_link(national_news, national_news_pattern)
print("\ngetting national news content")
for x in national_news_list:
    paras = get_news_body(national_news_list[x])
    if paras is not None and len(paras) > 0:
        print("writing:", clean_chinese_character(x), national_news_list[x])
        create_docx(national, x, paras)

print("All done, have a nice day")
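To make the two small string helpers concrete, here is what they return for some sample inputs, assuming the definitions above have been run (the URL and headline are made up for illustration):

# pagination: page 1 keeps the URL, later pages insert "_<n>" before the extension
print(page_url("http://www.news.cn/world/article.htm", 1))
# -> http://www.news.cn/world/article.htm
print(page_url("http://www.news.cn/world/article.htm", 3))
# -> http://www.news.cn/world/article_3.htm

# punctuation that Windows rejects in file names is replaced with "_"
print(clean_chinese_character("中美關係:新的一頁?"))
# -> 中美關係_新的一頁_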

The core of this code is the use of the BeautifulSoup library. It really is a great tool, and well worth learning if you're interested.
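As a quick taste, here is a minimal, self-contained sketch of the three BeautifulSoup calls the script leans on (find, find_all, and get_text); the HTML fragment is made up for illustration:

from bs4 import BeautifulSoup

html = """
<div class="article">
  <p>First paragraph.</p>
  <p><a href="http://www.news.cn/next.htm">下一頁</a></p>
</div>
"""
soup = BeautifulSoup(html, "lxml")

# find() returns the first matching tag (or None if nothing matches)
article = soup.find("div", attrs={"class": "article"})

# find_all() returns every matching tag inside it
for p in article.find_all("p"):
    print(p.get_text().strip())  # get_text() drops the markup, keeping only text

# tag attributes are read with get()
link = article.find("a")
print(link.get("href"))  # -> http://www.news.cn/next.htm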