python:爬取新浪新聞的內容
阿新 • • 發佈:2018-12-07
import requests import json from bs4 import BeautifulSoup import re import pandas import sqlite3 commenturl='https://comment.sina.com.cn/page/info?version=1&format=json' \ '&channel=gn&newsid=comos-{}&group=undefined&compress=0&' \ 'ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread' \ '=1&callback=jsonp_1543748934208' # 獲取評論數 def getCommentCounts(newsurl): #獲取沒則新聞的編號(正則表示式) m = re.search('doc-i(.*).shtml', newsurl) newsid = m.group(1) #格式化連結中的的大括號 comments = requests.get(commenturl.format(newsid)) #把加了js外套的json變成標準json jd = json.loads(comments.text.strip('jsonp_1543748934208()')) #獲取評論數 return jd['result']['count']['total']; # 提取每則新聞的內文 def getNewsDetail(newsurl): # 定義一個字典儲存資訊 result = {} rsp = requests.get(newsurl) rsp.encoding = 'utf-8' soup = BeautifulSoup(rsp.text,'html.parser') # 獲取標題 result['title'] = soup.select('.main-title')[0].text # 獲取日期 result['time'] = soup.select('.date')[0].text # 獲取來源 result['source'] = soup.select('.source')[0].text # 獲取內容 result['article'] = ' '.join([p.text.strip() for p in soup.select('#article p')[:-1]]) # 獲取編輯 result['editor'] = soup.select('.show_author')[0].text.lstrip('責任編輯:') # 獲取評論數 result['comment']=getCommentCounts(newsurl) return result # 獲取分頁連結 def parseListLinks(url): newsdetails = [] rsp = requests.get(url) # 把加了js外套的json變成標準json jsonUrl = '{' + str(rsp.text.lstrip('try{feedCardJsonpCallback(').rstrip(') ;}catch(e){};')) + '}}' jd=json.loads(jsonUrl) # 獲取每頁的新聞連結 for ent in jd['result']['data']: newsdetails.append(getNewsDetail(ent['url'])) return newsdetails url='https://feed.sina.com.cn/api/roll/' \ 'get?pageid=121&lid=1356&num=20&versionNumber=1.2.4' \ '&page={}&encode=utf-8&callback=feedCardJsonpCallback&_' news_total = [] for i in range(1,3):#爬取的頁數自己設定 # 格式化連結中的大括號 newsurl = url.format(i) newsary = parseListLinks(newsurl) news_total.extend(newsary) # 使用pandas模組使爬取到的資訊格式化 df = pandas.DataFrame(news_total) # 儲存為xlsx檔案 df.to_excel('news.xlsx')