# Python crawler, day 1 — improved version (study notes).
# Evening revision of the afternoon script: it can now batch-fetch Sina news
# list pages. (Adapted from someone else's published example.)
from bs4 import BeautifulSoup as bs
import requests
import csv
import json, re
import pandas
#csv_file = open("Newslist.csv","w",newline="",encoding="utf-8-sig")
#writer = csv.writer(csv_file)
#writer.writerow(["標題","時間","內容","來源","連結"])
news_total=[]
url = "https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page={}&encode=utf-8&callback"
commentsURL = "https://comment.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1&callback"
def parsListLinks(url):
newsdetails = []; #建立陣列裝新聞資訊;
res = requests.get(url); #載入主頁URL分頁的資訊;
jd = json.loads(res.text); #裝載URL的json資訊。;
for ent in jd["result"]["data"]: #遍歷Data資料;
newsdetails.append(getNewsDetail(ent["url"])) #將新聞頁URL傳給“getNewDetail”;
return newsdetails
def getNewsDetail(newsurl):
result = {} #建立包庫
res = requests.get(newsurl); #獲取頁面地址
res.encoding = "utf-8"
soup = bs(res.text,"html.parser")
result["title"] = soup.select(".main-title")[0].text #標題
result["article"] = " ".join([p.text.strip()for p in soup.select(".article p")[:-1]]) #正文
result["dt"] = soup.select(".date")[0].text #時間
#print(result["dt"])
#result["source"] = soup.select(".source")[0]["href"]; #來源和連線
result['comments'] = getCommentCounts(newsurl); #評論數
result["links"] = newsurl;
return result
def getCommentCounts(newsurl):
m = re.search("doc-i(.*).shtml",newsurl)
newsid = m.group(1)
comments = requests.get(commentsURL.format(newsid));
jd = json.loads(comments.text)
return jd["result"]["count"]["total"]
for i in range(1,8):
newsurl = url.format(i)
newsary= parsListLinks(newsurl)
news_total.extend(newsary);
df=pandas.DataFrame(news_total)
df.head()
df.to_excel('news.xlsx')
#writer.writerow([title,date,article,source[0].text,source[0]["href"]])
#csv_file.close();