
Python Crawler, Day 1, Improved Version [Study Notes]

In the evening I improved the script I wrote this afternoon; it can now fetch Sina news pages in bulk. To be fair, it is still borrowed work: someone else wrote it and I typed it out again.
from bs4 import BeautifulSoup as bs
import requests
import csv
import json, re
import pandas

#csv_file = open("Newslist.csv", "w", newline="", encoding="utf-8-sig")
#writer = csv.writer(csv_file)
#writer.writerow(["Title", "Time", "Content", "Source", "Link"])

news_total = []

# list-page API: the {} takes the page number
url = "https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page={}&encode=utf-8&callback"
# comment-count API: the {} takes the article ID
commentsURL = "https://comment.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1&callback"
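
The first template is the roll-news list API and the second is the comment-count API; the trailing callback parameter is left empty, which is presumably why json.loads works directly on the responses below. A quick smoke test of the list endpoint, reusing the imports and the url template above (a sketch; it assumes the API still behaves this way):

test = requests.get(url.format(1))                 # page=1 fills the {} slot
data = json.loads(test.text)["result"]["data"]
print(len(data), data[0]["url"])                   # entry count and the first article URL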


def parsListLinks(url):
    newsdetails = []                           # one dict per article
    res = requests.get(url)                    # fetch one page of the list API
    jd = json.loads(res.text)                  # parse the JSON payload
    for ent in jd["result"]["data"]:           # iterate over the article entries
        newsdetails.append(getNewsDetail(ent["url"]))  # scrape each article page
    return newsdetails
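
As written, parsListLinks trusts every request and every payload. A hedged variant with a timeout and basic error handling might look like the sketch below (not the original code; it reuses getNewsDetail, defined next):

def parsListLinksSafe(page_url):
    # Sketch only: same job as parsListLinks, with minimal error handling.
    newsdetails = []
    try:
        res = requests.get(page_url, timeout=10)
        res.raise_for_status()                      # surface HTTP errors
        entries = json.loads(res.text)["result"]["data"]
    except (requests.RequestException, ValueError, KeyError):
        return newsdetails                          # skip the page on any failure
    for ent in entries:
        newsdetails.append(getNewsDetail(ent["url"]))
    return newsdetails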

def getNewsDetail(newsurl):
    result = {}                                # fields for this article
    res = requests.get(newsurl)                # fetch the article page
    res.encoding = "utf-8"
    soup = bs(res.text, "html.parser")
    result["title"] = soup.select(".main-title")[0].text    # headline
    result["article"] = " ".join(
        p.text.strip() for p in soup.select(".article p")[:-1])  # body text, last paragraph dropped
    result["dt"] = soup.select(".date")[0].text              # publication time
    #print(result["dt"])
    #result["source"] = soup.select(".source")[0]["href"]    # source name and link
    result["comments"] = getCommentCounts(newsurl)           # comment count
    result["links"] = newsurl
    return result
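
The commented-out .source line shows the field that got dropped. Assuming the article page really marks the outlet with class source (an assumption about Sina's current markup, not something the original code confirms), a standalone version that tolerates missing elements could be:

def getSource(newsurl):
    # Sketch, assuming a .source element exists on Sina article pages.
    res = requests.get(newsurl)
    res.encoding = "utf-8"
    soup = bs(res.text, "html.parser")
    src = soup.select(".source")
    if not src:
        return None, None                      # element missing on this page
    return src[0].text, src[0].get("href")     # href is None unless it is an <a> tag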

def getCommentCounts(newsurl):
    m = re.search(r"doc-i(.+)\.shtml", newsurl)     # lift the article ID out of the URL
    newsid = m.group(1)
    comments = requests.get(commentsURL.format(newsid))
    jd = json.loads(comments.text)
    return jd["result"]["count"]["total"]            # total comment count
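
The comment API needs only the article ID, which the regex lifts out of the page URL: a doc-i… ID on the article side maps to comos-… on the comment side. A quick check with a made-up URL (the doc-i ID below is invented purely for illustration):

sample = "https://news.sina.com.cn/c/2019-04-01/doc-ihsxncvh1234567.shtml"  # hypothetical URL
m = re.search(r"doc-i(.+)\.shtml", sample)
print(m.group(1))        # -> hsxncvh1234567, spliced into comos-{} in commentsURL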


for i in range(1, 8):                          # pages 1 through 7 of the list API
    newsurl = url.format(i)
    newsary = parsListLinks(newsurl)
    news_total.extend(newsary)

df = pandas.DataFrame(news_total)
print(df.head())                               # quick look at the first rows
df.to_excel("news.xlsx")                       # needs openpyxl installed
#writer.writerow([title, date, article, source[0].text, source[0]["href"]])  # leftover from the CSV version
#csv_file.close()
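
For reference, the CSV route that the commented-out lines gesture at can be done with csv.DictWriter instead of hand-rolled rows; a minimal sketch, assuming news_total is the list of dicts built above:

with open("Newslist.csv", "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.DictWriter(
        f, fieldnames=["title", "dt", "article", "comments", "links"])
    writer.writeheader()                       # header row from the field names
    writer.writerows(news_total)               # one row per scraped article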