1. 程式人生 > >學了2天的Python,自己寫了個簡單的爬蟲,可是爬蟲有什麼用呢?

學了2天的Python,自己寫了個簡單的爬蟲,可是爬蟲有什麼用呢?

# -*- coding: UTF-8 -*-
import requests
import pandas
import   re
import json
from bs4 import BeautifulSoup
import  openpyxl





# Seconds to wait for any single HTTP response before giving up, so one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10


def stripJsonpCallback(text, callback='newsloadercallback'):
    """Return the JSON payload wrapped inside a JSONP response body.

    The list API answers with ``newsloadercallback({...});`` (possibly
    surrounded by whitespace).  ``str.lstrip``/``str.rstrip`` strip
    *character sets*, not a prefix/suffix, so the wrapper is removed here
    by explicit prefix and suffix checks instead.

    Args:
        text: Raw response body.
        callback: Name of the JSONP callback function wrapping the payload.

    Returns:
        The text between ``callback(`` and the closing ``)`` / ``);``.  When
        *text* does not start with the callback prefix it is returned with
        only the surrounding whitespace removed.
    """
    body = text.strip()
    prefix = callback + '('
    if body.startswith(prefix):
        # Drop the prefix, then an optional trailing ';' and exactly one ')'.
        body = body[len(prefix):].rstrip().rstrip(';').rstrip()
        if body.endswith(')'):
            body = body[:-1]
    return body


def parseLinkedNews(url):
    """Fetch one page of the news list API and scrape every article on it.

    Args:
        url: Fully formatted list-API URL that returns a JSONP document
            whose payload has the entries under ``['result']['data']``.

    Returns:
        A list with one dict per entry, as produced by ``getNewsDetail``.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        ValueError: If the response payload is not valid JSON.
        KeyError: If the payload lacks the expected keys.
    """
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    res.encoding = 'utf-8'
    jd = json.loads(stripJsonpCallback(res.text))
    return [getNewsDetail(ent['url']) for ent in jd['result']['data']]


def getNewsDetail(url):
    """Download a single article page and extract its fields.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys ``title``, ``article``, ``time``, ``source``
        and ``url``.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        IndexError: If the page lacks one of the expected elements
            (``#artibodyTitle``, ``.time-source``, ``.time-source span a``).
    """
    # The parser name belongs to BeautifulSoup only; the second positional
    # argument of requests.get is ``params`` and would be appended to the
    # URL as a query string.
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    result = {}
    # Headline of the article.
    result['title'] = soup.select('#artibodyTitle')[0].text
    # Article body: every paragraph except the last one, each followed by a
    # newline.
    result['article'] = ''.join(
        p.text + '\n' for p in soup.select('#artibody p')[:-1]
    )
    # Publication time: first child node of the time/source element, stored
    # as a plain str so the row does not keep the parse tree alive.
    result['time'] = str(soup.select('.time-source')[0].contents[0])
    # Name of the news source.
    result['source'] = soup.select('.time-source span a')[0].text
    # Link the article was fetched from.
    result['url'] = url
    return result


def main():
    """Crawl list pages 1 through 9 and write all articles to an Excel file."""
    url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||' \
          '=mtjj&level==1||=2&show_ext=1&show_all=1' \
          '&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1512199560142'
    news_total = []
    for i in range(1, 10):
        newsurl = url.format(i)
        news_total.extend(parseLinkedNews(newsurl))
    df = pandas.DataFrame(news_total)
    df.to_excel('C:/news.xlsx')


if __name__ == '__main__':
    main()