1. 程式人生 > >python爬蟲例項 python爬蟲例項

python爬蟲例項 python爬蟲例項

python爬蟲例項

 

這裡有兩個爬蟲的例項,是剛開始學python用的,一個是爬取京東茅臺酒評論的,另一個是爬取新浪網國內新聞的,兩個都是網上的教程裡邊的,程式碼略微有些不同,供參考學習。

都可以在 Anaconda 裡跑

複製程式碼
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import json
import pandas
# Accumulator for the scraped articles: the driver loop extends it with the
# list returned by parseListLinks for every list page.
news_total=[]
# Template for the comment-info endpoint; the `{}` placeholder is filled with
# the news id extracted from an article URL (see getCommentCounts).
commentURL='http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
# Template for the rolling-news list endpoint; the `{}` placeholder is filled
# with the page number. The query string asks for a JSONP response wrapped in
# a `newsloadercallback(...)` call.
url='http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1509373193047'
def parseListLinks(url):
    """Fetch one page of the news list and scrape every article on it.

    Args:
        url: Fully formatted list-endpoint URL (page number already filled in).

    Returns:
        A list with one dict per article, as produced by getNewsDetail.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
        ValueError: if the unwrapped payload is not valid JSON.
        KeyError: if the JSON lacks the ``result``/``data``/``url`` keys.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    res = requests.get(url, timeout=10)
    payload = res.text.strip()

    # The endpoint answers with JSONP: newsloadercallback(<json>);
    # str.lstrip/rstrip take a *set of characters*, not a prefix/suffix, so
    # the wrapper is removed explicitly instead.
    prefix = 'newsloadercallback('
    if payload.startswith(prefix):
        payload = payload[len(prefix):]
    if payload.endswith(';'):
        payload = payload[:-1].rstrip()
    if payload.endswith(')'):
        payload = payload[:-1]

    jd = json.loads(payload)
    return [getNewsDetail(ent['url']) for ent in jd['result']['data']]
        
def getNewsDetail(newsurl):
    """Download one article page and extract its fields.

    Args:
        newsurl: URL of the article page.

    Returns:
        A dict with the keys ``title``, ``newssource``, ``dt`` (formatted as
        ``%Y-%m-%d-%H:%M``), ``article``, ``editor`` and ``comments``.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
        IndexError: if one of the expected page elements is missing.
        ValueError: if the timestamp text does not match the expected format.
    """
    result = {}
    # A timeout keeps a stalled connection from hanging the whole crawl.
    res = requests.get(newsurl, timeout=10)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    result['title'] = soup.select('#artibodyTitle')[0].text
    result['newssource'] = soup.select('.time-source span a')[0].text

    # The first child node of .time-source is the timestamp text.
    timesource = soup.select('.time-source')[0].contents[0].strip()
    dt1 = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    result['dt'] = dt1.strftime('%Y-%m-%d-%H:%M')

    # The last <p> is skipped -- presumably it holds the editor credit rather
    # than article text; confirm against the page markup.
    paragraphs = soup.select('#artibody p')[:-1]
    result['article'] = ' '.join([p.text.strip() for p in paragraphs])

    # str.strip(chars) removes any of the given characters from both ends, so
    # it could also eat characters that belong to the editor's name. Remove
    # the label as a prefix instead.
    # NOTE(review): the label literal is kept as found in this file; verify
    # that it matches the script/colon actually used by the page.
    editor = soup.select('.article-editor')[0].text.strip()
    label = '責任編輯'
    if editor.startswith(label):
        editor = editor[len(label):].lstrip(':').strip()
    result['editor'] = editor

    result['comments'] = getCommentCounts(newsurl)
    print('獲得一條新聞')
    return result
       
    
def getCommentCounts(newsurl):
    """Return the total comment count for the article at ``newsurl``.

    The news id is taken from the ``doc-i<id>.shtml`` part of the URL and
    substituted into the module-level ``commentURL`` template.

    Args:
        newsurl: URL of the article page.

    Returns:
        The value found at ``result.count.total`` in the comment-info JSON.

    Raises:
        ValueError: if no news id can be found in ``newsurl`` or the payload
            is not valid JSON.
        requests.RequestException: if the HTTP request fails or times out.
        KeyError: if the JSON lacks the expected keys.
    """
    # Raw string with an escaped dot so that only a literal ".shtml" matches.
    m = re.search(r'doc-i(.+)\.shtml', newsurl)
    if m is None:
        raise ValueError('cannot find news id in url: {}'.format(newsurl))
    newsid = m.group(1)

    # A timeout keeps a stalled connection from hanging the whole crawl.
    comments = requests.get(commentURL.format(newsid), timeout=10)

    # The response is a JavaScript assignment: var data=<json>
    # str.strip(chars) strips a character set, not a prefix, so the prefix is
    # removed explicitly.
    payload = comments.text.strip()
    prefix = 'var data='
    if payload.startswith(prefix):
        payload = payload[len(prefix):]

    jd = json.loads(payload)
    return jd['result']['count']['total']

# Crawl list pages 1 through 7, collecting every article dict in news_total.
for i in range(1, 8):
    print(''.join(('正在爬取第', str(i), '頁......')))
    news_total.extend(parseListLinks(url.format(i)))
print('抓取結束')

# Dump everything that was collected into an Excel workbook.
df = pandas.DataFrame(news_total)
df.to_excel('news.xlsx')
複製程式碼

 

複製程式碼
import requests 
import re
import json
import time
import xlwt

#
# Spreadsheet configuration.
# Original author's note: no need to understand these four lines; they are
# meant to allow writing Chinese text into the sheet.
# NOTE(review): `style` is never passed to any ws.write call in this script.
#
font = xlwt.Font()
font.name = 'SimSun'
style = xlwt.XFStyle()
style.font = font

# Create the workbook and add a single sheet to it.
w = xlwt.Workbook(encoding='utf-8')
ws = w.add_sheet('sheet 1', cell_overwrite_ok=True)

# Index of the next sheet row to be written (row 0 holds the header).
row = 1

#
# Write the header row: one column per comment field.
#
for col, title in enumerate((
        'content',
        'userClientShow',
        'creationTime',
        'userLevelName',
        'productColor',
        'userLevelId',
        'score',
        'referenceName',
        'referenceTime',
        'isMobile',
        'nickname',
)):
    ws.write(0, col, title)

#
# Takes a parsed JSON object and writes its contents into the sheet,
# one page of comments per call.
#
def write_json_to_xls(dat):
    """Append every comment in ``dat['comments']`` to the sheet.

    Each comment occupies one row, starting at the module-level ``row``
    counter, which is advanced by one per comment written.

    Args:
        dat: Parsed JSON mapping with a ``comments`` list of dicts.

    Raises:
        KeyError: if ``dat`` has no ``comments`` key or a comment lacks one
            of the expected fields.
    """
    global row
    # Field names in column order (column 0 first).
    fields = (
        'content',
        'userClientShow',
        'creationTime',
        'userLevelName',
        'productColor',
        'userLevelId',
        'score',
        'referenceName',
        'referenceTime',
        'isMobile',
        'nickname',
    )
    for comment in dat['comments']:
        for col, key in enumerate(fields):
            ws.write(row, col, comment[key])
        row += 1

#
# Fetch comment pages 1 through 10 and write each one into the sheet.
#
for i in range(1, 10 + 1):
    url = 'https://club.jd.com/comment/productPageComments.action?productId=1475512465&score=0&sortType=5&page=%d&pageSize=100&isShadowSku=0&fold=' % i
    try:
        # A timeout keeps a stalled connection from hanging the script.
        json_req = requests.get(url, timeout=10)
        # Report HTTP errors directly instead of trying to parse an error page.
        json_req.raise_for_status()
        dat = json_req.json()
        write_json_to_xls(dat)
        print(u'寫入一頁資料')
    except Exception as e:
        # Best-effort crawl: report the failed page and carry on with the next.
        print(u'獲取資料失敗資料', e)
    # Short pause between requests.
    time.sleep(0.5)


# Save the collected rows to disk.
w.save('result.xls')
複製程式碼

 

 

這裡有兩個爬蟲的例項,是剛開始學python用的,一個是爬取京東茅臺酒評論的,另一個是爬取新浪網國內新聞的,兩個都是網上的教程裡邊的,程式碼略微有些不同,供參考學習。

都可以在 Anaconda 裡跑

複製程式碼
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import json
import pandas
# Accumulator for the scraped articles: the driver loop extends it with the
# list returned by parseListLinks for every list page.
news_total=[]
# Template for the comment-info endpoint; the `{}` placeholder is filled with
# the news id extracted from an article URL (see getCommentCounts).
commentURL='http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
# Template for the rolling-news list endpoint; the `{}` placeholder is filled
# with the page number. The query string asks for a JSONP response wrapped in
# a `newsloadercallback(...)` call.
url='http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1509373193047'
def parseListLinks(url):
    """Fetch one page of the news list and scrape every article on it.

    Args:
        url: Fully formatted list-endpoint URL (page number already filled in).

    Returns:
        A list with one dict per article, as produced by getNewsDetail.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
        ValueError: if the unwrapped payload is not valid JSON.
        KeyError: if the JSON lacks the ``result``/``data``/``url`` keys.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    res = requests.get(url, timeout=10)
    payload = res.text.strip()

    # The endpoint answers with JSONP: newsloadercallback(<json>);
    # str.lstrip/rstrip take a *set of characters*, not a prefix/suffix, so
    # the wrapper is removed explicitly instead.
    prefix = 'newsloadercallback('
    if payload.startswith(prefix):
        payload = payload[len(prefix):]
    if payload.endswith(';'):
        payload = payload[:-1].rstrip()
    if payload.endswith(')'):
        payload = payload[:-1]

    jd = json.loads(payload)
    return [getNewsDetail(ent['url']) for ent in jd['result']['data']]
        
def getNewsDetail(newsurl):
    """Download one article page and extract its fields.

    Args:
        newsurl: URL of the article page.

    Returns:
        A dict with the keys ``title``, ``newssource``, ``dt`` (formatted as
        ``%Y-%m-%d-%H:%M``), ``article``, ``editor`` and ``comments``.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
        IndexError: if one of the expected page elements is missing.
        ValueError: if the timestamp text does not match the expected format.
    """
    result = {}
    # A timeout keeps a stalled connection from hanging the whole crawl.
    res = requests.get(newsurl, timeout=10)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    result['title'] = soup.select('#artibodyTitle')[0].text
    result['newssource'] = soup.select('.time-source span a')[0].text

    # The first child node of .time-source is the timestamp text.
    timesource = soup.select('.time-source')[0].contents[0].strip()
    dt1 = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    result['dt'] = dt1.strftime('%Y-%m-%d-%H:%M')

    # The last <p> is skipped -- presumably it holds the editor credit rather
    # than article text; confirm against the page markup.
    paragraphs = soup.select('#artibody p')[:-1]
    result['article'] = ' '.join([p.text.strip() for p in paragraphs])

    # str.strip(chars) removes any of the given characters from both ends, so
    # it could also eat characters that belong to the editor's name. Remove
    # the label as a prefix instead.
    # NOTE(review): the label literal is kept as found in this file; verify
    # that it matches the script/colon actually used by the page.
    editor = soup.select('.article-editor')[0].text.strip()
    label = '責任編輯'
    if editor.startswith(label):
        editor = editor[len(label):].lstrip(':').strip()
    result['editor'] = editor

    result['comments'] = getCommentCounts(newsurl)
    print('獲得一條新聞')
    return result
       
    
def getCommentCounts(newsurl):
    """Return the total comment count for the article at ``newsurl``.

    The news id is taken from the ``doc-i<id>.shtml`` part of the URL and
    substituted into the module-level ``commentURL`` template.

    Args:
        newsurl: URL of the article page.

    Returns:
        The value found at ``result.count.total`` in the comment-info JSON.

    Raises:
        ValueError: if no news id can be found in ``newsurl`` or the payload
            is not valid JSON.
        requests.RequestException: if the HTTP request fails or times out.
        KeyError: if the JSON lacks the expected keys.
    """
    # Raw string with an escaped dot so that only a literal ".shtml" matches.
    m = re.search(r'doc-i(.+)\.shtml', newsurl)
    if m is None:
        raise ValueError('cannot find news id in url: {}'.format(newsurl))
    newsid = m.group(1)

    # A timeout keeps a stalled connection from hanging the whole crawl.
    comments = requests.get(commentURL.format(newsid), timeout=10)

    # The response is a JavaScript assignment: var data=<json>
    # str.strip(chars) strips a character set, not a prefix, so the prefix is
    # removed explicitly.
    payload = comments.text.strip()
    prefix = 'var data='
    if payload.startswith(prefix):
        payload = payload[len(prefix):]

    jd = json.loads(payload)
    return jd['result']['count']['total']

# Crawl list pages 1 through 7, collecting every article dict in news_total.
for i in range(1, 8):
    print(''.join(('正在爬取第', str(i), '頁......')))
    news_total.extend(parseListLinks(url.format(i)))
print('抓取結束')

# Dump everything that was collected into an Excel workbook.
df = pandas.DataFrame(news_total)
df.to_excel('news.xlsx')
複製程式碼

 

複製程式碼
import requests 
import re
import json
import time
import xlwt

#
# Spreadsheet configuration.
# Original author's note: no need to understand these four lines; they are
# meant to allow writing Chinese text into the sheet.
# NOTE(review): `style` is never passed to any ws.write call in this script.
#
font = xlwt.Font()
font.name = 'SimSun'
style = xlwt.XFStyle()
style.font = font

# Create the workbook and add a single sheet to it.
w = xlwt.Workbook(encoding='utf-8')
ws = w.add_sheet('sheet 1', cell_overwrite_ok=True)

# Index of the next sheet row to be written (row 0 holds the header).
row = 1

#
# Write the header row: one column per comment field.
#
for col, title in enumerate((
        'content',
        'userClientShow',
        'creationTime',
        'userLevelName',
        'productColor',
        'userLevelId',
        'score',
        'referenceName',
        'referenceTime',
        'isMobile',
        'nickname',
)):
    ws.write(0, col, title)

#
# Takes a parsed JSON object and writes its contents into the sheet,
# one page of comments per call.
#
def write_json_to_xls(dat):
    """Append every comment in ``dat['comments']`` to the sheet.

    Each comment occupies one row, starting at the module-level ``row``
    counter, which is advanced by one per comment written.

    Args:
        dat: Parsed JSON mapping with a ``comments`` list of dicts.

    Raises:
        KeyError: if ``dat`` has no ``comments`` key or a comment lacks one
            of the expected fields.
    """
    global row
    # Field names in column order (column 0 first).
    fields = (
        'content',
        'userClientShow',
        'creationTime',
        'userLevelName',
        'productColor',
        'userLevelId',
        'score',
        'referenceName',
        'referenceTime',
        'isMobile',
        'nickname',
    )
    for comment in dat['comments']:
        for col, key in enumerate(fields):
            ws.write(row, col, comment[key])
        row += 1

#
# Fetch comment pages 1 through 10 and write each one into the sheet.
#
for i in range(1, 10 + 1):
    url = 'https://club.jd.com/comment/productPageComments.action?productId=1475512465&score=0&sortType=5&page=%d&pageSize=100&isShadowSku=0&fold=' % i
    try:
        # A timeout keeps a stalled connection from hanging the script.
        json_req = requests.get(url, timeout=10)
        # Report HTTP errors directly instead of trying to parse an error page.
        json_req.raise_for_status()
        dat = json_req.json()
        write_json_to_xls(dat)
        print(u'寫入一頁資料')
    except Exception as e:
        # Best-effort crawl: report the failed page and carry on with the next.
        print(u'獲取資料失敗資料', e)
    # Short pause between requests.
    time.sleep(0.5)


# Save the collected rows to disk.
w.save('result.xls')
複製程式碼