Python爬蟲(二十)_動態爬取影評信息
阿新 • • 發佈:2018-12-18
type 8.0 out span none function title hot output
本案例介紹從JavaScript中采集加載的數據。更多內容請參考:Python學習指南
#-*- coding:utf-8 -*-
import requests
import re
import time
import json
#數據下載器
class HtmlDownloader(object):
    """Fetch pages over HTTP while presenting a desktop-browser User-Agent."""

    def download(self, url, params=None, timeout=10):
        """Return the body of ``url`` decoded as UTF-8.

        :param url: address to fetch; ``None`` short-circuits to ``None``.
        :param params: optional mapping sent as the query string.
        :param timeout: seconds passed to ``requests.get`` as its timeout so a
            stalled server cannot block the crawl indefinitely.
        :return: the response text when the status code is 200, else ``None``.
        """
        if url is None:
            return None
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0'
        headers = {'User-Agent': user_agent}
        # requests treats params=None exactly like an omitted argument, so a
        # single call covers both the plain and the parameterised request.
        r = requests.get(url, headers=headers, params=params, timeout=timeout)
        if r.status_code != 200:
            return None
        # Force UTF-8 before reading .text instead of trusting the guessed charset.
        r.encoding = 'utf-8'
        return r.text
# Data parser
class HtmlParser(object):
    """Extract movie links and rating/box-office records from Mtime responses."""

    # Group 1 is the full movie URL, group 2 the numeric movie id. The dots are
    # escaped so they only match a literal "." in the host name.
    _MOVIE_URL_RE = re.compile(r'(http://movie\.mtime\.com/(\d+)/)')

    def parser_url(self, page_url, response):
        """Collect the distinct movie links found in a listing page.

        :param page_url: URL the page came from (not used by this method).
        :param response: page text, or ``None``/empty when nothing was fetched.
        :return: list of unique ``(url, movie_id)`` tuples in arbitrary order;
            an empty list when there is no text or no link.
        """
        if not response:
            return []
        # findall yields one (url, id) tuple per hit; the set drops duplicates.
        return list(set(self._MOVIE_URL_RE.findall(response)))

    def __rating_fields(self, detail):
        """Return the nine leading columns shared by every record.

        :param detail: the dict stored under the top-level ``value`` key.
        :return: ``(MovieId, movieTitle, RatingFinal, ROtherFinal,
            RPictureFinal, RDirectorFinal, RStoryFinal, Usercount,
            AttitudeCount)``.
        :raises AttributeError: when ``detail`` or its ``movieRating`` entry is
            not a dict; callers translate that into a ``None`` record.
        """
        rating = detail.get('movieRating')
        return (
            rating.get('MovieId'),
            detail.get('movieTitle'),
            rating.get('RatingFinal'),
            rating.get('ROtherFinal'),
            rating.get('RPictureFinal'),
            rating.get('RDirectorFinal'),
            rating.get('RStoryFinal'),
            rating.get('Usercount'),
            rating.get('AttitudeCount'),
        )

    def __parser_release(self, page_url, value):
        """Build the record for a movie that is already showing.

        :param page_url: URL the JSON came from (only used in the error message).
        :param value: decoded JSON payload.
        :return: 14-tuple ``(MovieId, movieTitle, RatingFinal, ROtherFinal,
            RPictureFinal, RDirectorFinal, RStoryFinal, Usercount,
            AttitudeCount, total box office, today's box office, Rank,
            ShowDays, 1)``, or ``None`` when a required section is missing.
        """
        try:
            detail = value.get('value')
            box_office = detail.get('boxOffice')
            # Amount and unit are concatenated; a missing (None) part raises
            # TypeError and the record is reported and dropped below.
            total = box_office.get('TotalBoxOffice') + box_office.get('TotalBoxOfficeUnit')
            today = box_office.get('TodayBoxOffice') + box_office.get('TodayBoxOfficeUnit')
            # dict.get never raises, so the fallback of 0 has to be given as
            # the default rather than through an exception handler.
            rank = box_office.get('Rank', 0)
            return self.__rating_fields(detail) + (
                total, today, rank, box_office.get('ShowDays'), 1)
        except (AttributeError, TypeError) as e:
            print('%s %s %s' % (e, page_url, value))
            return None

    def __parser_no_release(self, page_url, value, isRelease=0):
        """Build the record for a movie that is not showing yet.

        :param page_url: URL the JSON came from (only used in the error message).
        :param value: decoded JSON payload.
        :param isRelease: status code stored in the record; 0 for a movie that
            will not be released for a long time, 2 for one about to be released.
        :return: 14-tuple laid out like the released record, with a placeholder
            string in both box-office columns and 0 for the show-days column,
            or ``None`` when the rating section is missing.
        """
        try:
            detail = value.get('value')
            leading = self.__rating_fields(detail)
            try:
                rank = detail.get('hotValue').get('Ranking')
            except AttributeError:
                # No hotValue section: there is no ranking to report.
                rank = 0
            return leading + (u'無', u'無', rank, 0, isRelease)
        except (AttributeError, TypeError) as e:
            print('%s %s %s' % (e, page_url, value))
            return None

    def __load_payload(self, response):
        """Decode the JSON value that follows the first ``=`` in ``response``.

        The decoder stops at the end of the first complete JSON value, so a
        ``;`` inside a string and any trailing statements are both handled.

        :param response: raw response text, or ``None``.
        :return: the decoded value, or ``None`` when there is no text, no
            ``=``, or no valid JSON after it.
        """
        if not response:
            return None
        _head, sep, tail = response.partition('=')
        if not sep:
            return None
        try:
            # raw_decode does not skip leading whitespace itself.
            payload, _end = json.JSONDecoder().raw_decode(tail.lstrip())
        except ValueError as e:
            print(e)
            return None
        return payload

    def parser_json(self, page_url, response):
        """Turn a rating response into a record tuple.

        :param page_url: URL the response came from (used in error messages).
        :param response: raw response text containing ``... = <json>;``.
        :return: the 14-tuple built by the matching helper, or ``None`` when
            the response cannot be decoded or lacks the expected sections.
        """
        value = self.__load_payload(response)
        if value is None:
            return None
        try:
            detail = value.get('value')
            isRelease = detail.get('isRelease')
        except AttributeError as e:
            print(e)
            return None
        # Record status codes: 0 = will not be released for a long time,
        # 1 = already released, 2 = about to be released.
        if not isRelease:
            return self.__parser_no_release(page_url, value)
        if detail.get('hotValue') is None:
            # Flagged as released and carrying no hotValue: already showing.
            return self.__parser_release(page_url, value)
        return self.__parser_no_release(page_url, value, isRelease=2)
#數據存儲器
#數據存儲器將返回的數據插入mysql數據庫中,主要包括建表,插入和關閉數據庫等操作,表中設置了15個字段,用來存儲電影信息,代碼如下:
#這裏以後補充
class SpiderMain(object):
    """Drive the crawl: fetch a listing page, then each movie's rating data."""

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def crawl(self, root_url):
        """Fetch rating and box-office records for every movie linked from a page.

        :param root_url: listing page whose movie links are followed.
        :return: list of the non-``None`` records returned by the parser, one
            per movie that could be fetched and parsed; empty when the listing
            page could not be downloaded or contains no movie link.
        """
        content = self.downloader.download(root_url)
        if content is None:
            print("Crawl failed: could not download %s" % root_url)
            return []
        urls = self.parser.parser_url(root_url, content) or []
        rank_url = 'http://service.library.mtime.com/Movie.api?'
        records = []
        for url in urls:
            # url is a (movie_url, movie_id) tuple.
            try:
                # Timestamp plus a fixed "3282" suffix, sent as the "t" query
                # parameter. NOTE(review): the suffix is copied verbatim; its
                # meaning to the service is not visible here.
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                param = {
                    'Ajax_CallBack': 'true',
                    'Ajax_CallBackType': 'Mtime.Library.Services',
                    'Ajax_CallBackMethod': 'GetMovieOverviewRating',
                    'Ajax_CallBackArgument0': '%s' % (url[1]),
                    'Ajax_RequestUrl': '%s' % (url[0]),
                    'Ajax_CrossDomain': '1',
                    't': '%s' % t,
                }
                rank_content = self.downloader.download(rank_url, param)
                data = self.parser.parser_json(rank_url, rank_content)
            except Exception as e:
                # Best effort: one failing movie must not abort the whole crawl.
                print("Crawl failed: %s (%s)" % (url[0], e))
                continue
            # Storage is not implemented in this file, so the parsed records
            # are handed back to the caller instead of being discarded.
            if data is not None:
                records.append(data)
        return records
if __name__ == '__main__':
    # Script entry point: crawl the Mtime theater listing page for Nanjing.
    spider = SpiderMain()
    spider.crawl('http://theater.mtime.com/China_Jiangsu_Province_Nanjing/')
Python爬蟲(二十)_動態爬取影評信息