1. 程式人生 > >20170513爬取貓眼電影Top100

20170513爬取貓眼電影Top100

Tags: top, compile, bs4, etime, http, res, XML, quest

import json
import re
import requests
from bs4 import BeautifulSoup
from requests import RequestException
from multiprocessing import Pool
def get_one_page(url):
    """Fetch ``url`` and return the response body as text.

    The request is sent with a ``baiduspider+`` User-Agent header and a
    5-second timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The response text when the status code is 200; ``None`` for any
        other status code or when ``RequestException`` is raised.
    """
    headers = {'User-Agent': 'baiduspider+'}
    try:
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def parse_one_page(html):
    """Yield one dict per ``<dd>`` element found in ``html``.

    Each ``<dd>`` is read with CSS selectors (an earlier regex-based
    extraction was left disabled in favour of this approach).

    Args:
        html: Page markup to parse. ``None`` or an empty value yields
            nothing, so a failed download can be passed in safely.

    Yields:
        Dicts with the keys ``'rank'``, ``'name'``, ``'star'``,
        ``'releasetime'`` and ``'grade'``. ``'grade'`` is the text of the
        ``.integer`` element followed by the text of the ``.fraction``
        element; ``'star'`` has surrounding whitespace stripped.

    Raises:
        IndexError: If a ``<dd>`` lacks one of the selected elements.
    """
    if not html:
        # Nothing to parse; BeautifulSoup would raise on None.
        return
    for item in BeautifulSoup(html, 'lxml').find_all('dd'):
        rank = item.select('i')[0].text
        name = item.select('p > a')[0].text
        star = item.select('.star')[0].text.strip()
        releasetime = item.select('.releasetime')[0].text
        integer = item.select('.integer')[0].text
        fraction = item.select('.fraction')[0].text
        grade = integer + fraction
        yield {
            'rank': rank,
            'name': name,
            'star': star,
            'releasetime': releasetime,
            'grade': grade,
        }
def write_to_file(content, path='result.txt'):
    """Append ``content`` to ``path`` as a single line of JSON.

    Args:
        content: JSON-serialisable object (the callers pass a dict).
        path: File to append to; defaults to ``result.txt`` in the
            current working directory.
    """
    # Serialise to a string; ensure_ascii=False keeps non-ASCII text
    # readable in the output instead of \uXXXX escapes.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    # The with-statement closes the file, so no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
def main(offset):
    """Download one board page, print each parsed entry and save it.

    Args:
        offset: Value appended to the ``offset`` query parameter of the
            board URL.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on a non-200 status or request error;
        # skip this page instead of crashing inside the parser.
        print('failed to fetch', url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == "__main__":
    # Process the offsets 0, 10, ..., 90 one after another.
    for i in range(10):
        main(i * 10)
    # A multiprocessing variant was left disabled here:
    #     pool = Pool()
    #     pool.map(main, [i * 10 for i in range(10)])

20170513爬取貓眼電影Top100