1. 程式人生 > >Requests+正則表達式爬取貓眼電影

Requests+正則表達式爬取貓眼電影

none tle req boa cto asc sta int col

# encoding:utf-8
from requests.exceptions import RequestException
import requests
import re
import json
from multiprocessing import Pool

 8 def get_one_page(url):
 9     try:
10         response = requests.get(url)
11         if response.status_code == 200:
12             return
response.text 13 return None 14 except RequestException: 15 return None 16 17 def parse_one_page(html): 18 pattern = re.compile(<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a 19 +.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>
20 +.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>,re.S) 21 items = re.findall(pattern, html) 22 # print(items) 23 for item in items: 24 yield { 25 index: item[0], 26 image: item[1], 27
title: item[2], 28 actor: item[3].strip()[3:], 29 time: item[4].strip()[5:], 30 score: item[5]+item[6] 31 } 32 33 def write_to_file(content): 34 with open(MaoyanTop100.txt, a, encoding=utf-8) as f: 35 f.write(json.dumps(content, ensure_ascii=False)+\n) 36 f.close() 37 38 def main(offset): 39 url = "http://maoyan.com/board/4?offset="+str(offset) 40 html = get_one_page(url) 41 # print(html) 42 # parse_one_page(html) 43 for item in parse_one_page(html): 44 print(item) 45 write_to_file(item) 46 47 if __name__ == __main__: 48 pool = Pool() 49 # for i in range(10): 50 # main(i*10) 51 # 加快效率 52 pool.map(main, [i*10 for i in range(10)])

效果圖:

技術分享圖片

Requests+正則表達式爬取貓眼電影