1. 程式人生 > >requests+re+multiprocessing爬取貓眼電影top100

requests+re+multiprocessing爬取貓眼電影top100

core except exce mat RoCE yield tle itl pan

技術分享圖片
import re
import json
import requests
from multiprocessing import Pool
from requests.exceptions import RequestException


def get_one_page(url, timeout=10):
    """
    Download a single page and return its HTML.

    :param url: address of the page to fetch
    :param timeout: seconds to wait for the server before giving up, so a
        stalled connection cannot hang a worker process forever
    :return: the response body as text when the status code is 200;
        ``None`` for any other status or when the request raises
        ``RequestException``
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


# One match per <dd> entry of the board page. The seven groups are, in order:
# rank, image URL, title, cast line, release line, integer part of the score,
# fractional part of the score. re.S lets ".*?" run across line breaks.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>'
    r'.*?poster-default.*?src="(.*?)"'
    r'.*?name"><a.*?>(.*?)</a>'
    r'.*?star">(.*?)</p>'
    r'.*?releasetime">(.*?)\s*</p>'
    r'.*?integer">(.*?)</i>'
    r'.*?fraction">(.*?)</i>',
    re.S,
)


def parse_one_page(html):
    """
    Extract the movie entries from one board page.

    :param html: page source as returned by ``get_one_page``; may be
        ``None`` or empty when the download failed
    :return: generator of dicts with the keys ``index``, ``image``,
        ``title``, ``star``, ``time`` and ``score`` (all string values);
        yields nothing when ``html`` is empty or ``None``
    """
    if not html:
        # A failed download yields None; treat it as "no entries" instead of
        # letting the regex engine raise TypeError.
        return
    for index, image, title, star, time, integer, fraction in \
            _MOVIE_PATTERN.findall(html):
        yield {
            "index": index,
            "image": image,
            "title": title,
            # Drops the first 3 characters of the cast line and the first 5
            # of the release line -- presumably the leading field labels;
            # confirm against the live page markup.
            "star": star.strip()[3:],
            "time": time.strip()[5:],
            "score": integer + fraction,
        }


def save_to_file(content):
    """
    Append one record to ``maoyan.txt`` as a single JSON line.

    :param content: JSON-serialisable object (here: one movie dict)
    :return: None
    """
    with open("maoyan.txt", "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


def main(offset):
    """
    Fetch the board page at ``offset`` and save every entry found on it.

    :param offset: value substituted into the ``offset`` query parameter
    :return: None
    """
    url = "https://maoyan.com/board/4?offset={}".format(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        save_to_file(item)


if __name__ == "__main__":
    # Request the ten pages (offsets 0, 10, ..., 90) from worker processes so
    # the network waits overlap. Pool.map blocks until every task has
    # finished; the with-block then shuts the pool down.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
View Code

requests+re+multiprocessing爬取貓眼電影top100