1. 程式人生 > python爬蟲——requests抓取某電影網站top100

python爬蟲——requests抓取某電影網站top100

今天閒的沒事,學習了一下爬蟲方面的知識,然後用requests庫實現了抓取貓眼網站top100電影,還是挺有意思的。

最近用到python比較多,也算是加強了python的運用吧 :-)

import  requests
from requests.exceptions import RequestException
import re
import json
from multiprocessing import pool  # 引入程序池,多程序抓取

def get_one_page(url):
    """Download *url* and return its HTML text, or None on any failure.

    Returns None both for a non-200 status code and for any
    requests-level error (timeout, DNS failure, connection reset, ...).
    """
    # A desktop-browser User-Agent: the site blocks obvious bot requests.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers)
        # Check the status code; only a 200 answer counts as success.
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


# Compiled once at import time instead of on every page parse.
# Each <dd> entry on the board page carries: rank, poster URL, title,
# cast line, release-date line, and a score split into integer/fraction.
# re.S lets '.' match newlines inside the markup.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)


def parse_source_page(html):
    """Yield one dict per movie entry found in *html*.

    Keys: index, image_url, movie_name, stars, show_time, rate
    (rate is the integer and fraction parts of the score joined).
    """
    for item in _MOVIE_PATTERN.findall(html):
        yield {
            "index": item[0],
            "image_url": item[1],
            "movie_name": item[2],
            # Drop the 3-character leading label (e.g. "主演:") from the cast line.
            "stars": item[3].strip()[3:],
            # Drop the 5-character leading label (e.g. "上映时间:") from the date line.
            "show_time": item[4].strip()[5:],
            "rate": item[5] + item[6],
        }


def save_info_to_file(content, filename):
    """Append each dict in *content* to *filename*, one JSON object per line.

    content is a list of dicts, so each record is serialized with
    json.dumps; ensure_ascii=False keeps Chinese titles readable.
    """
    # 'with' guarantees the handle is closed even if a write fails.
    with open(filename, "a", encoding="utf-8") as fp:
        for record in content:
            fp.write(json.dumps(record, ensure_ascii=False) + "\n")


def main(offset):
    """Scrape one board page at *offset* and append its movies to disk."""
    url = "https://maoyan.com/board/4?offset=" + str(offset)
    html = get_one_page(url)
    if html is None:
        # Fetch failed (network error or blocked): nothing to parse.
        return
    item_list = []
    for item in parse_source_page(html):
        print(item)
        item_list.append(item)
    save_info_to_file(item_list, "movies_top.txt")


if __name__ == "__main__":
    # Multiprocess fetching is possible, but the results would not
    # necessarily arrive in rank order 1-100:
    #   spider_pool = pool.Pool()
    #   spider_pool.map(main, [i * 10 for i in range(10)])
    for i in range(10):
        main(i * 10)

抓取記錄