python爬蟲——requests抓取某電影網站top100
阿新 • • 發佈:2019-02-04
今天閒的沒事,學習了一下爬蟲方面的知識,然後用requests庫實現了抓取貓眼網站top100電影,還是挺有意思的。
最近用到python比較多,也算是加強了python的運用吧 :-)
import requests
from requests.exceptions import RequestException
import re
import json
from multiprocessing import pool # 引入程序池,多程序抓取
def get_one_page(url, timeout=10):
    """Fetch *url* and return the page HTML, or None on any failure.

    Parameters:
        url: page address to fetch.
        timeout: seconds to wait for the server (new, defaults to 10 so a
            stalled connection no longer hangs the scraper forever).

    Returns:
        The response body as text when the server answers 200, else None.
    """
    # A desktop browser user-agent; without it the site may block the request.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Only a clean 200 yields usable HTML; anything else is treated as a miss.
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Network errors, timeouts, bad URLs, etc. all collapse to None;
        # the caller is expected to handle a missing page.
        return None
def parse_source_page(html):
    """Match movie info in a Maoyan board page and yield one dict per entry.

    Parameters:
        html: page source as returned by get_one_page().

    Yields:
        dict with keys: index, image_url, movie_name, stars, show_time, rate.
    """
    # Raw strings so `\d` is a regex token, not a (now-invalid) string escape.
    # re.S lets '.' match newlines, since each <dd> entry spans several lines.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for item in pattern.findall(html):
        yield {
            "index": item[0],
            "image_url": item[1],
            "movie_name": item[2],
            # [3:] drops the leading 3-character "主演:" label.
            "stars": item[3].strip()[3:],
            # [5:] drops the leading 5-character "上映时间:" label.
            "show_time": item[4].strip()[5:],
            # The score is split into integer and fraction parts in the markup.
            "rate": item[5] + item[6],
        }
def save_info_to_file(content, filename):
    """Append each dict in *content* to *filename*, one JSON object per line.

    Parameters:
        content: list of dicts (one per movie) to serialize.
        filename: path of the output file; opened in append mode.

    ensure_ascii=False keeps Chinese text readable instead of \\uXXXX escapes.
    """
    # 'with' guarantees the handle is closed even if a write raises
    # (the original leaked the file object on error).
    with open(filename, "a", encoding="utf-8") as file:
        for item in content:
            file.write(json.dumps(item, ensure_ascii=False) + "\n")
def main(offset):
    """Scrape one page of the top-100 board and append the results to disk.

    Parameters:
        offset: paging offset passed to the site (0, 10, ..., 90 —
            each page lists 10 movies).
    """
    url = "https://maoyan.com/board/4?offset=" + str(offset)
    html = get_one_page(url)
    # get_one_page returns None on any network/HTTP failure; skip the page
    # instead of crashing inside the regex parser (original bug: no guard).
    if html is None:
        return
    item_list = []
    for item in parse_source_page(html):
        print(item)
        item_list.append(item)
    save_info_to_file(item_list, "movies_top.txt")
if __name__ == "__main__":
    # A process pool would fetch faster, but the results would no longer
    # arrive in rank order 1-100:
    # spider_pool=pool.Pool()
    # spider_pool.map(main,[i*10 for i in range(10)])
    for offset in range(0, 100, 10):
        main(offset)