Python 爬取貓眼電影資訊
阿新 • • 發佈:2019-01-30
- 爬取TOP100所有電影的資訊(電影名, 主演, 上映時間)
- 爬取該電影的宣傳封面的圖片, 儲存到本地/mnt/img/目錄中;
- 將獲取的資訊, 儲存到mysql資料庫中(電影名, 主演, 上映時間, 封面圖片的本地路徑)
"""Scrape the Maoyan TOP100 board and store every film in MySQL.

For each board page the script extracts (rank, name, starring actors,
release time) with regular expressions and inserts the rows into the
``filminfo`` table.  The same job is run three ways -- sequentially, with a
thread pool and with gevent greenlets -- and the wall-clock time of each run
is printed.
"""
import re
from concurrent.futures import ThreadPoolExecutor
import time
import json
import pymysql
from gevent import monkey

# Patch the socket module so that blocking network calls cooperate with
# gevent.  This runs at import time, so every strategy below uses the
# patched sockets.
monkey.patch_socket()

import gevent
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
from urllib.request import urlopen

url = 'http://maoyan.com/board/4?offset=0'

# Markup the patterns below are written against, e.g.
#   <p class="releasetime">上映時間:1998-04-03</p>
# NOTE(review): the live site presumably serves the simplified-Chinese label
# with a full-width colon -- confirm.  The patterns accept both spellings and
# both colon forms.
_RANK_RE = re.compile(r'<i class="board-index board-index-(\d+)">')
_NAME_RE = re.compile(r'alt="(\w+·?[::]?\w*)')
# The bracket expression is a character class, so it simply means "one or
# more of: word characters, '+', '·', '?', '*', ','".
_ACTOR_RE = re.compile(r'class="star">\s*\w+[::]([\w+·?\w*·?\w*,?]+)')
_TIME_RE = re.compile(r'上映(?:時間|时间)[::](\w+-?\w*-?\w*\(?\w*\)?)')


def get_page(url, pages=10, page_size=10):
    """Return the URLs of the board pages.

    Any ``offset`` parameter already present in *url* is replaced instead of
    being appended to, so ``...?offset=0`` yields ``offset=0``, ``offset=10``,
    ... rather than ``offset=00``, ``offset=010``, ...

    :param url: board URL, with or without an ``offset`` query parameter.
    :param pages: number of pages to generate (default 10).
    :param page_size: number of films per page (default 10).
    :return: list of page URLs in ascending offset order.
    """
    parts = urlsplit(url)
    other_params = [
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if key != 'offset'
    ]
    page_li = []
    for index in range(pages):
        query = urlencode(other_params + [('offset', index * page_size)])
        page_li.append(urlunsplit(parts._replace(query=query)))
    return page_li


def load_page_content(url):
    """Download *url* and return its UTF-8 text without newlines and tabs."""
    with urlopen(url, timeout=40) as f:
        content = f.read().decode('utf-8')
    # Flatten the markup so the patterns can match across line breaks.
    return content.replace('\n', '').replace('\t', '')


def _parse_info(content):
    """Extract ``(rank, name, actors, release_time)`` tuples from page text."""
    film_rank = _RANK_RE.findall(content)
    film_name = _NAME_RE.findall(content)
    film_actor = _ACTOR_RE.findall(content)
    film_time = _TIME_RE.findall(content)

    counts = {len(film_rank), len(film_name), len(film_actor), len(film_time)}
    if len(counts) > 1:
        # zip() stops at the shortest list, and a pattern that missed one
        # film shifts every following row, so make the mismatch visible.
        print('Warning: field counts differ (rank=%d, name=%d, actor=%d, '
              'time=%d); rows may be misaligned'
              % (len(film_rank), len(film_name),
                 len(film_actor), len(film_time)))
    return list(zip(film_rank, film_name, film_actor, film_time))


def get_info(url):
    """Scrape rank, film name, starring actors and release time from a page.

    :param url: URL of one board page.
    :return: list of ``(rank, name, actors, release_time)`` string tuples.
    """
    return _parse_info(load_page_content(url))


def Mysql_reserve(url):
    """Scrape the page at *url* and insert its films into ``filminfo``.

    Errors raised while scraping or inserting are caught and printed; a
    failure to connect to the database still propagates to the caller.
    """
    conn = pymysql.connect(host='localhost', user='root', passwd='123',
                           db='westos01', charset='utf8')
    cur = conn.cursor()
    try:
        insert_sql = 'insert into filminfo values(%s,%s,%s,%s);'
        info = get_info(url)
        cur.executemany(insert_sql, info)
        conn.commit()
    except Exception as e:
        print('To lead mysql failure', e)
    else:
        print('To lead mysql success')
    finally:
        # Release the cursor and connection on every exit path.
        cur.close()
        conn.close()


def main(url):
    """Scrape and store every board page sequentially."""
    for page_url in get_page(url):
        Mysql_reserve(page_url)


def useTreading(url):
    """Scrape and store every board page with a pool of four threads."""
    page_li = get_page(url)
    with ThreadPoolExecutor(max_workers=4) as pool:
        # Consume the result iterator so that an exception raised in a
        # worker is re-raised here instead of being silently dropped.
        list(pool.map(Mysql_reserve, page_li))


def geventMain(url):
    """Scrape and store every board page with greenlets (order not fixed)."""
    page_li = get_page(url)
    gevents = [gevent.spawn(Mysql_reserve, page_url) for page_url in page_li]
    gevent.joinall(gevents)


def _timed(func, url):
    """Run ``func(url)`` and print how long it took."""
    start = time.time()
    func(url)
    end = time.time()
    print('%s run %s' % (func.__name__, end - start))


if __name__ == '__main__':
    # Each strategy scrapes and inserts the same pages again.
    for strategy in (main, useTreading, geventMain):
        _timed(strategy, url)