
Scraping Maoyan movie information with Python

  1. Scrape the info for all TOP100 films (film name, lead actors, release date);
  2. Download each film's promotional poster and save it under the local /mnt/img/ directory (a minimal sketch follows this list);
  3. Store the collected info (film name, lead actors, release date, local path of the poster) in a MySQL database.
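
Step 2 (saving the posters) is not implemented in the script below. A minimal sketch of what it could look like, assuming the poster URL has already been extracted from the page; the helper name `save_poster` and the `.jpg` extension are assumptions, not part of the original post:

import os
from urllib.request import urlopen, Request

def save_poster(img_url, film_name, img_dir='/mnt/img/'):
    """Download one poster image and return its local path."""
    os.makedirs(img_dir, exist_ok=True)
    local_path = os.path.join(img_dir, film_name + '.jpg')
    # A browser-like User-Agent is assumed to be needed, as for the pages themselves.
    req = Request(img_url, headers={'User-Agent': 'Mozilla/5.0'})
    with urlopen(req, timeout=40) as resp, open(local_path, 'wb') as f:
        f.write(resp.read())
    return local_path

The full script: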
import re
import time
import pymysql
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen, Request

# gevent's monkey patch makes the standard socket module cooperative, so
# blocking network I/O in urlopen yields control to other greenlets.
from gevent import monkey
monkey.patch_socket()
import gevent

# Sample of the markup the regexes below target:
# <p class="releasetime">上映時間:1998-04-03</p>

# Base URL ends with 'offset=' so get_page can append the numeric offset.
url = 'http://maoyan.com/board/4?offset='


def get_page(url):
    """Build the 10 paging URLs of the TOP100 board (offset 0, 10, ..., 90)."""
    return [url + str(i * 10) for i in range(10)]
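# For example, get_page(url) with the base URL above yields:
#   ['http://maoyan.com/board/4?offset=0', 'http://maoyan.com/board/4?offset=10',
#    ..., 'http://maoyan.com/board/4?offset=90']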

def load_page_content(url):
    """Download a page and flatten it so the regexes can match across lines."""
    # Maoyan may reject clients without a browser-like User-Agent (a common
    # anti-crawler measure), so one is set here as a precaution.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urlopen(req, timeout=40) as f:
        content = f.read().decode('utf-8')
        return content.replace('\n', '').replace('\t', '')

def get_info(url):
    """Extract each film's rank, name, lead actors and release date from one page."""
    content = load_page_content(url)
    # rank, e.g. <i class="board-index board-index-1">
    pattern_rank = r'<i class="board-index board-index-(\d+)">'
    film_rank = re.findall(pattern_rank, content)
    # film name (taken from the poster's alt attribute)
    pattern_name = r'alt="(\w+·?:?\w*)'
    film_name = re.findall(pattern_name, content)
    # lead actors (the label text on the page is Chinese, e.g. "主演:")
    pattern_actor = r'class="star">\s*\w+:([\w+·?\w*·?\w*,?]+)'
    film_actor = re.findall(pattern_actor, content)
    # release date ("上映時間:" is the literal label on the page)
    pattern_time = r'上映時間:(\w+-?\w*-?\w*\(?\w*\)?)'
    film_time = re.findall(pattern_time, content)
    return list(zip(film_rank, film_name, film_actor, film_time))
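# The result is one tuple per film; illustrative shape only (placeholder
# values, not real data):
#   [('1', 'FilmName', 'ActorA,ActorB', '1993-01-01'), ...]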


# Save the scraped info into the MySQL database
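# The insert below assumes a `filminfo` table already exists in westos01.
# The original post never shows the DDL; one plausible schema (an assumption):
#
#   CREATE TABLE filminfo (
#       `rank` VARCHAR(5),
#       name   VARCHAR(100),
#       actor  VARCHAR(300),
#       rtime  VARCHAR(50)
#   ) DEFAULT CHARSET=utf8;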
def save_to_mysql(url):
    """Scrape one page and insert its film records into the filminfo table."""
    conn = pymysql.connect(host='localhost', user='root', passwd='123',
                           db='westos01', charset='utf8')
    cur = conn.cursor()
    try:
        insert_sql = 'insert into filminfo values(%s, %s, %s, %s);'
        info = get_info(url)
        cur.executemany(insert_sql, info)
        conn.commit()
    except Exception as e:
        print('MySQL insert failed:', e)
    else:
        print('MySQL insert succeeded')
    cur.close()
    conn.close()
    
def main(url):
    """Crawl the 10 pages sequentially."""
    for page_url in get_page(url):
        save_to_mysql(page_url)


# Crawl using a thread pool
def useThreading(url):
    page_li = get_page(url)
    # map() distributes the page URLs across 4 worker threads
    with ThreadPoolExecutor(max_workers=4) as pool:
        pool.map(save_to_mysql, page_li)

# Crawl using gevent coroutines; completion order is not deterministic
def geventMain(url):
    page_li = get_page(url)
    gevents = [gevent.spawn(save_to_mysql, page_url) for page_url in page_li]
    gevent.joinall(gevents)

# Timing comparison. Note that running all three crawls below inserts the
# same 100 rows into filminfo three times.
start = time.time()
main(url)
end = time.time()
print('%s run %s' % (main.__name__, end - start))


start = time.time()
useThreading(url)
end = time.time()
print('%s run %s' % (useThreading.__name__, end - start))

start = time.time()
geventMain(url)
end = time.time()
print('%s run %s' % (geventMain.__name__, end - start))