程式人生 > python爬蟲 電影頁面資訊 xpath csv寫入 圖片儲存到本地

python爬蟲 電影頁面資訊 xpath csv寫入 圖片儲存到本地

import re
import requests
from  lxml import etree
import time
import urllib.request
import csv
import os

# Fetch the details of a single movie
def getMoviesDetail(id,score):
    """Scrape one film detail page, append a CSV row and save its poster.

    Args:
        id: Film href taken from the listing page (e.g. ``/films/1234``);
            any ``/films/`` fragment is removed before the URL is built.
        score: Value written unchanged into the score column of the row.

    Returns:
        None. When the page cannot be parsed or lacks one of the expected
        fields, a message is printed and nothing is written or downloaded.

    Raises:
        requests.RequestException: The HTTP request failed or timed out
            (not caught here).
    """
    movies_id = re.sub(r'/films/', '', id)
    details_url = 'http://maoyan.com/films/' + movies_id
    print(details_url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    # Proxy used for plain-http requests
    proxy_addr = {
        'http': '88.146.227.253:8080'
    }
    # A timeout keeps one unresponsive proxy/server from hanging the crawl.
    response = requests.get(details_url, headers=headers, proxies=proxy_addr, timeout=10)
    html = etree.HTML(response.text)
    if html is None:
        # Guard against a parser result of None (nothing usable to query).
        print('頁面解析失敗，略過:', details_url)
        return

    brief = '//div[@class="movie-brief-container"]'
    # Movie title
    names = html.xpath(brief + '/h3/text()')
    # Combined "release date + region" text
    region_show_times = html.xpath(brief + '/ul/li[3]/text()')
    # Text containing the running time
    durations = html.xpath(brief + '/ul/li[2]/text()')
    # Poster image link
    image_urls = html.xpath('//div[@class="avatar-shadow"]/img/@src')
    if not (names and region_show_times and durations and image_urls):
        # An unexpected page layout would otherwise raise IndexError on [0].
        print('頁面缺少必要欄位，略過:', details_url)
        return

    name = names[0]
    region_showTime = region_show_times[0]
    # Runs of CJK characters: the first run is taken as the region, and
    # removing all runs from the same text leaves the release time.
    cjk_pattern = re.compile(u"[\u4e00-\u9fa5]+")
    regions = cjk_pattern.findall(region_showTime)
    minutes = re.findall(r"\d+", durations[0])
    if not regions or not minutes:
        print('頁面缺少必要欄位，略過:', details_url)
        return
    region = regions[0]
    show_time = cjk_pattern.sub("", region_showTime)
    # First run of digits in the running-time text
    duration = minutes[0]

    # Write the movie information to the CSV file
    data = [name, score, region, show_time, duration]
    writerDataTocsv(data)
    # Download the poster image to local storage
    dowloadImage(image_urls[0], name)





# Collect the movie ids from the listing page and process each one
def getMoviesId():
    """Crawl the film listing page and process every film link found on it.

    Writes the CSV header row first, then calls ``getMoviesDetail`` for each
    film href, pausing three seconds between films. Every film is passed the
    literal score ``90``; no score is extracted from the listing page here.

    Returns:
        None.

    Raises:
        requests.RequestException: The listing request failed or timed out.
    """
    url = "http://maoyan.com/films"
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    # Proxy used for plain-http requests
    proxy_addr = {
        'http': '88.146.227.253:8080'
    }
    # A timeout keeps an unresponsive proxy/server from hanging the script.
    response = requests.get(url, headers=headers, proxies=proxy_addr, timeout=10)
    content = response.text.encode('utf-8')
    html = etree.HTML(content)
    # Select the detail-page href of every film on the listing
    films_list = html.xpath('//div[@class = "movies-list"]/dl//div[@class="movie-item"]/a/@href')
    # Store the header row first
    data = ['電影名稱', '電影評分', '上映地區', '上映時間', '電影時長']
    writerDataTocsv(data)
    # Process the detail page of each film in turn
    for film_href in films_list:
        try:
            getMoviesDetail(film_href, 90)
        except (requests.RequestException, IndexError) as e:
            # A failed request or an unexpected page layout for one film
            # should not abort the remaining films.
            print('電影詳情抓取失敗，略過:', film_href, e)
        time.sleep(3)

# Append a row of data to the CSV file
def writerDataTocsv(data):
    """Append ``data`` as one row to ``movie_info.csv`` in the working directory.

    Args:
        data: Iterable of cell values making up a single CSV row.

    Returns:
        None. Write failures are reported with ``print`` instead of being
        raised, so callers keep running (best-effort write).
    """
    try:
        # newline='' lets the csv module control line endings (no blank rows
        # on Windows); explicit UTF-8 keeps the Chinese text writable
        # regardless of the platform's default encoding.
        with open('movie_info.csv', 'a', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerow(data)
    except (OSError, csv.Error, UnicodeError) as e:
        print("寫入檔案錯誤", e)

# Download an image to local storage
def dowloadImage(image_url,name):
    """Download ``image_url`` into ``moviesImage/<name>.jpg``.

    Args:
        image_url: URL of the image to fetch.
        name: Movie title used as the file name. Path separators and
            characters that are illegal in file names on common platforms
            are replaced with ``_``.

    Returns:
        None. Download and file-system failures are reported with ``print``
        instead of being raised.
    """
    file_path = 'moviesImage'
    # A raw title containing e.g. "/" or ":" would point into a missing
    # sub-directory or be rejected by the file system, so neutralise it.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(file_path, exist_ok=True)
        filename = os.path.join(file_path, safe_name + '.jpg')
        urllib.request.urlretrieve(image_url, filename=filename)
    except (OSError, ValueError) as e:
        # OSError covers file and URL errors; ValueError covers malformed URLs.
        print('檔案操作失敗',e)

# Script entry point: start the crawl as soon as this file is executed.
# There is no `__main__` guard, so importing this module runs it as well.
getMoviesId()