pyspider 爬豆瓣電影資訊

阿新 • • 發佈：2019-01-20

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-11-23 10:50:38
# Project: doubanmovie
from pyspider.libs.base_handler import *
import pymongo,sys
from pymongo import MongoClient
import re,time,random,hashlib
from scrapy.selector import Selector
from datetime import *
class Handler(BaseHandler):
    """pyspider handler for the "doubanmovie" project.

    The methods of this class start from the Douban movie tag index, follow
    tag/list pages down to individual movie pages, and write one document per
    movie into the ``douban`` collection of the MongoDB database opened below.
    """

    # NOTE: the three statements below sit directly in the class body, so the
    # MongoDB connection and authentication run when the class is defined.
    # Connect to the local MongoDB server, preferring the primary for reads.
    client = MongoClient(host="localhost",port=27017, read_preference=pymongo.read_preferences.ReadPreference.PRIMARY_PREFERRED)
    # Database handle shared by all handler methods (used as self.db).
    db = client.edogdata
    # NOTE(review): credentials are hard-coded here (they look like
    # placeholders); consider loading them from configuration instead.
    db.authenticate("database","passwd")

    # (Unrelated advertisement link injected by the hosting page: http://www.dotdy.com/)

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the Douban movie tag index page.

        The ``@every`` decorator asks pyspider to call this entry point once
        every 24 * 60 minutes; the fetched page is handed to ``index_page``.
        """
        # Douban movie tag index.
        tag_index_url = 'http://movie.douban.com/tag/'
        self.crawl(tag_index_url, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow every tag link found on the tag index page.

        Each absolute link whose URL starts with
        ``http://www.douban.com/tag/<word characters>`` is scheduled with
        ``list_page`` as its callback.

        :param response: pyspider response for the tag index page; only
            ``response.doc`` (a PyQuery document) is used.
        """
        # Raw string so that ``\w`` reaches the regex engine untouched, and
        # escaped dots so that only the literal host name matches.
        # re.U lets ``\w`` match non-ASCII tag names.
        # Typical tags: romance, drama, Andy Lau, 1990, thriller, horror ...
        tag_url = re.compile(r"http://www\.douban\.com/tag/\w+", re.U)
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            if tag_url.match(href):
                self.crawl(href, callback=self.list_page)

    @config(age=2, priority=2)
    def list_page(self, response):
        """Follow Douban ``link2`` redirect links that point at movie pages.

        A link is followed (callback ``final_page``) when it starts with
        ``http://www.douban.com/link2/`` followed by at least one non-word
        character and the substring ``"movie"`` occurs within its first 130
        characters.

        :param response: pyspider response for a tag page; only
            ``response.doc`` (a PyQuery document) is used.
        """
        # Raw string keeps ``\W`` intact; dots are escaped so they only match
        # a literal '.'.
        redirect_url = re.compile(r"http://www\.douban\.com/link2/\W+")
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            if not redirect_url.match(href):
                continue
            # str.find returns -1 when "movie" is absent. The previous check
            # ``find(...) < 130`` therefore also accepted links that do not
            # mention "movie" at all, so require a real hit (>= 0) as well.
            position = href.find("movie")
            if 0 <= position < 130:
                self.crawl(href, callback=self.final_page)
                # Parenthesised form prints the same text on Python 2 and 3.
                print("find movie")

    def final_page(self, response):
        """Schedule movie pages found on a listing page, then paginate.

        Every absolute link starting with
        ``http://movie.douban.com/<word characters>`` is scheduled with
        ``detail_page`` as its callback. Links inside elements with
        ``class="next"`` are then scheduled with this method as callback so
        the following listing pages are processed the same way.

        :param response: pyspider response for a listing page; only
            ``response.doc`` (a PyQuery document) is used.
        """
        # Raw string keeps ``\w`` intact; dots are escaped so they only match
        # a literal '.'.
        movie_url = re.compile(r'http://movie\.douban\.com/\w+', re.U)
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            if movie_url.match(href):
                self.crawl(href, callback=self.detail_page)

        # Pagination. In the selector, '.next' means class="next" (an id
        # would be written '#next'). Anchors without an href are skipped so
        # that None is never handed to crawl().
        next_pages = [
            anchor.attr.href
            for anchor in response.doc('.next a').items()
            if anchor.attr.href
        ]
        self.crawl(next_pages, callback=self.final_page)
    #@config(priority=4)
    def detail_page(self, response):
        """Extract one movie's metadata, store it in MongoDB and return it.

        The document ``_id`` is the MD5 hex digest of the page heading text,
        so storing the same heading again replaces the earlier document
        instead of failing on a duplicate key.

        :param response: pyspider response for a movie page; ``response.url``
            and ``response.doc`` (a PyQuery document) are used.
        :return: dict with the extracted fields (url, title, rating, director
            list under the key "導演", time, release_date, actor, type,
            IMDb_Link).
        """
        now = str(datetime.now())
        heading = response.doc('HTML>BODY>DIV#wrapper>DIV#content>H1>SPAN').text()
        _id = hashlib.md5(heading.encode('utf-8')).hexdigest()
        site = response.url
        # The <title> text carries a parenthesised suffix; keep the part
        # before the first '('.
        name = response.doc("title").text().split('(')[0]
        director = [x.text() for x in response.doc('a[rel="v:directedBy"]').items()]
        release_date = [x.text() for x in response.doc('span[property="v:initialReleaseDate"]').items()]
        actor = '/'.join([x.text() for x in response.doc('a[rel="v:starring"]').items()])
        rating = [x.text() for x in response.doc('strong[property="v:average"]').items()]
        # Named ``genres`` rather than ``type`` to avoid shadowing the builtin;
        # the stored/returned key is still "type".
        genres = [x.text() for x in response.doc('span[property="v:genre"]').items()]
        source = '豆瓣'
        # The leading HTML>BODY part of this selector is optional.
        IMDb_Link = response.doc('HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-16-8.clearfix>DIV.article>DIV.indent.clearfix>DIV.subjectwrap.clearfix>DIV.subject.clearfix>DIV#info>a[rel="nofollow"]').attr.href

        record = {
            "_id": _id,
            "url": site,
            "title": name,
            "time": now,
            "director": director,
            "release_date": release_date,
            "actor": actor,
            "type": genres,
            "source": source,
            "rating": rating,
            "IMDb_Link": IMDb_Link,
        }
        # Upsert instead of insert(): insert() raises a duplicate-key error
        # when a document with this _id already exists.
        self.db.douban.update({"_id": _id}, record, upsert=True)

        return {
            "url": site,
            "title": name,
            "rating": rating,
            # This key means "director"; it previously carried the actor string.
            "導演": director,
            "time": now,
            "release_date": release_date,
            "actor": actor,
            "type": genres,
            "IMDb_Link": IMDb_Link,
        }

影音先鋒電影http://www.iskdy.com/

pyspider 爬豆瓣電影資訊

倫理片http://www.dotdy.com/

pyspider 爬豆瓣電影資訊

案例學python——案例三：豆瓣電影資訊入庫一起學爬蟲——通過爬取豆瓣電影top250學習requests庫的使用

python 爬蟲學習三（Scrapy 實戰，豆瓣爬取電影資訊）

豆瓣電影資訊爬取並儲存到excel

python爬取豆瓣電影資訊

Python爬蟲入門 | 2 爬取豆瓣電影資訊

小菜鳥的第一個爬蟲：豆瓣爬取電影資訊

爬豆瓣電影名

python爬蟲之獲取豆瓣電影資訊

python爬蟲登陸豆瓣爬豆瓣電影短評

使用BeautifulSoup方法抓取豆瓣電影資訊

python爬取豆瓣電影Top250的資訊

python爬蟲——爬取豆瓣電影top250資訊並載入到MongoDB資料庫中

【Python爬蟲】Scrapy框架運用1—爬取豆瓣電影top250的電影資訊(1)

爬取豆瓣電影排行（T250）的資訊

python爬取資料（豆瓣上TOP250的電影資訊）初學者必看！！！

Python網路爬蟲：利用正則表示式爬取豆瓣電影top250排行前10頁電影資訊

2-6-1 應用案例：爬取豆瓣 TOP250 電影資訊並存儲（版本：py3）——學習筆記

基於BeautifulSoup爬取豆瓣網上的電影資訊

scrapy爬取豆瓣電影top250

pyspider 爬豆瓣電影資訊

倫理片http://www.dotdy.com/

相關推薦