1. 程式人生 > >scrapy實戰爬取電影天堂相關資訊

scrapy實戰爬取電影天堂相關資訊

# encoding: utf-8
import scrapy

from scrapy import Selector
from scrapy import Request
from pacong.items import MovieNews, MovieChina, MovieOuMei, MovieRiHan;


class DmozSpider(scrapy.Spider):
    """Crawl movie listings and detail pages from dytt8.net / ygdy8.net.

    The crawl runs in three stages:

    * ``parse`` follows the first category links found on the home page;
    * ``parseItems`` walks a category listing, following the page selector
      and scheduling one detail request per listed movie;
    * ``parseDetail`` turns a detail page into the item class that matches
      the category title.
    """

    name = "movie"
    start_urls = [
        "http://www.dytt8.net"
    ]
    url = "http://www.dytt8.net"
    # Base URL prepended to site-relative category and detail links.
    xq_url = "http://www.ygdy8.net"

    def parse(self, response):
        """Yield a listing request for each of the first five ``<li>`` links.

        Only entries with index 0-4 are considered.  Entries without an
        ``<a>`` href or text are skipped instead of raising ``IndexError``.
        The absolute link ``http://www.ygdy8.net/html/gndy/index.html`` is
        deliberately not followed.

        Each request carries ``title`` (link text) and ``url`` (the requested
        URL) in ``meta`` for ``parseItems``.
        """
        selector = Selector(response)
        entries = selector.xpath('//ul').xpath('li')
        for index, entry in enumerate(entries):
            if index >= 5:
                # Nothing past the fifth entry is ever used.
                break

            hrefs = entry.xpath('a/@href').extract()
            titles = entry.xpath('a/text()').extract()
            if not hrefs or not titles:
                # An <li> without a usable link must not abort the crawl.
                continue
            href, title = hrefs[0], titles[0]

            if 'http://' in href:
                if href == "http://www.ygdy8.net/html/gndy/index.html":
                    continue
                target = href
            else:
                # Site-relative link: resolve against the detail host.
                target = self.xq_url + href

            yield Request(target, callback=self.parseItems,
                          meta={'title': title, 'url': target})

    def parseItems(self, response):
        """Follow listing pagination and schedule a request per listed movie.

        Reads ``title`` and ``url`` from ``response.meta``.  Every option
        value of the ``<select name="sldd">`` element is treated as a file
        name relative to the directory of the current URL and requested with
        this same callback.  Each movie table then yields a request handled
        by ``parseDetail`` with ``name``, ``time``, ``title`` and ``zonghe``
        in ``meta``.

        Tables lacking a name or link are skipped; a missing date or summary
        falls back to an empty string rather than raising ``IndexError``.
        """
        title = response.meta['title']
        current_url = response.meta['url']
        selector = Selector(response)

        # Directory part of the current URL, including the trailing slash.
        base = current_url[0:current_url.rindex('/') + 1]
        page_select = selector.xpath('//select[@name="sldd"]')
        for page in page_select.xpath('option/@value').extract():
            next_url = base + page
            yield Request(next_url, callback=self.parseItems,
                          meta={'title': title, 'url': next_url})

        content = selector.xpath('//div[@class="co_content8"]')
        # The raw page source differs from what the browser's element
        # inspector shows: the inspector displays no <td> here, but the
        # source the spider receives has one.
        for table in content.xpath('ul/td/table'):
            names = table.xpath('tr/td/b/a/text()').extract()
            links = table.xpath('tr/td/b/a/@href').extract()
            dates = table.xpath('tr/td/font/text()').extract()
            summaries = table.xpath('tr/td[@colspan="2"]/text()').extract()

            if not names or not links:
                # Without a name and a detail link there is nothing to fetch.
                continue

            yield Request(
                # The last anchor in the cell is the one that is used.
                self.xq_url + links[-1],
                callback=self.parseDetail,
                meta={
                    'name': names[-1],
                    "time": dates[0] if dates else '',
                    'title': title,
                    'zonghe': summaries[0] if summaries else '',
                },
            )

    def parseDetail(self, response):
        """Build the item for one movie detail page.

        Reads ``title``, ``name``, ``time`` and ``zonghe`` from
        ``response.meta``.  All image ``src`` values under ``div#Zoom`` are
        joined with commas; the text of the first download anchor is used as
        the download link (empty string when none is found).

        The item class is chosen by category title; pages whose title matches
        none of the four known categories yield nothing.
        """
        selector = Selector(response)
        zoom = selector.xpath('//div[@id="Zoom"]')
        images = ','.join(zoom.xpath('td/img/@src').extract())
        links = zoom.xpath('td/table/tbody/tr/td/a/text()').extract()
        # NOTE(review): the flattened original does not show whether items
        # were only emitted when a download link exists; this version emits
        # the item either way, with an empty string as the link. Confirm.
        download = links[0] if links else ''

        # One item class per category title; all four share the same fields.
        item_classes = {
            u"最新影片": MovieNews,
            u"其它電影": MovieRiHan,
            u"歐美電影": MovieOuMei,
            u"國內電影": MovieChina,
        }
        item_class = item_classes.get(response.meta['title'])
        if item_class is None:
            return

        item = item_class()
        item['movie_name'] = response.meta['name']
        item['movie_time'] = response.meta['time']
        item['movie_image'] = images
        item['movie_abstract'] = response.meta['zonghe']
        item['movie_download'] = download
        yield item

嗯,有點小基礎的同學應該不用我翻譯程式碼,下載跑一下估計就明白了。
我大概描述一下吧:首先 parse 方法爬取首頁的分類連結,我只取了 4 個分類,分別是最新、國內、歐美、日韓。然後根據分類連結爬取每個分類下的電影列表,也就是 parseItems 方法,並在裡面實現了翻頁的迴圈爬取。最後根據爬取到的電影詳情頁連結去爬取詳情頁的資料(parseDetail 方法),並存到 mongodb 資料庫裡面。最終執行下來爬取了 7000 多條電影資料。
這裡寫圖片描述
這裡寫圖片描述

沒有積分下載的同學,可以看我之前的 3 篇關於慕課網視訊的文章,裡面有我的聯絡方式,可以免費索取程式碼。