1. 程式人生 > >python 爬蟲學習三(Scrapy 實戰,豆瓣爬取電影資訊)

python 爬蟲學習三(Scrapy 實戰,豆瓣爬取電影資訊)

利用Scrapy爬取豆瓣電影資訊,以下主要列出Scrapy的三部分程式碼:

spider.py檔案:

# _*_ coding=utf-8 _*_
import scrapy
from course.douban_items import DouBanItem
from scrapy.http import Request


class DouBanSpider(scrapy.Spider):
    """Search Douban for each movie name in a text file and scrape its page.

    For every name read from ``movie_file`` a search request is issued;
    ``parse`` follows the first search result and ``parse_item`` turns the
    movie detail page into a ``DouBanItem``.
    """

    name = "DouBanSpider"
    allowed_domains = ["movie.douban.com"]
    start_urls = []
    # Text file holding one movie name per line. Being a plain attribute, it
    # can be overridden per run with a spider argument, e.g.
    # ``scrapy crawl DouBanSpider -a movie_file=/path/to/names``.
    movie_file = '/Users/lucas/PycharmProjects/scrapy_learn/course/course/movie_name'
    # Prefix of the search URL; the movie name is appended to it.
    search_url = "http://movie.douban.com/subject_search?search_text="

    def start_requests(self):
        """Yield one search request per non-empty line of ``movie_file``.

        Raises:
            IOError/OSError: if ``movie_file`` cannot be opened.
        """
        # Read everything up front so the file is closed before the first
        # request is yielded instead of staying open while the crawl runs.
        with open(self.movie_file, 'r') as file_object:
            movie_names = [line.strip() for line in file_object]

        for movie_name in movie_names:
            # Skip blank lines; strip() above already removed the trailing
            # newline that would otherwise end up inside the URL.
            if not movie_name:
                continue
            # dont_filter=True keeps the behaviour of the deprecated
            # make_requests_from_url() helper. No callback is given, so the
            # response is handled by parse().
            yield Request(self.search_url + movie_name, dont_filter=True)

    def parse(self, response):
        """Follow the first result of a search page, if there is one."""
        result_links = response.xpath(
            '//*[@id="content"]/div/div[1]/div[2]/table[1]/tr/td[1]/a/@href'
        ).extract()
        if result_links:
            # urljoin() leaves absolute links untouched and resolves relative
            # ones, which Request would otherwise reject.
            yield Request(response.urljoin(result_links[0]),
                          callback=self.parse_item)

    def parse_item(self, response):
        """Build a ``DouBanItem`` from a movie detail page.

        All fields are lists of extracted text nodes, except
        ``movie_language`` which is a single string ('' when missing).
        """
        item = DouBanItem()
        item['movie_name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        item['movie_director'] = response.xpath('//*[@id="info"]/span[1]/span[2]/a/text()').extract()
        item['movie_writer'] = response.xpath('//*[@id="info"]/span[2]/span[2]/a/text()').extract()
        item['movie_roles'] = response.xpath('//*[@id="info"]/span[3]/span[2]/a/text()').extract()
        # The language is taken from the 11th bare text node of #info.
        # NOTE(review): this position is layout-dependent -- confirm against
        # current pages. Guard the lookup so a shorter info block yields an
        # empty value instead of an IndexError that drops the whole item.
        info_text = response.xpath('//*[@id="info"]/text()').extract()
        item['movie_language'] = info_text[10] if len(info_text) > 10 else ''
        item['movie_date'] = response.xpath('//*[@id="info"]/span[11]/text()').extract()
        item['movie_long'] = response.xpath('//*[@id="info"]/span[13]/text()').extract()
        item['movie_description'] = response.xpath('//*[@id="link-report"]/span/text()').extract()
        item["movie_score"] = response.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()
        yield item
item.py檔案:
# _*_ coding=utf-8 _*_
import scrapy
from scrapy import Item, Field


class DouBanItem(scrapy.Item):
    """Fields collected for a single movie."""

    # Title and credits.
    movie_name = scrapy.Field()
    movie_director = scrapy.Field()
    movie_writer = scrapy.Field()
    movie_roles = scrapy.Field()

    # Release details.
    movie_language = scrapy.Field()
    movie_date = scrapy.Field()
    movie_long = scrapy.Field()

    # Synopsis and rating.
    movie_description = scrapy.Field()
    movie_score = scrapy.Field()
pipeline.py檔案:
# _*_ coding=utf-8 _*_
from scrapy.exceptions import DropItem
import json


class DouBanPipeline(object):
    """Item pipeline that writes every item to ``douban.json``, one JSON
    object per line."""

    def __init__(self):
        # Open the output file in the current working directory; mode 'w'
        # truncates any file left over from a previous run.
        self.file = open('douban.json', 'w')

    def process_item(self, item, spider):
        """Serialize ``item`` as one JSON line and pass it on unchanged.

        Args:
            item: the scraped item; anything ``dict()`` accepts.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        # Convert the item to a plain dict so json can serialize it.
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

    def open_spider(self, spider):
        """Called when the spider is opened; nothing to do because the file
        is already opened in ``__init__``."""
        pass

    def close_spider(self, spider):
        """Called when the spider is closed; flush and release the file."""
        # Without this the handle leaks and buffered lines are only written
        # when the interpreter happens to finalize the file object.
        self.file.close()
主要程式碼就是以上三部分。附上我的程式碼下載地址,需要的朋友可以下載看看,共同學習:http://download.csdn.net/detail/lb245557472/9851006