python 爬蟲學習三(Scrapy 實戰,豆瓣爬取電影資訊)
阿新 • • 發佈:2019-01-26
利用Scrapy爬取豆瓣電影資訊主要列出Scrapy的三部分程式碼:
spider.py檔案:
# spider.py — Scrapy spider that searches Douban for each movie name and
# scrapes the matching movie's detail page.
# (NOTE: the original post mislabeled this section as "item.py".)
# _*_ coding=utf-8 _*_
import scrapy
from course.douban_items import DouBanItem
from scrapy.http import Request


class DouBanSpider(scrapy.Spider):
    name = "DouBanSpider"
    allowed_domains = ["movie.douban.com"]
    start_urls = []

    def start_requests(self):
        """Read one movie name per line from the input file and yield a
        Douban search request for each."""
        url_head = "http://movie.douban.com/subject_search?search_text="
        # `with` guarantees the file is closed on any exit path
        # (replaces the original try/finally + explicit close()).
        with open('/Users/lucas/PycharmProjects/scrapy_learn/course/course/movie_name',
                  'r') as file_object:
            for line in file_object:
                movie_name = line.strip()
                if not movie_name:
                    continue  # skip blank lines instead of issuing a bad request
                # Bug fix: the original appended the raw line, embedding the
                # trailing "\n" into the URL; strip it first.
                self.start_urls.append(url_head + movie_name)
        for url in self.start_urls:
            # make_requests_from_url() is deprecated in modern Scrapy; build
            # the Request directly (dont_filter=True matches its behavior).
            yield Request(url, dont_filter=True)

    def parse(self, response):
        """On the search-result page, follow the first movie link."""
        url = response.xpath(
            '//*[@id="content"]/div/div[1]/div[2]/table[1]/tr/td[1]/a/@href'
        ).extract()
        if url:
            yield Request(url[0], callback=self.parse_item)

    def parse_item(self, response):
        """Extract the movie's fields from its detail page into a DouBanItem."""
        item = DouBanItem()
        item['movie_name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        item['movie_director'] = response.xpath('//*[@id="info"]/span[1]/span[2]/a/text()').extract()
        item['movie_writer'] = response.xpath('//*[@id="info"]/span[2]/span[2]/a/text()').extract()
        item['movie_roles'] = response.xpath('//*[@id="info"]/span[3]/span[2]/a/text()').extract()
        # Bug fix: the original indexed [10] unconditionally and raised
        # IndexError whenever the info block had fewer text nodes.
        language_nodes = response.xpath('//*[@id="info"]/text()').extract()
        item['movie_language'] = language_nodes[10] if len(language_nodes) > 10 else ''
        item['movie_date'] = response.xpath('//*[@id="info"]/span[11]/text()').extract()
        item['movie_long'] = response.xpath('//*[@id="info"]/span[13]/text()').extract()
        item['movie_description'] = response.xpath('//*[@id="link-report"]/span/text()').extract()
        item["movie_score"] = response.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()
        yield item
# items.py — Item definition imported by the spider as DouBanItem.
# (NOTE: the original post mislabeled this section as "pipeline.py".)
# _*_ coding=utf-8 _*_
# Removed the unused `import scrapy`; only Item and Field are referenced.
from scrapy import Item, Field


class DouBanItem(Item):
    """Container for the fields scraped from a Douban movie detail page."""
    movie_name = Field()         # movie title
    movie_director = Field()     # director name(s)
    movie_writer = Field()       # screenwriter(s)
    movie_roles = Field()        # leading actors
    movie_language = Field()     # language text from the info block
    movie_date = Field()         # release date
    movie_long = Field()         # running time
    movie_description = Field()  # plot summary
    movie_score = Field()        # Douban rating
# The main code is the three parts above. Full project download for anyone
# who wants to study it: http://download.csdn.net/detail/lb245557472/9851006
#
# pipelines.py — writes each scraped item to douban.json, one JSON object
# per line.
# _*_ coding=utf-8 _*_
# Removed the unused `from scrapy.exceptions import DropItem`.
import json


class DouBanPipeline(object):
    def __init__(self):
        # Open the output file once per spider run; explicit utf-8 so the
        # Chinese text round-trips correctly on any platform.
        self.file = open('douban.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and pass it through unchanged."""
        # ensure_ascii=False keeps Chinese characters readable in the output
        # instead of \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def open_spider(self, spider):
        # Called when the spider is opened; nothing extra to set up.
        pass

    def close_spider(self, spider):
        # Bug fix: the original left this as `pass`, so the file handle was
        # never closed and buffered lines could be lost on process exit.
        self.file.close()