
Crawling every page of Maoyan Movies with the Scrapy framework and writing the results to a database

Use the Scrapy framework to crawl Maoyan Movies, following the pagination links so that every listing page is fetched.
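
The spider imports JobmaoyanItem from jobmaoyan.items. The original post does not show items.py, but based on the fields the spider fills in, a minimal definition would look roughly like this (a sketch, with field names taken from the spider code below):

import scrapy

class JobmaoyanItem(scrapy.Item):
    # Fields filled in on the listing page
    title = scrapy.Field()
    imgurl = scrapy.Field()
    types = scrapy.Field()
    # Fields filled in on the detail page
    d_type = scrapy.Field()
    d_country = scrapy.Field()
    d_stime = scrapy.Field()
    d_content = scrapy.Field()
    comment1 = scrapy.Field()
    comment2 = scrapy.Field()
    comment3 = scrapy.Field()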

import re

import scrapy
from jobmaoyan.items import JobmaoyanItem
class MaoyanSpider(scrapy.Spider):
    name = 'maoyan_spider'
    allowed_domains = ['maoyan.com']
    start_urls = ['http://maoyan.com/films?showType=3']

    # Listing-page URLs already scheduled, so each page is requested only once
    page_set = set()
    def parse(self, response):

        # Each <dd> node on the listing page corresponds to one movie
        datalist = response.xpath("//dd")
        for data in datalist:
            item = JobmaoyanItem()
            item['title'] = data.xpath("div[@class='channel-detail movie-item-title']/a/text()").extract()[0]
            item['imgurl'] = data.xpath("div[@class='movie-item']/a[@target='_blank']/div[@class='movie-poster']/img[2]/@data-src").extract()[0]

            # The type filter currently selected on the listing page
            item['types'] = response.xpath("//li[@class='tags-line']/ul[@class='tags']/li[@class='active']/a[starts-with(@href,'javascript')]/text()").extract()[0]
            datail_url = "http://maoyan.com" + data.xpath("div[@class='movie-item']/a/@href").extract()[0]

            yield scrapy.Request(url=datail_url, callback=self.parse_detail, meta={"data": item})

        # Collect the pagination links and follow them recursively to reach every page
        pageurls = response.xpath("//a[starts-with(@href,'?showType=3&offset=')]/@href").extract()
        for pageurl in pageurls:
            if pageurl not in self.page_set:
                self.page_set.add(pageurl)
                purl = 'http://maoyan.com/films' + pageurl
                yield scrapy.Request(url=purl, callback=self.parse)

    def parse_detail(self,response):

        # Retrieve the item passed along from parse() and fill in the detail-page fields
        item = response.meta['data']
        item["d_type"] = response.xpath("//div[@class='movie-brief-container']/ul/li[1]/text()").extract()[0]
        item["d_country"] = response.xpath("//div[@class='movie-brief-container']/ul/li[2]/text()").extract()[0]
        # Strip the whitespace and newlines Maoyan puts around the country string
        item['d_country'] = re.sub(r"\s", "", item['d_country'])
        item["d_stime"] = response.xpath("//div[@class='movie-brief-container']/ul/li[3]/text()").extract()[0]
        item["d_content"]=response.xpath("//div[@class='mod-content']/span[@class ='dra']/text()").extract()[0]
        item["comment1"]=response.xpath("//div[@class='comment-list-container']/ul/li[1]/div[@class='main']/div[@class='comment-content']/text()").extract()[0]
        item["comment2"]=response.xpath("//div[@class='comment-list-container']/ul/li[2]/div[@class='main']/div[@class='comment-content']/text()").extract()[0]
        item["comment3"]=response.xpath("//div[@class='comment-list-container']/ul/li[3]/div[@class='main']/div[@class='comment-content']/text()").extract()[0]

        yield item
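
With the spider in place, the crawl is started from the project root with the standard Scrapy command (assuming the project is named jobmaoyan, as in the imports above):

scrapy crawl maoyan_spider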

To write the scraped items to MySQL, put the pipeline in a pipelines_mysql.py file and enable it in settings.py by uncommenting its entry in ITEM_PIPELINES:

ITEM_PIPELINES = {
   # 'jobmaoyan.pipelines.JobmaoyanPipeline': 300,
   # 'jobmaoyan.pipelines_txt.JobmaoyanPipeline': 300,
   'jobmaoyan.pipelines_mysql.JobmaoyanPipeline': 300,
   # 'jobmaoyan.pipelines_json.JobmaoyanPipeline': 300,
   # 'jobmaoyan.pipelines_xls.JobmaoyanPipeline': 300,
   # 'jobmaoyan.pipelines_mongdb.JobmaoyanPipeline': 300,
}
The pipelines_mysql.py file itself:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

class JobmaoyanPipeline(object):
    def process_item(self, item, spider):
        # Note: this opens a new connection for every item; see the refinement below
        # (utf8mb4 so the Chinese titles and comments are stored correctly)
        db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                             database='jobbole', charset='utf8mb4')
        cursor = db.cursor()
        create_sql = ("create table if not exists catmovies("
                      "id int primary key auto_increment, types text, title text, imgurl text,"
                      "type text, country text, stime text, content text,"
                      "comment1 text, comment2 text, comment3 text)")
        cursor.execute(create_sql)
        # Parameterized insert so quotes inside titles or comments cannot break the SQL
        insert_sql = "insert into catmovies values(0,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        params = (item['types'], item['title'], item['imgurl'], item['d_type'],
                  item['d_country'], item['d_stime'], item['d_content'],
                  item['comment1'], item['comment2'], item['comment3'])
        try:
            cursor.execute(insert_sql, params)
            db.commit()
        except pymysql.MySQLError:
            db.rollback()

        cursor.close()
        db.close()

        return item
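
Opening a new connection for every item works, but it gets slow once the crawl covers many pages. A common refinement (a sketch, not part of the original post) is to open the connection once in open_spider and close it in close_spider, so process_item only has to run the insert:

import pymysql

class JobmaoyanPipeline(object):
    def open_spider(self, spider):
        # One connection and one table-creation statement for the whole crawl
        self.db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                                  database='jobbole', charset='utf8mb4')
        self.cursor = self.db.cursor()
        self.cursor.execute(
            "create table if not exists catmovies("
            "id int primary key auto_increment, types text, title text, imgurl text,"
            "type text, country text, stime text, content text,"
            "comment1 text, comment2 text, comment3 text)")

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()

    def process_item(self, item, spider):
        insert_sql = "insert into catmovies values(0,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        params = (item['types'], item['title'], item['imgurl'], item['d_type'],
                  item['d_country'], item['d_stime'], item['d_content'],
                  item['comment1'], item['comment2'], item['comment3'])
        try:
            self.cursor.execute(insert_sql, params)
            self.db.commit()
        except pymysql.MySQLError:
            self.db.rollback()
        return item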