
Scraping Qidian Novels and Storing Them in a Database

The end result: every chapter of every book in the category ends up as its own row in a MySQL table. The flow is: crawl the category listing pages, follow each book to its detail page, fetch the chapter catalogue from the book's AJAX JSON endpoint, download every chapter page, and hand the finished item to a pymysql pipeline.

······················Main spider:·······································

# -*- coding: utf-8 -*-
import scrapy
import json
from qidian.items import QidianItem

class MyqidianSpider(scrapy.Spider):
    name = 'myqidian'
    allowed_domains = ['qidian.com']
    start_urls = ['http://www.qidian.com/all?chanId=21&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0']

    def parse(self, response):
        bookList = response.xpath('//ul[@class="all-img-list cf"]/li')
        for i in bookList:
            bookId = i.xpath('./div[@class="book-img-box"]/a/@data-bid').extract()[0]
            bookUrl = 'http:' + i.xpath('./div[@class="book-img-box"]/a/@href').extract()[0]
            # pass the detail-page url and bookId on to the next callback
            yield scrapy.Request(bookUrl, callback=self.get_url, meta={"bookId": bookId})
        # build the pagination requests from the listing's data-pagemax attribute
        page = int(response.xpath('//@data-pagemax').extract_first())
        for i in range(2, page + 1):
            url = "http://www.qidian.com/all?chanId=21&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page={}".format(i)
            yield scrapy.Request(url, callback=self.parse)

    def get_url(self, response):
        bookId = response.meta['bookId']
        jsonurl = 'https://book.qidian.com/ajax/book/category?_csrfToken=OFmDKzipSh4trLG5YRG79dFXcFYAEZgV0cjNceDd&bookId=' + bookId
        bookName = response.xpath('//div[@class="book-info "]/h1/em/text()').extract()[0]
        writerName = response.xpath('//div[@class="book-info "]/h1/span/a/text()').extract()[0]
        # xinxi: the book's synopsis paragraph
        xinxi = response.xpath('//div[@class="book-intro"]/p/text()').extract()[0].strip()
        meta = {
            "bookName": bookName, "writerName": writerName, "xinxi": xinxi
        }
        yield scrapy.Request(jsonurl, callback=self.get_zhangjie, meta=meta)

    def get_zhangjie(self, response):
        meta = response.meta
        bookName = meta['bookName']
        writerName = meta['writerName']
        xinxi = meta['xinxi']
        # the catalogue endpoint returns JSON; Scrapy has already downloaded it,
        # so parse response.text instead of re-fetching the url with requests
        data = json.loads(response.text)['data']
        vs = data.get('vs')          # vs: the book's volumes
        for v in vs:
            cs = v.get('cs')         # cs: the chapters in this volume
            for c in cs:
                cN = c.get('cN')     # chapter name
                cU = c.get('cU')     # chapter url fragment
                curl = 'https://read.qidian.com/chapter/' + cU
                uT = c.get('uT')     # update time
                cnt = c.get('cnt')   # word count

                meta = {
                    "bookName": bookName, "writerName": writerName, "xinxi": xinxi,
                    "cN": cN, "curl": curl, "uT": uT, "cnt": cnt
                }
                yield scrapy.Request(curl, callback=self.Lett_text, meta=meta)

    def Lett_text(self, response):
        item = QidianItem()
        meta = response.meta
        item['bookName'] = meta['bookName']
        item['writerName'] = meta['writerName']
        item['xinxi'] = meta['xinxi']
        item['cN'] = meta['cN']
        item['curl'] = meta['curl']
        item['uT'] = meta['uT']
        item['cnt'] = meta['cnt']

        textList = response.xpath('//div[@class="read-content j_readContent"]')
        for text in textList:
            # './/p' keeps the query scoped to this div; a bare '//p' would
            # match every <p> on the page
            paragraphs = text.xpath('.//p/text()').extract()[1:]
            item['text'] = ''.join(paragraphs).strip().replace('\u3000', '')
            yield item
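
For the pipeline at the bottom to actually receive items, it has to be registered in the project's settings.py, which the post never shows. A minimal sketch, assuming the default project layout implied by `from qidian.items import QidianItem`; the delay value is an assumption, not something from the original:

# settings.py (not shown in the original post; a minimal sketch)
BOT_NAME = 'qidian'

# register the MySQL pipeline defined in pipelines.py so Scrapy calls it for every item
ITEM_PIPELINES = {
    'qidian.pipelines.QidianPipeline': 300,
}

# politeness settings are assumptions; tune them for your own crawl
DOWNLOAD_DELAY = 0.5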


··············Item file:··························
import scrapy

class QidianItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    bookName = scrapy.Field()
    writerName = scrapy.Field()
    xinxi = scrapy.Field()
    cN = scrapy.Field()
    curl = scrapy.Field()
    uT = scrapy.Field()
    cnt = scrapy.Field()
    text = scrapy.Field()
················Writing to the database:··························

import pymysql
class QidianPipeline(object):
    def __init__(self):
        self.conn = None
        self.cur = None

    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='your_password',  # placeholder: use your own MySQL password
            db='pydata201806',
            charset='utf8mb4'  # utf8mb4 covers the full character range in novel text
        )
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        cols, values = zip(*item.items())
        # table and column names cannot be parameterised, so they are
        # interpolated here; the values go through execute() placeholders
        sql = "INSERT INTO `%s` (%s) VALUES (%s)" % \
              (
                  'qidianbook',
                  ','.join('`{}`'.format(c) for c in cols),
                  ','.join(['%s'] * len(values))
               )
        # mogrify() returns the query exactly as it will be sent to MySQL
        print(self.cur.mogrify(sql, values))
        self.cur.execute(sql, values)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
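
The pipeline assumes a `qidianbook` table already exists in the `pydata201806` database. The post doesn't show the schema, so the following one-off setup script is a sketch: every column type is an assumption, derived only from the fields defined in QidianItem, with LONGTEXT for the chapter body:

# create_table.py -- one-off setup; the schema is an assumption,
# reconstructed from the fields of QidianItem
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='your_password',  # placeholder
                       db='pydata201806', charset='utf8mb4')
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS `qidianbook` (
            `id`         INT AUTO_INCREMENT PRIMARY KEY,
            `bookName`   VARCHAR(255),
            `writerName` VARCHAR(255),
            `xinxi`      TEXT,          -- book synopsis
            `cN`         VARCHAR(255),  -- chapter name
            `curl`       VARCHAR(512),  -- chapter url
            `uT`         VARCHAR(64),   -- update time
            `cnt`        VARCHAR(64),   -- word count
            `text`       LONGTEXT       -- chapter body
        ) CHARACTER SET utf8mb4
    """)
conn.commit()
conn.close()

With the table in place, start the crawl with `scrapy crawl myqidian` and each chapter is written as one row.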