
Crawling Qidian novel information with CrawlSpider


As a mainstream web-fiction site, Qidian has prepared defenses against data scraping: the key numbers on a page are rendered through a custom font with its own encoding map, so those values cannot be obtained simply by parsing the page HTML.

Getting those numbers on their own is still possible: send the request with requests, use regular expressions to match the encoded characters and then the URL of the font that defines their mapping, parse the downloaded font with fontTools into a dictionary, then match the codes Qidian returns to English digit names and the English names to Arabic digits, and finally concatenate the result to get the real number string. Doing this means several extra requests per page, however, and crawl efficiency drops sharply. For this crawl the word count was therefore dropped in favour of the rating, which is easier to obtain. The rating defaults to 0 on the page and is updated from JSON data pushed by the backend.
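
For reference, here is a minimal, standalone sketch of the decoding chain just described (match the encoded characters, find the font URL, read the font's code map with fontTools, map glyph names to digits). The regular expressions and the word-to-digit table follow the commented-out get_number() kept in the spider below; treat the exact patterns as illustrative rather than guaranteed to match the current page markup.

import re
import requests
from io import BytesIO
from fontTools.ttLib import TTFont

# English glyph name -> Arabic digit
WORD_MAP = {'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
            'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9',
            'period': '.'}

def decode_word_count(html):
    # the encoded digits appear as '&#100054;&#100056;...;' entities inside a span
    encoded = re.search(r'</style><span.*?>(.*?)</span>', html, re.S).group(1)
    # the <style> block contains the URL of the obfuscation font (a .ttf hosted on qidian.gtimg.com)
    style = re.search(r'<style.*?>(.*?)\s*</style>', html, re.S).group(1)
    font_url = re.search(r"woff.*?url.*?'(.+?)'.*?truetype", style).group(1)

    # download the font and read its code map, e.g. {100054: 'four', 100056: 'two', ...}
    font = TTFont(BytesIO(requests.get(font_url).content))
    cmap = font.getBestCmap()
    font.close()

    digits = ''
    for piece in encoded.split(';')[:-1]:           # drop the empty piece after the last ';'
        digits += WORD_MAP[cmap[int(piece[2:])]]    # strip the leading '&#', look up the glyph name
    return digits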

The main implementation code:

items section:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class QdItem(scrapy.Item):
    # define the fields for your item here like:
    book_name = scrapy.Field()   # book title
    author = scrapy.Field()      # author
    state = scrapy.Field()       # status
    type = scrapy.Field()        # genre
    about = scrapy.Field()       # short introduction
    # number = scrapy.Field()    # word count
    score = scrapy.Field()       # rating
    story = scrapy.Field()       # synopsis
    news = scrapy.Field()        # latest chapter
    photo = scrapy.Field()       # cover image

spider section:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from qd.items import QdItem
import re,requests
from fontTools.ttLib import TTFont
from io import BytesIO
import time


class ReadSpider(CrawlSpider):
    name = 'read'
    # allowed_domains = ['qidian.com']
    start_urls = ['https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=1']

    rules = (
        # listing pages: follow them so every page of the index gets crawled
        Rule(LinkExtractor(allow=(r'https://www.qidian.com/all\?orderId=\&style=1\&pageSize=20\&siteid=1\&pubflag=0\&hiddenField=0\&page=(\d+)')), follow=True),
        # book detail pages: parse them with parse_item, do not follow further
        Rule(LinkExtractor(allow=r'https://book.qidian.com/info/(\d+)'), callback='parse_item', follow=False),
    )
    )

    def parse_item(self, response):
        item=QdItem()

        item['book_name'] = self.get_book_name(response)
        item['author'] = self.get_author(response)
        item['state'] = self.get_state(response)
        item['type'] = self.get_type(response)
        item['about'] = self.get_about(response)
        # item['number'] = self.get_number(response)
        item['score'] = self.get_score(response)
        item['story'] = self.get_story(response)
        item['news'] = self.get_news(response)
        item['photo'] = self.get_photo(response)

        yield item

    def get_book_name(self, response):
        book_name = response.xpath('//h1/em/text()').extract()[0]
        if len(book_name) > 0:
            book_name = book_name.strip()
        else:
            book_name = 'NULL'
        return book_name

    def get_author(self, response):
        author = response.xpath('//h1/span/a/text()').extract()[0]
        if len(author) > 0:
            author = author.strip()
        else:
            author = 'NULL'
        return author

    def get_state(self, response):
        state = response.xpath('//p[@class="tag"]/span/text()').extract()[0]
        if len(state) > 0:
            state = state.strip()
        else:
            state = 'NULL'
        return state

    def get_type(self, response):
        type = response.xpath('//p[@class="tag"]/a/text()').extract()
        if len(type) > 0:
            t = ''
            for i in type:
                t += ' ' + i
            type = t
        else:
            type = 'NULL'
        return type

    def get_about(self, response):
        about = response.xpath('//p[@class="intro"]/text()').extract()[0]
        if len(about) > 0:
            about = about.strip()
        else:
            about = 'NULL'
        return about

    # def get_number(self, response):
    #
    #     def get_font(url):      # download the font and return its code map
    #         time.sleep(2)
    #         resp = requests.get(url)
    #         font = TTFont(BytesIO(resp.content))
    #         cmap = font.getBestCmap()
    #         font.close()
    #         return cmap
    #
    #     def get_encode(cmap, values):
    #         # values looks like '𘛖𘛘𘛕𘛔𘛎𘛎', i.e. '&#...;' entities in the page source
    #         # English-word to Arabic-digit table
    #         WORD_MAP = {'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6',
    #                     'seven': '7', 'eight': '8', 'nine': '9', 'period': '.'}
    #         list = values.split(';')
    #         list.pop(-1)            # drop the empty piece after the trailing semicolon
    #         new_num = ''
    #         for num in list:
    #             value = num[2:]             # strip the leading '&#'
    #             key = cmap[int(value)]      # e.g. 'seven'
    #             new_num += WORD_MAP[key]
    #         return new_num
    #
    #     # pattern = re.compile('</style><span.*?>(.*?)</span>', re.S)      # matches the encoded digit runs
    #     # # &#100054;&#100056;&#100053;&#100052;&#100046;&#100046;
    #     # number_list = re.findall(pattern, response)       # all encoded digit strings on the page
    #     # reg = re.compile('<style.*?>(.*?)\s*</style>', re.S)             # text containing the font link
    #     # font_url = re.findall(reg, response)[0]
    #     # url = re.search('woff.*?url.*?\'(.+?)\'.*?truetype', font_url).group(1)    # URL of the current digit font
    #     # # https://qidian.gtimg.com/qd_anti_spider/xxxxx.ttf
    #     #
    #     # cmap = get_font(url)        # code map of the font
    #     # #   {100046: 'seven', 100048: 'three', 100049: 'five', 100050: 'six', 100051: 'one', 100052: 'period', 100053: 'nine', 100054: 'four', 100055: 'eight', 100056: 'two', 100057: 'zero'}
    #     #
    #     # d_num = []                  # all decoded numbers
    #     # for num in number_list:     # decode every matched element
    #     #     d_num.append(get_encode(cmap, num))
    #     # if len(d_num) > 0:
    #     #     return d_num[0] + '萬字'
    #     # else:
    #     return 'NULL'

    def get_score(self, response):

        def get_sc(id):
            urll = 'https://book.qidian.com/ajax/comment/index?_csrfToken=ziKrBzt4NggZbkfyUMDwZvGH0X0wtrO5RdEGbI9w&bookId=' + id + '&pageSize=15'
            rr = requests.get(urll)
            # print(rr)
            score = rr.text[16:19]
            return score

        bid = response.xpath('//a[@id="bookImg"]/@data-bid').extract()[0]    # book id
        if len(bid) > 0:
            score = get_sc(bid)     # fetch the rating; for integer ratings this can come back as e.g. '9,"'
            if score[1] == ',':
                score = score.replace(',"', '.0')
            else:
                score = score
        else:
            score = 'NULL'
        return score

    def get_story(self, response):
        story = response.xpath('//div[@class="book-intro"]/p/text()').extract()[0]
        if len(story) > 0:
            story = story.strip()
        else:
            story = 'NULL'
        return story

    def get_news(self, response):
        news = response.xpath('//div[@class="detail"]/p[@class="cf"]/a/text()').extract()[0]
        if len(news) > 0:
            news = news.strip()
        else:
            news = 'NULL'
        return news

    def get_photo(self, response):
        photo = response.xpath('//div[@class="book-img"]/a[@class="J-getJumpUrl"]/img/@src').extract()[0]
        if len(photo) > 0:
            photo = photo.strip()
        else:
            photo = 'NULL'
        return photo

middleware section:

# # -*- coding: utf-8 -*-
#
# # Define here the models for your spider middleware
# #
# # See documentation in:
# # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#
# from scrapy import signals
#
#
# class QdSpiderMiddleware(object):
#     # Not all methods need to be defined. If a method is not defined,
#     # scrapy acts as if the spider middleware does not modify the
#     # passed objects.
#
#     @classmethod
#     def from_crawler(cls, crawler):
#         # This method is used by Scrapy to create your spiders.
#         s = cls()
#         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
#         return s
#
#     def process_spider_input(self, response, spider):
#         # Called for each response that goes through the spider
#         # middleware and into the spider.
#
#         # Should return None or raise an exception.
#         return None
#
#     def process_spider_output(self, response, result, spider):
#         # Called with the results returned from the Spider, after
#         # it has processed the response.
#
#         # Must return an iterable of Request, dict or Item objects.
#         for i in result:
#             yield i
#
#     def process_spider_exception(self, response, exception, spider):
#         # Called when a spider or process_spider_input() method
#         # (from other spider middleware) raises an exception.
#
#         # Should return either None or an iterable of Response, dict
#         # or Item objects.
#         pass
#
#     def process_start_requests(self, start_requests, spider):
#         # Called with the start requests of the spider, and works
#         # similarly to the process_spider_output() method, except
#         # that it doesn’t have a response associated.
#
#         # Must return only requests (not items).
#         for r in start_requests:
#             yield r
#
#     def spider_opened(self, spider):
#         spider.logger.info('Spider opened: %s' % spider.name)
#
#
# class QdDownloaderMiddleware(object):
#     # Not all methods need to be defined. If a method is not defined,
#     # scrapy acts as if the downloader middleware does not modify the
#     # passed objects.
#
#     @classmethod
#     def from_crawler(cls, crawler):
#         # This method is used by Scrapy to create your spiders.
#         s = cls()
#         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
#         return s
#
#     def process_request(self, request, spider):
#         # Called for each request that goes through the downloader
#         # middleware.
#
#         # Must either:
#         # - return None: continue processing this request
#         # - or return a Response object
#         # - or return a Request object
#         # - or raise IgnoreRequest: process_exception() methods of
#         #   installed downloader middleware will be called
#         return None
#
#     def process_response(self, request, response, spider):
#         # Called with the response returned from the downloader.
#
#         # Must either;
#         # - return a Response object
#         # - return a Request object
#         # - or raise IgnoreRequest
#         return response
#
#     def process_exception(self, request, exception, spider):
#         # Called when a download handler or a process_request()
#         # (from other downloader middleware) raises an exception.
#
#         # Must either:
#         # - return None: continue processing this exception
#         # - return a Response object: stops process_exception() chain
#         # - return a Request object: stops process_exception() chain
#         pass
#
#     def spider_opened(self, spider):
#         spider.logger.info('Spider opened: %s' % spider.name)
import random, base64
from qd.settings import USER_AGENT, PROXIES


class RandomUserAgent(object):
    def process_request(self, request, spider):
        user_agent = random.choice(USER_AGENT)
        if user_agent:
            request.headers.setdefault('User-Agent', user_agent)


class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        if proxy['user_psd'] is None:       # no username/password, so no authentication needed
            request.meta['proxy'] = 'http://' + proxy['ip_port']
        else:
            # encode 'user:password' for the Proxy-Authorization header (b64encode needs bytes)
            bs64_user_psd = base64.b64encode(proxy['user_psd'].encode())
            request.meta['proxy'] = 'http://' + proxy['ip_port']
            request.headers['Proxy-Authorization'] = b'Basic ' + bs64_user_psd

pipeline section:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql,re
from scrapy.exceptions import DropItem

class QdPipeline(object):

    def __init__(self):
        self.connect = pymysql.connect(
            user='root',        # user name
            password='1234',    # password
            db='lgweb',         # database name
            host='127.0.0.1',   # host address
            port=3306,
            charset='utf8'
        )

    def table_exists(self, con, table_name):
        # check whether the table has already been created
        sql = 'show tables;'
        con.execute(sql)
        tables = [con.fetchall()]
        table_list = re.findall(r"(\'.*?\')", str(tables))
        table_list = [re.sub("'", '', each) for each in table_list]  # strip the quotes around each table name
        if table_name in table_list:
            return 1    # the table exists
        else:
            return 0    # the table does not exist

    def process_item(self, item, spider):
        conn = self.connect.cursor()    # cursor for this connection
        conn.execute('use lgweb')       # select the database
        table_name = 'db_read'          # table name

        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem('Missing %s of blogpost from %s' % (data, item['url']))
        if valid:   # the item has data, so pull the fields out
            book_name = item['book_name']
            author = item['author']
            state = item['state']
            type = item['type']
            about = item['about']
            # number = item['number']
            score = item['score']
            story = item['story']
            news = item['news']
            photo = item['photo']

        # create the table if it does not exist yet
        if self.table_exists(conn, table_name) != 1:
            sql = 'create table db_read(書名 VARCHAR (30),作者 VARCHAR (30),評分 VARCHAR (10),類型 VARCHAR (30),狀態 VARCHAR (30),簡介 VARCHAR (50),詳情 VARCHAR (1000),最新章節 VARCHAR (50),封面 VARCHAR (100))'
            conn.execute(sql)

        try:
            # insert the scraped fields
            sql = "insert into db_read(書名,作者,評分,類型,狀態,簡介,詳情,最新章節,封面) VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (
                book_name, author, score, type, state, about, story, news, photo)
            conn.execute(sql)           # run the insert
            self.connect.commit()       # commit the change
        finally:
            conn.close()

        return item

With a few simple entries in settings, the project is ready to run.
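
The settings themselves are not shown in the post. Below is a minimal sketch of what that configuration could look like, assuming the project package is named qd (the middleware above imports USER_AGENT and PROXIES from qd.settings) and the middleware file is qd/middlewares.py; the user-agent strings, proxy entries and priority values are placeholders.

# settings.py (fragment) -- a minimal sketch, sample values only

BOT_NAME = 'qd'
SPIDER_MODULES = ['qd.spiders']
NEWSPIDER_MODULE = 'qd.spiders'

ROBOTSTXT_OBEY = False      # the crawl would otherwise be blocked by robots.txt
DOWNLOAD_DELAY = 1          # be polite and reduce the chance of being banned

# pool used by RandomUserAgent
USER_AGENT = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.0 Safari/605.1.15',
]

# pool used by RandomProxy: 'ip_port' is 'ip:port', 'user_psd' is 'user:password' or None
PROXIES = [
    {'ip_port': '127.0.0.1:8888', 'user_psd': None},
    {'ip_port': '127.0.0.1:8889', 'user_psd': 'user:password'},
]

DOWNLOADER_MIDDLEWARES = {
    'qd.middlewares.RandomUserAgent': 543,
    'qd.middlewares.RandomProxy': 544,
}

ITEM_PIPELINES = {
    'qd.pipelines.QdPipeline': 300,
}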

To make debugging easier, you can add a main.py entry file outside the project package; running it has the same effect as running scrapy crawl read from the command line.

main.py:

from scrapy import cmdline
cmdline.execute('scrapy crawl read'.split())

Screenshot of the crawled data: [image]
