用crawl spider爬取起點網小說信息
阿新 • • 發佈:2019-03-28
models anti arc pub work 全部 see 效率 rand
起點作為主流的小說網站,在防止數據采集方面還是做了準備的:其對主要的數字采用了自定義的編碼映射取值,想直接通過頁面來獲取這些數字是無法實現的。
單獨獲取這些數字還是可以實現的:先通過requests發送請求,用正則匹配出數字字符元素,再匹配出其映射關系(字體文件)的url;獲取到的字體數據通過fontTools工具解析成字典格式後做編碼匹配——起點返回的編碼對應英文數字,英文數字再對應阿拉伯數字——最後拼接,得到實際的數字字符串。但這樣需要多次發送請求,爬取效率會大大降低。因此本次集中爬取捨棄了字數的爬取,選擇了較容易獲取的評分數字。評分值在頁面中默認為0,實際值是從後臺推送的js數據裏取值更新的。
實現的主要代碼:
items部分:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Field
class QdItem(scrapy.Item):
    """Container for the fields scraped from one book detail page."""

    book_name = Field()  # book title
    author = Field()  # author
    state = Field()  # status
    type = Field()  # genre / category
    about = Field()  # short introduction
    # number = Field()  # word count (disabled)
    score = Field()  # rating
    story = Field()  # story description
    news = Field()  # latest chapter
    photo = Field()  # cover image
spider部分:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from qd.items import QdItem
import re,requests
from fontTools.ttLib import TTFont
from io import BytesIO
import time
class ReadSpider(CrawlSpider):
    """Crawl the qidian.com "all books" listing and scrape every book detail page.

    Listing (pagination) pages are followed recursively; each book detail page
    is passed to :meth:`parse_item`, which yields one ``QdItem``.
    """

    name = 'read'
    # allowed_domains = ['qidian.com']
    start_urls = ['https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=1']

    rules = (
        # Listing pages: follow the pagination links, nothing to scrape here.
        Rule(LinkExtractor(allow=(r'https://www.qidian.com/all\?orderId=\&style=1\&pageSize=20\&siteid=1\&pubflag=0\&hiddenField=0\&page=(\d+)')), follow=True),
        # Detail pages: scrape them, but do not follow their links.
        Rule(LinkExtractor(allow=r'https://book.qidian.com/info/(\d+)'), callback='parse_item', follow=False),
    )

    # Placeholder stored whenever a value cannot be extracted.
    _MISSING = 'NULL'
    # Seconds to wait for the rating endpoint before giving up.
    _SCORE_TIMEOUT = 10

    def parse_item(self, response):
        """Build one ``QdItem`` from a book detail page and yield it."""
        item = QdItem()
        item['book_name'] = self.get_book_name(response)
        item['author'] = self.get_author(response)
        item['state'] = self.get_state(response)
        item['type'] = self.get_type(response)
        item['about'] = self.get_about(response)
        # item['number'] = self.get_number(response)
        item['score'] = self.get_score(response)
        item['story'] = self.get_story(response)
        item['news'] = self.get_news(response)
        item['photo'] = self.get_photo(response)
        yield item

    def _first_text(self, response, xpath):
        """Return the stripped first match of ``xpath``, or ``'NULL'`` if absent/blank.

        ``extract_first()`` returns ``None`` when nothing matches, so a page
        without the node yields the placeholder instead of raising IndexError
        (which ``extract()[0]`` would do).
        """
        value = response.xpath(xpath).extract_first()
        if value is None:
            return self._MISSING
        value = value.strip()
        return value if value else self._MISSING

    def get_book_name(self, response):
        """Return the book title, or ``'NULL'``."""
        return self._first_text(response, '//h1/em/text()')

    def get_author(self, response):
        """Return the author name, or ``'NULL'``."""
        return self._first_text(response, '//h1/span/a/text()')

    def get_state(self, response):
        """Return the text of the first status tag, or ``'NULL'``."""
        return self._first_text(response, '//p[@class="tag"]/span/text()')

    def get_type(self, response):
        """Return all category tags joined by single spaces, or ``'NULL'``."""
        labels = response.xpath('//p[@class="tag"]/a/text()').extract()
        # Blank entries are dropped so the result has no leading or doubled spaces.
        labels = [label.strip() for label in labels if label.strip()]
        return ' '.join(labels) if labels else self._MISSING

    def get_about(self, response):
        """Return the one-line introduction, or ``'NULL'``."""
        return self._first_text(response, '//p[@class="intro"]/text()')

    # -- Disabled: word-count extraction ------------------------------------
    # The word count is drawn with a custom font fetched by URL, so decoding it
    # needs an extra font download per book. Kept commented out for reference.
    #
    # def get_number(self, response):
    #
    #     def get_font(url):  # download the font and return its code point -> glyph name map
    #         time.sleep(2)
    #         resp = requests.get(url)
    #         font = TTFont(BytesIO(resp.content))
    #         cmap = font.getBestCmap()
    #         font.close()
    #         return cmap
    #
    #     def get_encode(cmap, values):
    #         # values looks like '&#100054;&#100056;&#100053;&#100052;&#100046;&#100046;'
    #         # glyph name -> digit table
    #         WORD_MAP = {'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6',
    #                     'seven': '7', 'eight': '8', 'nine': '9', 'period': '.'}
    #         list = values.split(';')
    #         list.pop(-1)  # drop the empty piece after the trailing ';'
    #         new_num = ''
    #         for num in list:
    #             value = num[2:]
    #             key = cmap[int(value)]
    #             new_num += WORD_MAP[key]
    #         return new_num
    #
    #     # pattern = re.compile('</style><span.*?>(.*?)</span>', re.S)  # matches the obfuscated digit spans
    #     # # e.g. &#100054;&#100056;&#100053;&#100052;&#100046;&#100046;
    #     # number_list = re.findall(pattern, response)
    #     # # all obfuscated digit strings on the page
    #     # reg = re.compile('<style.*?>(.*?)\s*</style>', re.S)  # style text containing the font link
    #     # font_url = re.findall(reg, response)[0]
    #     # url = re.search('woff.*?url.*?\'(.+?)\'.*?truetype', font_url).group(1)  # URL of the current font file
    #     # # https://qidian.gtimg.com/qd_anti_spider/xxxxx.ttf
    #     #
    #     # cmap = get_font(url)  # code point -> glyph name map
    #     # # {100046: 'seven', 100048: 'three', 100049: 'five', 100050: 'six', 100051: 'one', 100052: 'period', 100053: 'nine', 100054: 'four', 100055: 'eight', 100056: 'two', 100057: 'zero'}
    #     #
    #     # d_num = []  # decoded numbers are appended here
    #     # for num in number_list:
    #     #     d_num.append(get_encode(cmap, num))
    #     # if len(d_num) > 0:
    #     #     return d_num[0] + '萬字'
    #     # else:
    #     return 'NULL'

    def get_score(self, response):
        """Return the book rating as a one-decimal string (e.g. ``'9.0'``), or ``'NULL'``.

        The rating is not in the page HTML; it is fetched from the comment
        endpoint using the book id found on the cover link.

        NOTE(review): ``requests.get`` is synchronous and stalls Scrapy's
        event loop while it waits. It is kept so that this method still
        returns the score directly; the timeout bounds the stall.
        """
        book_id = response.xpath('//a[@id="bookImg"]/@data-bid').extract_first()
        if not book_id:
            return self._MISSING

        url = 'https://book.qidian.com/ajax/comment/index?_csrfToken=ziKrBzt4NggZbkfyUMDwZvGH0X0wtrO5RdEGbI9w&bookId=' + book_id + '&pageSize=15'
        try:
            reply = requests.get(url, timeout=self._SCORE_TIMEOUT)
            reply.raise_for_status()
            # The previous implementation read characters 16-18 of the body,
            # i.e. the value right after '{"data":{"rate":'. Parsing the JSON
            # reads the same value but also handles integers and 10.
            # NOTE(review): payload shape inferred from that offset - confirm.
            rate = reply.json()['data']['rate']
            return '%.1f' % float(rate)
        except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
            self.logger.warning('Could not fetch score for book %s: %s', book_id, exc)
            return self._MISSING

    def get_story(self, response):
        """Return the first paragraph text of the book description, or ``'NULL'``."""
        return self._first_text(response, '//div[@class="book-intro"]/p/text()')

    def get_news(self, response):
        """Return the latest chapter title, or ``'NULL'``."""
        return self._first_text(response, '//div[@class="detail"]/p[@class="cf"]/a/text()')

    def get_photo(self, response):
        """Return the cover image URL, or ``'NULL'``."""
        return self._first_text(response, '//div[@class="book-img"]/a[@class="J-getJumpUrl"]/img/@src')
middlewares 中間件部分:
# # -*- coding: utf-8 -*-
#
# # Define here the models for your spider middleware
# #
# # See documentation in:
# # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#
# from scrapy import signals
#
#
# class QdSpiderMiddleware(object):
# # Not all methods need to be defined. If a method is not defined,
# # scrapy acts as if the spider middleware does not modify the
# # passed objects.
#
# @classmethod
# def from_crawler(cls, crawler):
# # This method is used by Scrapy to create your spiders.
# s = cls()
# crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
# return s
#
# def process_spider_input(self, response, spider):
# # Called for each response that goes through the spider
# # middleware and into the spider.
#
# # Should return None or raise an exception.
# return None
#
# def process_spider_output(self, response, result, spider):
# # Called with the results returned from the Spider, after
# # it has processed the response.
#
# # Must return an iterable of Request, dict or Item objects.
# for i in result:
# yield i
#
# def process_spider_exception(self, response, exception, spider):
# # Called when a spider or process_spider_input() method
# # (from other spider middleware) raises an exception.
#
# # Should return either None or an iterable of Response, dict
# # or Item objects.
# pass
#
# def process_start_requests(self, start_requests, spider):
# # Called with the start requests of the spider, and works
# # similarly to the process_spider_output() method, except
# # that it doesn’t have a response associated.
#
# # Must return only requests (not items).
# for r in start_requests:
# yield r
#
# def spider_opened(self, spider):
# spider.logger.info(‘Spider opened: %s‘ % spider.name)
#
#
# class QdDownloaderMiddleware(object):
# # Not all methods need to be defined. If a method is not defined,
# # scrapy acts as if the downloader middleware does not modify the
# # passed objects.
#
# @classmethod
# def from_crawler(cls, crawler):
# # This method is used by Scrapy to create your spiders.
# s = cls()
# crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
# return s
#
# def process_request(self, request, spider):
# # Called for each request that goes through the downloader
# # middleware.
#
# # Must either:
# # - return None: continue processing this request
# # - or return a Response object
# # - or return a Request object
# # - or raise IgnoreRequest: process_exception() methods of
# # installed downloader middleware will be called
# return None
#
# def process_response(self, request, response, spider):
# # Called with the response returned from the downloader.
#
# # Must either;
# # - return a Response object
# # - return a Request object
# # - or raise IgnoreRequest
# return response
#
# def process_exception(self, request, exception, spider):
# # Called when a download handler or a process_request()
# # (from other downloader middleware) raises an exception.
#
# # Must either:
# # - return None: continue processing this exception
# # - return a Response object: stops process_exception() chain
# # - return a Request object: stops process_exception() chain
# pass
#
# def spider_opened(self, spider):
# spider.logger.info(‘Spider opened: %s‘ % spider.name)
import random,base64
from qd.settings import USER_AGENT,PROXIES
class RandomUserAgent(object):
    """Downloader middleware that picks a random User-Agent for each request."""

    def process_request(self, request, spider):
        """Set a randomly chosen User-Agent header unless one is already set."""
        chosen = random.choice(USER_AGENT)
        if not chosen:
            return
        # setdefault leaves an existing User-Agent header untouched.
        request.headers.setdefault("User-Agent", chosen)
class RandomProxy(object):
    """Downloader middleware that routes each request through a random proxy."""

    def process_request(self, request, spider):
        """Attach a random proxy from ``PROXIES`` and, if needed, its credentials.

        Each entry is read as a mapping with ``'ip_port'`` and ``'user_psd'``
        keys; a ``None`` or empty ``'user_psd'`` means no authentication.
        """
        proxy = random.choice(PROXIES)
        request.meta['proxy'] = 'http://' + proxy['ip_port']

        credentials = proxy['user_psd']
        if not credentials:  # no user name/password: nothing to authenticate
            return

        # b64encode only accepts bytes and returns bytes, so encode the input
        # and decode the result before joining it to the 'Basic ' prefix.
        if not isinstance(credentials, bytes):
            credentials = credentials.encode('utf-8')
        token = base64.b64encode(credentials).decode('ascii')
        request.headers['Proxy-Authorization'] = 'Basic ' + token
pipeline管道部分:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don‘t forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql,re
from scrapy.exceptions import DropItem
class QdPipeline(object):
    """Item pipeline that stores scraped books in the MySQL table ``db_read``."""

    # Item fields, in the column order used by the INSERT statement.
    _INSERT_FIELDS = ('book_name', 'author', 'score', 'type', 'state',
                      'about', 'story', 'news', 'photo')

    def __init__(self):
        # NOTE(review): connection settings and credentials are hard-coded;
        # consider loading them from the project settings.
        self.connect = pymysql.connect(
            user='root',  # user name
            password='1234',  # password
            db='lgweb',  # database name
            host='127.0.0.1',  # host
            port=3306,
            charset='utf8'
        )
        self._table_ready = False

    def close_spider(self, spider):
        """Close the database connection when the spider finishes."""
        self.connect.close()

    def table_exists(self, con, table_name):
        """Return 1 if ``table_name`` exists in the current database, else 0.

        :param con: an open cursor (rows are read as sequences, name first)
        :param table_name: exact table name to look for
        """
        con.execute('show tables;')
        existing = {row[0] for row in con.fetchall()}
        return 1 if table_name in existing else 0

    def process_item(self, item, spider):
        """Validate ``item`` and insert it into ``db_read``.

        :raises DropItem: if any stored field is missing or empty
        :returns: the item, unchanged
        """
        # Check the field *values*; iterating the item itself only yields the
        # field names, which are never empty.
        missing = [field for field in self._INSERT_FIELDS if not item.get(field)]
        if missing:
            raise DropItem('Missing %s of book %s' % (', '.join(missing), item.get('book_name')))

        table_name = 'db_read'  # target table
        # The connection already selects the database, so no 'use' statement.
        cursor = self.connect.cursor()
        try:
            # Only look the table up until it is known to exist.
            if not getattr(self, '_table_ready', False):
                if self.table_exists(cursor, table_name) != 1:
                    sql = 'create table db_read(書名 VARCHAR (30),作者 VARCHAR (30),評分 VARCHAR (10),類型 VARCHAR (30),狀態 VARCHAR (30),簡介 VARCHAR (50),詳情 VARCHAR (1000),最新章節 VARCHAR (50),封面 VARCHAR (100))'
                    cursor.execute(sql)
                self._table_ready = True

            # Values are bound by the driver, never formatted into the SQL:
            # scraped text may contain quotes or SQL fragments.
            sql = 'insert into db_read(書名,作者,評分,類型,狀態,簡介,詳情,最新章節,封面)VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            cursor.execute(sql, [item[field] for field in self._INSERT_FIELDS])
            self.connect.commit()
        except Exception:
            # Leave the connection clean for the next item, then let Scrapy
            # report the failure.
            self.connect.rollback()
            raise
        finally:
            cursor.close()
        return item
settings進行簡單配置,就可以運行程序了。
為了方便調試程序,可以在項目外編寫一個main.py入口文件,和命令行執行 scrapy crawl read 效果是一樣的。
main代碼如下:
from scrapy import cmdline

if __name__ == '__main__':
    # Same as running `scrapy crawl read` on the command line; the guard keeps
    # the crawl from starting if this module is merely imported.
    cmdline.execute('scrapy crawl read'.split())
爬取數據效果圖:
用crawl spider爬取起點網小說信息