Python Crawler (16): Crawling the Dingdian Novel Site with the Scrapy Framework

This post uses the Scrapy framework to crawl the novels of the entire Dingdian novel site.

1. Installing Scrapy

There are plenty of installation tutorials online, so they are not repeated here.
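On most systems Scrapy can be installed directly with pip (this assumes pip is already set up; on Windows you may additionally need prebuilt wheels for dependencies such as Twisted and lxml):

pip install scrapy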

2. About Scrapy

The Scrapy framework is a very good tool: it crawls asynchronously, which saves a lot of time. This crawl could in principle be written in the same sequential style as the earlier posts in this series, but that would be far too slow given the amount of data.

For the framework itself, please look up examples online as well.
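As a rough idea of where the speed-up comes from: Scrapy keeps many requests in flight at once, and the concurrency and politeness delay are tuned in settings.py. The values below are only an illustration, not this project's actual configuration (that file is shown in full later in this post):

CONCURRENT_REQUESTS = 16              # how many requests Scrapy keeps in flight at once
CONCURRENT_REQUESTS_PER_DOMAIN = 8    # per-domain cap
DOWNLOAD_DELAY = 0.5                  # optional delay (in seconds) between requests to the same site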

3. The Implementation

Create the project with:

scrapy startproject dingdian

Then add the spider file by hand (it can also be generated, as noted after the layout below); the final project layout is as follows:

├── dingdian
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── mydingdian.py
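The spiders/mydingdian.py file was added by hand here; if you prefer, Scrapy can also generate a spider skeleton from its built-in template:

scrapy genspider mydingdian 23us.com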

The main program:

mydingdian.py

#coding:utf-8
import scrapy
import re
from scrapy.http import Request
from dingdian.items import DingdianItem
#from dingdian.items import DDNovelContentItem

class Myspider(scrapy.Spider):
	name = "dingdian"
	allowed_domains = ["23us.com"]
	bash_url = "http://www.23us.com/class/"
	bashurl='.html'
	
	def start_requests(self):
		# the commented-out range would cover all ten listing categories; only category 7 is crawled here
		#for i in range(1,11):
		for i in range(7,8):
			url=self.bash_url+str(i)+"_1"+self.bashurl
			yield Request(url,self.parse)

	def parse(self, response):
		baseurl=response.url  # the url here looks like http://www.23us.com/class/*_1.html
		max_num=response.xpath('//*[@id="pagelink"]/a[14]/text()').extract_first()  # largest page number for this category
		print max_num
		baseurl=baseurl[:-7]  # strip the trailing "_1.html"

		# the commented-out range would walk every listing page; only the first two are crawled here
		#for num in xrange(1,int(max_num)+1):
		for num in xrange(1,3):
			newurl=baseurl+"_"+str(num)+self.bashurl
			# Scrapy de-duplicates request URLs (RFPDupeFilter); dont_filter=True exempts this URL from
			# de-duplication, otherwise the first listing page would be filtered out and never crawled.
			yield Request(newurl,dont_filter=True,callback=self.get_name)  # hand the listing page to get_name
	
	def get_name(self,response):
		for nameinfo in response.xpath('//tr'):
			novelurl = nameinfo.xpath('td[1]/a/@href').extract_first()  # novel page url
			name = nameinfo.xpath('td[1]/a/text()').extract_first()  # novel title
			if novelurl:
				yield Request(novelurl,dont_filter=True,callback=self.get_novelcontent,meta={'name':name})
			'''
			# The novel details could also be scraped from the listing page itself:
			name = nameinfo.xpath('td[1]/a/text()').extract_first()  # novel title
			author= nameinfo.xpath('td[3]/text()').extract_first()  # author
			novelurl = nameinfo.xpath('td[1]/a/@href').extract_first()  # novel page url
			serialstatus = nameinfo.xpath('td[6]/text()').extract_first()  # serialization status
			serialnumber = nameinfo.xpath('td[4]/text()').extract_first()  # word count
			if novelurl:
				targentcontent['novel_name']=name
				targentcontent['author']=author
				targentcontent['novelurl']=novelurl
				targentcontent['serialstatus']=serialstatus
				targentcontent['serialnumber']=serialnumber
				#print name,author,novelurl,serialstatus,serialnumber
				yield Request(novelurl,callback=self.get_novelcontent,meta={'targentcontent':targentcontent})
			# These details are not passed along for now; they are re-read on each novel's own page.
			'''
	
	def get_novelcontent(self,response):
		#targentcontent=response.meta['targentcontent']
		#print targentcontent['novelurl'],targentcontent['name']
		#title = response.xpath('//dd[1]/h1/text()').extract_first()
		novel_name=response.meta['name']  # novel title
		author = response.xpath('//tr[1]/td[2]/text()').extract_first()  # author
		novelurl = response.url  # novel page url
		serialstatus = response.xpath('//tr[1]/td[3]/text()').extract_first()  # serialization status
		serialnumber = response.xpath('//tr[2]/td[2]/text()').extract_first()  # word count
		category = response.xpath('//tr[1]/td[1]/a/text()').extract_first()  # category
		name_id = novelurl[-5:]  # novel id (taken from the end of the url)
		collect_num_total=response.xpath('//tr[2]/td[1]/text()').extract_first()  # total bookmarks
		click_num_total=response.xpath('//tr[3]/td[1]/text()').extract_first()  # total clicks

		#chapterlistul=response.xpath('//dd[2]/div[2]/p[2]/a/text()').extract_first()
		chapterlisturl=response.xpath('//dd[2]/div[2]/p[2]/a/@href').extract_first()  # link to the chapter list
		novel_breif=response.xpath('//dd[2]/p[2]').extract_first()  # novel synopsis

		targentcontent=DingdianItem()
		targentcontent['novel_name']=novel_name
		targentcontent['author']=author
		targentcontent['novelurl']=novelurl
		targentcontent['serialstatus']=serialstatus
		targentcontent['serialnumber']=serialnumber
		targentcontent['category']=category
		targentcontent['name_id']=name_id
		targentcontent['collect_num_total']=collect_num_total
		targentcontent['click_num_total']=click_num_total
		targentcontent['novel_breif']=novel_breif
		#yield targentcontent
		#print novel_name,author,novelurl,serialstatus,serialnumber,category,name_id,collect_num_total,click_num_total,chapterlisturl
		yield Request(chapterlisturl,dont_filter=True,callback=self.get_charaterurl,meta={'targentcontent':targentcontent})

	def get_charaterurl(self,response):
		#print response.url
		item=response.meta['targentcontent']
		for contents in response.xpath('//table/tr'):
			for content in contents.xpath('td'):
				if  content.xpath('a/text()').extract_first():
					#print content.xpath('a/text()').extract_first()
					item['chapterurl']=response.url+content.xpath('a/@href').extract_first()
					item['chaptername']=content.xpath('a/text()').extract_first()
					yield item
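
One detail worth pointing out: get_charaterurl builds each chapter URL by plain string concatenation with response.url, which only works when the chapter-list URL ends with a slash and the hrefs are relative. A slightly more robust sketch (not the code used above) would let Scrapy resolve the link with response.urljoin:

	def get_charaterurl(self,response):
		item=response.meta['targentcontent']
		for link in response.xpath('//table/tr/td/a'):
			href=link.xpath('@href').extract_first()
			if href:
				item['chapterurl']=response.urljoin(href)  # resolves relative links against response.url
				item['chaptername']=link.xpath('text()').extract_first()
				yield item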
		
		
		
		
The fields to be stored are defined in items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DingdianItem(scrapy.Item):
	# define the fields for your item here like:
	# name = scrapy.Field()
	novel_name = scrapy.Field()  # novel title
	author = scrapy.Field()  # author
	novelurl = scrapy.Field()  # novel page url
	serialstatus = scrapy.Field()  # serialization status
	serialnumber = scrapy.Field()  # word count
	category = scrapy.Field()  # category
	name_id = scrapy.Field()  # novel id
	collect_num_total=scrapy.Field()  # total bookmarks
	click_num_total=scrapy.Field()  # total clicks
	novel_breif=scrapy.Field()  # novel synopsis
	chapterurl = scrapy.Field()  # chapter url
	chaptername = scrapy.Field()  # chapter title

The settings: settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for dingdian project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dingdian'

SPIDER_MODULES = ['dingdian.spiders']
NEWSPIDER_MODULE = 'dingdian.spiders'


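# Custom setting (not a built-in Scrapy option): base directory under which the pipeline saves the novels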
PAGE_STORGE="novels"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dingdian (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'dingdian.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'dingdian.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'dingdian.pipelines.DingdianPipeline': 100,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Finally, the data processing and storage:

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from dingdian import settings
import os
import urllib2
from dingdian.items import DingdianItem
#from dingdian.items import DDNovelContentItem

from bs4 import BeautifulSoup as bs
import sys

reload(sys)
sys.setdefaultencoding('utf-8')


class DingdianPipeline(object):
	def process_item(self, item, spider):
		dir_path="%s/%s" % (settings.PAGE_STORGE,spider.name)
		if not os.path.exists(dir_path):
		#	print "dir_path is %s",dir_path
			os.makedirs(dir_path)
		if isinstance(item,DingdianItem):
			novelpath=dir_path+'/'+item['novel_name']
			print novelpath
			if not os.path.exists(novelpath):
				os.makedirs(novelpath)
			# write a one-off "<novel name>_簡介" file holding the novel's metadata and synopsis
			novelbreif=item['novel_name']+"_簡介"
			novelbreifpath=novelpath+'/'+novelbreif+'.txt'
			if not os.path.exists(novelbreifpath):
				with open(novelbreifpath,'wb') as novel_write:
					novel_write.write(item['novel_name'])
					novel_write.write('\t|\t')
					novel_write.write(item['author'])
					novel_write.write('\t|\t')
					novel_write.write(item['novelurl'])
					novel_write.write('\n')
					novel_write.write(item['serialstatus'])
					novel_write.write('\t|\t')
					novel_write.write(item['serialnumber'])
					novel_write.write('\t|\t')
					novel_write.write(item['category'])
					novel_write.write('\n')
					novel_write.write(item['name_id'])
					novel_write.write('\t|\t')
					novel_write.write(item['collect_num_total'])
					novel_write.write('\t|\t')
					novel_write.write(item['click_num_total'])
					novel_write.write('\n')
					novel_write.write(item['novel_breif'])

			# download the chapter page and save its text as "<chapter name>.txt"
			titlename=item['chaptername']
			titlenamepath=novelpath+'/'+titlename+'.txt'
			print titlenamepath
			chapterurl=item['chapterurl']
			html=urllib2.urlopen(chapterurl).read()
			soup1=bs(html,'lxml')
			if not os.path.exists(titlenamepath):
				with open(titlenamepath,'wb') as file_write:
					cont=soup1.find("dd",attrs={"id":"contents"}).getText()
					file_write.write(cont)

		return item



Then run:

scrapy crawl dingdian
If there are no errors, wait a few hours and you will find plenty of novels sitting on your computer.
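
The pipeline above already writes the chapter text to disk itself. If you also want the item metadata collected in one file, Scrapy's built-in feed exports can do that when the crawl is started, for example:

scrapy crawl dingdian -o books.csv

The -o books.csv option writes the scraped items to a CSV file; besides CSV, Scrapy also supports JSON and XML output.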