
Web scraping: crawling Qiushibaike with Scrapy and persisting the data to a txt file


Project directory structure (the original screenshot is not reproduced here).
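For orientation, the layout generated by scrapy startproject firstBlood followed by scrapy genspider first <domain> looks roughly like this (a sketch based on Scrapy's standard project template, not the original screenshot):

firstBlood/
├── scrapy.cfg                  # deploy configuration
└── firstBlood/
    ├── __init__.py
    ├── items.py                # item definitions (shown below)
    ├── middlewares.py          # spider / downloader middlewares (unused here)
    ├── pipelines.py            # item pipelines (shown below)
    ├── settings.py             # project settings (shown below)
    └── spiders/
        ├── __init__.py
        └── first.py            # the spider source shown next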

The first.py spider source under spiders/

  

# -*- coding: utf-8 -*-
import scrapy
from firstBlood.items import FirstbloodItem


class FirstSpider(scrapy.Spider):
    # Name of the spider.
    # When a project contains several spiders, this name identifies which one to run.
    name = 'first'
    # allowed_domains: the permitted domains; can conflict with start_urls.
    # allowed_domains = ['www.xxx.com']
    # start_urls: the list of URLs to request; the requests are sent automatically.
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        '''
        Parse the response of the request.
        Regular expressions or XPath can be used; since Scrapy has XPath built in,
        XPath is recommended. Parsing yields selector objects.
        :param response:
        :return:
        '''
        all_data = []
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # author = div.xpath('./div[1]/a[2]/h2/text()')
            # author is not the raw page source as one might expect but a selector
            # object; we only need to take the data held by that selector.
            # author = author[0].extract()
            # This raises an error for anonymous users (their markup differs from
            # that of logged-in users).

            # Improved version:
            author = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()')[0].extract()
            content = div.xpath('.//div[@class="content"]/span//text()').extract()
            content = ''.join(content)
            # print(author + ':' + content.strip(' \n \t '))

            # Terminal-based storage:
            # dic = {
            #     'author': author,
            #     'content': content
            # }
            # all_data.append(dic)
            # return all_data

            # Two ways to persist the data:
            # 1. Terminal command: parse returns a value, then run
            #    scrapy crawl first -o qiubai.csv --nolog
            #    (the terminal command only exports formats such as json, csv and xml)
            # 2. Item pipeline:
            item = FirstbloodItem()  # instantiate a new item object on each iteration
            item['author'] = author
            item['content'] = content
            yield item  # hand the item over to the pipeline
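For comparison, the terminal-based persistence mentioned in the comments above (method 1) would look roughly like the sketch below: parse returns a list of dicts built with the same XPath expressions, and the export format is chosen on the command line.

# Sketch of method 1 (terminal-command persistence); run with:
#   scrapy crawl first -o qiubai.csv --nolog
def parse(self, response):
    all_data = []
    for div in response.xpath('//div[@id="content-left"]/div'):
        author = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()')[0].extract()
        content = ''.join(div.xpath('.//div[@class="content"]/span//text()').extract())
        all_data.append({'author': author, 'content': content})
    return all_data  # Scrapy exports the returned dicts to csv/json/xml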

The items file (items.py)

  

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class FirstbloodItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # The Item is a general-purpose container: its fields can hold values of any type (strings, json, etc.)
    author = scrapy.Field()
    content = scrapy.Field()
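Because scrapy.Item behaves like a dictionary, the spider can assign item['author'] and the pipeline can read it back. A minimal sketch (the sample values are made up):

from firstBlood.items import FirstbloodItem

item = FirstbloodItem()
item['author'] = 'some user'       # only fields declared with scrapy.Field() are accepted
item['content'] = 'some joke text'
print(dict(item))                  # {'author': 'some user', 'content': 'some joke text'}
# item['title'] = 'x'              # would raise KeyError: 'title' is not a declared field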

The pipeline file (pipelines.py)

  

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


# All code related to persistent storage should live in this file.
class FirstbloodPipeline(object):
    fp = None

    def open_spider(self, spider):
        print('Spider started')
        self.fp = open('./qiushibaike.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        '''
        Process an item.
        :param item:
        :param spider:
        :return:
        '''
        self.fp.write(item['author'] + ':' + item['content'])
        print(item['author'], item['content'])
        return item

    def close_spider(self, spider):
        print('Spider finished')
        self.fp.close()
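The same three hooks (open_spider, process_item, close_spider) can back any storage target. As an illustration only (this class is not part of the original project), a pipeline that writes one JSON object per line might look like the sketch below; it would also have to be registered in ITEM_PIPELINES to take effect.

import json

class JsonLinesPipeline(object):
    # Hypothetical alternative pipeline: one JSON object per line.
    def open_spider(self, spider):
        self.fp = open('./qiushibaike.jsonl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item  # return the item so any later pipeline still receives it

    def close_spider(self, spider):
        self.fp.close()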

The settings file (settings.py)

# -*- coding: utf-8 -*-

# Scrapy settings for firstBlood project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'firstBlood'

SPIDER_MODULES = ['firstBlood.spiders']
NEWSPIDER_MODULE = 'firstBlood.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'

# Obey robots.txt rules
# Defaults to True; set to False so the robots.txt rules are not obeyed (avoids being blocked by them).
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'firstBlood.middlewares.FirstbloodSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'firstBlood.middlewares.FirstbloodDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'firstBlood.pipelines.FirstbloodPipeline': 300,  # 300 is the priority
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
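If the hypothetical JsonLinesPipeline from the pipeline section were added, both pipelines would be registered together in ITEM_PIPELINES; the lower the number, the earlier the pipeline runs (values are conventionally between 0 and 1000):

# Sketch only: assumes JsonLinesPipeline exists in firstBlood/pipelines.py
ITEM_PIPELINES = {
   'firstBlood.pipelines.FirstbloodPipeline': 300,  # runs first
   'firstBlood.pipelines.JsonLinesPipeline': 400,   # runs second
}

With the pipeline enabled and ROBOTSTXT_OBEY = False, the crawl is started with scrapy crawl first --nolog, and qiushibaike.txt is written to the working directory.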
