Crawler: scraping 糗事百科 with Scrapy and persisting the results to a txt file
By 阿新 · Published 2019-01-28
Project directory structure
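For reference, this is the layout that `scrapy startproject firstBlood` produces once the spider has been generated with `scrapy genspider first www.xxx.com` (standard Scrapy scaffold; file comments are mine):

firstBlood/
├── scrapy.cfg            # deployment configuration
└── firstBlood/
    ├── __init__.py
    ├── items.py          # item model definitions
    ├── middlewares.py    # spider / downloader middlewares
    ├── pipelines.py      # item pipelines (persistence)
    ├── settings.py       # project settings
    └── spiders/
        ├── __init__.py
        └── first.py      # the spider shown below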
Source of first.py under spiders
# -*- coding: utf-8 -*-
import scrapy
from firstBlood.items import FirstbloodItem


class FirstSpider(scrapy.Spider):
    # Name of the spider.
    # When a project has several spider files, the name identifies which one to run.
    name = 'first'
    # allowed_domains restricts requests to the listed domains and can clash
    # with start_urls, so it is commented out here.
    # allowed_domains = ['www.xxx.com']
    # start_urls: the list of URLs that Scrapy requests automatically.
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        """
        Parse the response of the request.
        Regular expressions or XPath both work; since Scrapy integrates XPath,
        XPath is recommended. Parsing returns Selector objects.
        :param response:
        :return:
        """
        all_data = []
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # author = div.xpath('./div[1]/a[2]/h2/text()')
            # What comes back is not raw page source but a Selector object;
            # we only need to pull the data attribute out of the Selector:
            # author = author[0].extract()
            # That version breaks on anonymous users, whose HTML structure
            # differs from that of logged-in users. Improved version:
            author = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()')[0].extract()
            content = div.xpath('.//div[@class="content"]/span//text()').extract()
            content = ''.join(content)
            # print(author + ':' + content.strip(' \n \t '))

            # Terminal-based storage:
            # dic = {
            #     'author': author,
            #     'content': content
            # }
            # all_data.append(dic)
        # return all_data

            # Two ways of persistent storage:
            # 1. Terminal command: parse must return a value, then
            #    scrapy crawl first -o qiubai.csv --nolog
            #    (the terminal command only supports formats such as json, csv and xml)
            # 2. Item pipeline:
            item = FirstbloodItem()  # a fresh item object is instantiated on every iteration
            item['author'] = author
            item['content'] = content
            yield item  # submit the item to the pipeline
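To run the project, from the directory containing scrapy.cfg either command below works; the first goes through the pipeline, the second uses the terminal-based storage mentioned in the comments (and needs parse to return all_data):

scrapy crawl first                        # pipeline-based: writes ./qiushibaike.txt
scrapy crawl first -o qiubai.csv --nolog  # terminal-based: exports what parse returns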
Items file
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class FirstbloodItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Field is a universal container: a field can hold a value of any type,
    # e.g. a string or a JSON-like structure.
    author = scrapy.Field()
    content = scrapy.Field()
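A quick illustration of how a scrapy.Item behaves like a dictionary; a minimal sketch, runnable in a plain Python session (the sample values are made up):

from firstBlood.items import FirstbloodItem

item = FirstbloodItem()
item['author'] = 'some_user'       # values of any type are accepted...
item['content'] = 'a funny story'
print(dict(item))                  # {'author': 'some_user', 'content': 'a funny story'}
# item['age'] = 18                 # ...but undeclared fields raise KeyError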
Pipeline file
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# All code related to persistent storage belongs in this file.


class FirstbloodPipeline(object):
    fp = None

    def open_spider(self, spider):
        print('Spider started')
        self.fp = open('./qiushibaike.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """
        Process an item.
        :param item:
        :param spider:
        :return:
        """
        self.fp.write(item['author'] + ':' + item['content'] + '\n')  # newline keeps records separated
        print(item['author'], item['content'])
        return item

    def close_spider(self, spider):
        print('Spider finished')
        self.fp.close()
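If structured output is wanted later, a second pipeline can sit alongside the txt one. This is a minimal sketch under that assumption; the class name JsonLinesPipeline and the output path are my own, not part of the original project:

# -*- coding: utf-8 -*-
import json


class JsonLinesPipeline(object):
    """Hypothetical companion pipeline: one JSON object per line."""
    fp = None

    def open_spider(self, spider):
        self.fp = open('./qiushibaike.jsonl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False)  # keep Chinese characters readable
        self.fp.write(line + '\n')
        return item  # pass the item on to any lower-priority pipeline

    def close_spider(self, spider):
        self.fp.close()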
Settings file
# -*- coding: utf-8 -*-

# Scrapy settings for firstBlood project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'firstBlood'

SPIDER_MODULES = ['firstBlood.spiders']
NEWSPIDER_MODULE = 'firstBlood.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'

# Obey robots.txt rules
# Defaults to True; set to False so the crawler does not follow the robots protocol (anti-crawling countermeasure).
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'firstBlood.middlewares.FirstbloodSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'firstBlood.middlewares.FirstbloodDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'firstBlood.pipelines.FirstbloodPipeline': 300,  # 300 is the priority
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
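If the hypothetical JsonLinesPipeline sketched earlier were enabled as well, ITEM_PIPELINES would hold both entries. The number is the priority: lower values run first, and each pipeline receives whatever the previous one returned:

ITEM_PIPELINES = {
    'firstBlood.pipelines.FirstbloodPipeline': 300,  # runs first (lower value = higher priority)
    'firstBlood.pipelines.JsonLinesPipeline': 400,   # runs second, sees items returned by the first
}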