
Crawler practice: recursively crawling all links under an entry page (distributed with scrapy-redis)

1. Preparation before implementing scrapy-redis

  1. Install the scrapy and scrapy-redis modules in PyCharm (a quick import check follows this list)
  2. Open the folder containing the scrapy-redis source code in PyCharm
  3. As with plain scrapy, modify four files: items, settings, pipelines, and the custom spider code dmoz
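
A minimal sanity check that both packages ended up in the interpreter PyCharm is using (assuming both expose __version__, which recent releases do):

# verify that scrapy and scrapy_redis can be imported
import scrapy
import scrapy_redis

print('scrapy', scrapy.__version__)
print('scrapy_redis', scrapy_redis.__version__)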

2. Differences between scrapy-redis and scrapy

scrapy-redis uses redis to implement distributed crawling. The differences show up in four components:

Scheduler

  1. scrapy
    1. Rewrites Python's double-ended queue into its own priority queue, but when a scrapy project runs several spiders they cannot share one queue of pending requests
  2. scrapy-redis
    1. Moves the scrapy queue into a redis database, so multiple crawlers read from and share a single queue (a conceptual sketch follows this list)
    2. FIFO and LIFO queues are also supported
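
A conceptual sketch of such a shared priority queue built on a redis sorted set (this is not the scrapy-redis implementation; the key name 'demo:requests', the pickle serialization and the redis-py 3.x zadd signature are all illustrative):

import pickle

import redis

r = redis.StrictRedis(host='10.25.34.65', port=6379)

def push(request_dict, priority=0):
    # lower score is popped first, so negate the priority
    r.zadd('demo:requests', {pickle.dumps(request_dict): -priority})

def pop():
    # take the best-ranked entry; scrapy-redis does this inside a redis
    # pipeline so that several consumers cannot grab the same request
    found = r.zrange('demo:requests', 0, 0)
    if found:
        r.zrem('demo:requests', found[0])
        return pickle.loads(found[0])

push({'url': 'https://www.lagou.com/'}, priority=10)
print(pop())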

Duplication Filter

  1. scrapy
    1. Uses a set to remove duplicates
    2. The fingerprint of every request already sent is stored in the set; each new request is checked against it to decide whether it was requested before
  2. scrapy-redis
    1. A redis set never holds the same member twice
    2. Fingerprints are stored in redis, and only requests whose fingerprints are new are written to the request queue (see the sketch after this list)
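
A minimal sketch of fingerprint de-duplication against a shared redis set, using scrapy's request_fingerprint helper; the key name 'demo:dupefilter' is only an example:

import redis
from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

r = redis.StrictRedis(host='10.25.34.65', port=6379)

def seen_before(request):
    fp = request_fingerprint(request)
    # SADD returns 1 if the fingerprint was new, 0 if it was already in the set
    return r.sadd('demo:dupefilter', fp) == 0

print(seen_before(Request('https://www.lagou.com/')))  # False the first time
print(seen_before(Request('https://www.lagou.com/')))  # True afterwards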

Item Pipeline

  1. scrapy
    Scraped data is handed directly to the pipeline files
  2. scrapy-redis
    Scraped data is pushed into a redis data queue, so a cluster of item-processing workers can consume it (a sketch follows this list)
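
A conceptual sketch of that hand-off with redis-py (the list name 'dmoz:items' follows the '<spider name>:items' pattern that scrapy_redis's RedisPipeline uses by default; the sample item is made up):

import json

import redis

r = redis.StrictRedis(host='10.25.34.65', port=6379)

# pipeline side: serialize the scraped item and append it to the shared list
r.rpush('dmoz:items', json.dumps({'positionName': '機器學習工程師', 'salary': '20k-40k'}))
print(r.llen('dmoz:items'))  # every crawler node pushes into the same list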

Base Spider

  1. scrapy
    The Spider class
  2. scrapy-redis
    Inherits from both the Spider class and the RedisMixin class, and reads its urls from redis (a minimal example follows this list)
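
A minimal sketch of that pattern with the RedisSpider class shipped by scrapy_redis (it combines Spider and RedisMixin); the spider name and redis_key below are illustrative:

from scrapy_redis.spiders import RedisSpider

class DemoRedisSpider(RedisSpider):
    name = 'demo_redis'
    # the spider waits on this redis list and turns every value pushed into it
    # into a request, e.g. seeded with: lpush demo_redis:start_urls https://www.lagou.com/
    redis_key = 'demo_redis:start_urls'

    def parse(self, response):
        yield {'url': response.url, 'title': response.css('title::text').extract_first()}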

3. Code

settings

# Scrapy settings for lagou project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/topics/settings.html
#
SPIDER_MODULES = ['lagou.spiders']
NEWSPIDER_MODULE = 'lagou.spiders'
#USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'

# Filter duplicate requests through redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Scheduler that stores and dispatches the request queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Persist the request queue on the remote server instead of clearing it after a crawl
SCHEDULER_PERSIST = True
# Use a queue provided by the framework
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"  # most common: priority queue
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"  # FIFO queue, first in first out
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"  # LIFO queue, last in first out

ITEM_PIPELINES = {
    'lagou.pipelines.lagouPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Log level
# LOG_LEVEL = 'DEBUG'

# Introduce an artificial delay to make use of parallelism.
# Delay between requests
DOWNLOAD_DELAY = 30

# Request headers
DEFAULT_REQUEST_HEADERS = {
    'Referer': 'https://www.lagou.com/jobs/list_%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
}

# Do not send cookies
COOKIES_ENABLED = False
# Do not obey robots.txt
ROBOTSTXT_OBEY = False

# Retries
RETRY_ENABLED = True
RETRY_TIMES = 5       # number of retries
DOWNLOAD_TIMEOUT = 5  # download timeout in seconds

# Connect to the remote redis service; a redis cluster can be used for distribution
REDIS_HOST = '10.25.34.65'
REDIS_PORT = 6379
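
Before launching the crawl it is worth checking that the REDIS_HOST above is reachable; a minimal check with redis-py:

import redis

r = redis.StrictRedis(host='10.25.34.65', port=6379)
print(r.ping())  # True if the redis server answered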

items

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join


class ExampleItem(Item):
    # default framework fields
    name = Field()
    description = Field()
    link = Field()
    crawled = Field()
    spider = Field()
    url = Field()
    # custom fields
    positionName = Field()
    companyFullName = Field()
    companyShortName = Field()
    companySize = Field()
    financeStage = Field()
    district = Field()
    education = Field()
    workYear = Field()
    salary = Field()
    positionAdvantage = Field()


class ExampleLoader(ItemLoader):
    default_item_class = ExampleItem
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
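
A short, hypothetical use of ExampleLoader, only to illustrate what the processors above do (strip every input value, keep the first value per field, join the description parts):

from lagou.items import ExampleLoader

loader = ExampleLoader()
loader.add_value('name', ['  Machine Learning Engineer  ', 'ignored second value'])
loader.add_value('description', ['part one', 'part two'])
item = loader.load_item()
print(item['name'])         # 'Machine Learning Engineer'
print(item['description'])  # 'part one part two'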

pipelines

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html
import os
from datetime import datetime

import pandas


class lagouPipeline(object):
    def process_item(self, item, spider):
        # framework defaults: crawl timestamp and spider name
        item["crawled"] = datetime.utcnow()
        item["spider"] = spider.name
        # custom fields
        positionName = item['positionName']
        companyFullName = item['companyFullName']
        companyShortName = item['companyShortName']
        companySize = item['companySize']
        financeStage = item['financeStage']
        district = item['district']
        education = item['education']
        workYear = item['workYear']
        salary = item['salary']
        positionAdvantage = item['positionAdvantage']
        data = [[companyFullName, companyShortName, companySize, financeStage, district, positionName,
                 workYear, education, salary, positionAdvantage]]
        columns = ['公司全名', '公司簡稱', '公司規模', '融資階段', '區域', '職位名稱', '工作經驗', '學歷要求', '工資', '職位福利']
        df = pandas.DataFrame(data=data, columns=columns)
        # append one row per item; only write the header when the file is first created
        df.to_csv('北京-機器學習.csv', mode='a', header=not os.path.exists('北京-機器學習.csv'), index=False)
        return item
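
Because RedisPipeline is also enabled (priority 400 in the settings above), every item is additionally serialized into a redis list, by default keyed '<spider name>:items', so separate worker processes can post-process them; a hypothetical consumer sketch:

import json

import redis

r = redis.StrictRedis(host='10.25.34.65', port=6379)

while True:
    # block until a crawler node pushes the next item, then decode it
    _key, raw = r.blpop('dmoz:items')
    job = json.loads(raw)
    print(job.get('positionName'), job.get('salary'))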

Custom spider code: dmoz

import json
import math

import scrapy
from scrapy.spiders import CrawlSpider, Rule

from lagou.items import ExampleItem


class DmozSpider(CrawlSpider):
    name = 'dmoz'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/jobs/positionAjax.json?px=default&city=北京&needAddtionalResult=false']

    # rules = [
    #     Rule(LinkExtractor(
    #         allow=(r'a regular expression matching every link inside the allowed domain www.lagou.com')
    #     ), callback='start_requests', follow=True),
    # ]

    def start_requests(self):
        print('start_requests--------------------------------------------------------')
        url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=北京&needAddtionalResult=false'
        yield scrapy.FormRequest(
            url= url,
            formdata={
                'first': 'true',
                'pn': '1',
                'kd': '機器學習'
            },
            callback=self.get_pagenum,
        )
    def get_pagenum(self, response):
        # work out the total number of result pages
        meta = json.loads(response.body)
        print(meta)
        jobnum = meta['content']['positionResult']['totalCount']
        pagedemo = math.ceil(jobnum / 15)  # the listing returns 15 positions per page
        # cap the crawl at 30 pages
        if pagedemo > 30:
            pagenum = 30
        else:
            pagenum = pagedemo
        print(f'Total pages: {pagenum}')
        url = response.url
        for num in range(1,pagenum+1):
            yield scrapy.FormRequest(
                url= url,
                formdata={
                    'first': 'true',
                    'pn': str(num),
                    'kd': '機器學習'
                },
                callback=self.get_message,
            )
    def get_message(self, response):
        # json.loads turns the json response body into a dict
        meta = json.loads(response.body)
        print(f'meta:{meta}')

        joblist = meta['content']['positionResult']['result']
        for job in joblist:
            # build a fresh item for every position and hand it to the pipelines
            item = ExampleItem()
            item['positionName'] = job['positionName']
            item['companyFullName'] = job['companyFullName']
            item['companyShortName'] = job['companyShortName']
            item['companySize'] = job['companySize']
            item['financeStage'] = job['financeStage']
            item['district'] = job['district']
            item['education'] = job['education']
            item['workYear'] = job['workYear']
            item['salary'] = job['salary']
            item['positionAdvantage'] = job['positionAdvantage']
            yield item
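
With every node's settings pointing at the same REDIS_HOST, distribution simply means starting the same spider on several machines (scrapy crawl dmoz on each). A hypothetical script equivalent, assuming the spider file lives at lagou/spiders/dmoz.py:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from lagou.spiders.dmoz import DmozSpider

# every process started this way shares the redis request queue and dupefilter,
# so running it on several machines splits the pages between them
process = CrawlerProcess(get_project_settings())
process.crawl(DmozSpider)
process.start()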