1. 程式人生 > python - scrapy 爬蟲框架 ( redis去重 )

python - scrapy 爬蟲框架 ( redis去重 )

use 去重 class conn elf sin cls col returns

1. 使用 scrapy_redis 內置的去重類,並加以修改(自定義 redis 中存儲的 key)

settings 配置
# ############### scrapy-redis connection ####################

# NOTE(review): host and password are hard-coded here; consider loading them
# from the environment instead of committing real credentials.
REDIS_HOST = '140.143.227.206'      # Redis host name / IP
REDIS_PORT = 8888                   # Redis port
# Extra keyword arguments for the Redis connection. Default:
# REDIS_PARAMS = {'socket_timeout': 30, 'socket_connect_timeout': 30,
#                 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
REDIS_PARAMS = {'password': 'beta'}
REDIS_ENCODING = "utf-8"            # Redis encoding; default: 'utf-8'
# REDIS_URL = 'redis://user:pass@hostname:9001'  # Connection URL (takes precedence over the settings above)

# Template for the Redis key that holds the request fingerprints.
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
# DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
DUPEFILTER_CLASS = 'dbd.xxx.RedisDupeFilter'
from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults

class RedisDupeFilter(RFPDupeFilter):
    """RFPDupeFilter variant whose Redis key uses a fixed, user-chosen suffix."""

    @classmethod
    def from_settings(cls, settings):
        """Return an instance built from the given settings.

        The key is built from ``defaults.DUPEFILTER_KEY`` with its
        ``timestamp`` placeholder filled by a fixed, user-chosen value
        rather than a timestamp.

        When using the ``scrapy_redis.scheduler.Scheduler`` class, this
        method is not used as it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.

        """
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        # The value below is a placeholder ("write your key here"); replace it
        # with the suffix you want in the Redis key.
        key = defaults.DUPEFILTER_KEY % {'timestamp': '這裏寫 keys'}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)

2. 完全自定義去重類(繼承 BaseDupeFilter)

from scrapy.dupefilter import BaseDupeFilter
import redis
from scrapy.utils.request import request_fingerprint
import scrapy_redis


class DupFilter(BaseDupeFilter):
    """Duplicate filter that records request fingerprints in a Redis set."""

    def __init__(self):
        # NOTE(review): connection details are hard-coded; consider reading
        # them from the project settings instead.
        self.conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')

    def request_seen(self, request):
        """
        Check whether the given request has already been visited.

        :param request: the request to check
        :return: True if it was already visited; False if not
        """
        fid = request_fingerprint(request)
        # A result of 1 is treated as "fingerprint newly added", i.e. the
        # request has not been seen before; anything else counts as seen.
        result = self.conn.sadd('visited_urls', fid)
        return result != 1

python - scrapy 爬蟲框架 ( redis去重 )