1. 程式人生 > >python Scrapy的spider中回撥函式的引數傳遞方法

python Scrapy的spider中回撥函式的引數傳遞方法

通常有兩種方法,一種是使用meta進行引數傳遞。另一種是使用lambda進行引數傳遞。

方法一:

使用meta進行引數傳遞。

舉例如下:

from scrapy.spiders import Spider
import scrapy
import FirmCrawler.items as MI
from sets import Set
import time
import urlparse


class RicohSpider(Spider):
    """Spider that forwards per-row values to its detail callback via ``meta``.

    ``parse`` reads the rows of the download table, builds one request per
    row and attaches the row's values to the request through the ``meta``
    dictionary. ``parse_page`` reads them back from ``response.meta``.
    """

    name = "ricoh"
    # Scrapy reads the plural ``allowed_domains``; the singular spelling is
    # not consulted, so offsite filtering was effectively disabled before.
    allowed_domains = ["www.ricoh-imaging.com.cn"]
    # Old (misspelled) attribute kept as an alias for any code that read it.
    allowed_domain = allowed_domains
    start_urls = [
        "http://www.ricoh-imaging.com.cn/ricoh/service_download.html"
    ]

    # NOTE: class-level mutable object, shared by every instance.
    allsuffix = Set()
    timeout = 20
    trytimes = 3
    # Base URL used to resolve the relative links found in the table.
    headurl = "http://www.ricoh-imaging.com.cn/"

    @staticmethod
    def _last_or_default(values, default=""):
        """Return the last item of *values*, or *default* if it is empty.

        Mirrors the ``list.pop()`` choice of the last element without
        raising ``IndexError`` on an empty extraction result.
        """
        return values[-1] if values else default

    def common(self, tr_list):
        """Placeholder hook; currently does nothing."""
        pass

    def parse(self, response):
        """Yield one request per table row, carrying the row values in ``meta``.

        Rows come from the first table under ``content2`` (every ``tr``
        after the first). A row without a link is skipped; any other missing
        cell is replaced by an empty string instead of raising
        ``IndexError`` and aborting the remaining rows.
        """
        tr_list_1 = response.xpath(".//*[@id='content2']/div[4]/table[1]/tr[position()>1]")
        for tr in tr_list_1:
            hrefs = tr.xpath("./td[2]/a/@href").extract()
            if not hrefs:
                # Nothing to follow for this row.
                continue
            url = urlparse.urljoin(self.headurl, hrefs[-1])

            productVersion = self._last_or_default(
                tr.xpath("./td[3]/text()").extract())

            filename = self._last_or_default(
                tr.xpath("./td[1]/text()").extract())
            print(filename)

            desc = self._last_or_default(
                tr.xpath("./td[2]/a/text()").extract())

            # The values travel with the request and are available in the
            # callback as response.meta['<key>'].
            request = scrapy.FormRequest(
                url,
                callback=self.parse_page,
                meta={'filename': filename,
                      'productVersion': productVersion,
                      'desc': desc})
            yield request

    def parse_page(self, response):
        """Print the response URL and the values passed in through ``meta``."""
        print(response.url)
        # NOTE(review): the XPath contains ``tbody``; confirm the raw HTML
        # really has that element (browsers may insert it themselves).
        publish_Time = response.xpath(".//*[@id='content2']/div[4]/div[3]/table/tbody/tr[4]/td[2]/p/span[1]/text()").extract()
        # Extracted for illustration only; not used further in this example.
        publishTime = self._last_or_default(publish_Time)

        print("test:" + response.meta['filename'])
        print("productVersion:" + response.meta['productVersion'])
        print("desc:" + response.meta['desc'])

主要方法:在呼叫 scrapy.FormRequest 時加上 meta 引數,meta 的值是一個字典(dict)。

request = scrapy.FormRequest(url, callback=self.parse_page, meta = {'filename':filename,'productVersion':productVersion,'desc':desc})
yield request

在被呼叫的方法(回撥函式)中,透過 response.meta['key'] 以字典的 key 取出對應的值:

# response.meta['key']
print "filename:"+response.meta['filename']
print "productVersion:" + response.meta['productVersion']
print "desc:" + response.meta['desc']

方法二:

使用lambda函式進行引數傳遞。

舉例如下:

    def parse(self, response):
        """Yield one request per table row, passing row values via a lambda.

        Rows are taken from the first and second tables under ``content2``
        (every ``tr`` after the first in each). The extracted values are
        handed to ``parse_page`` as extra positional arguments through a
        lambda callback. A row without a link is skipped; any other missing
        cell becomes an empty string instead of raising ``IndexError``.
        """

        def last_text(row, path):
            # Last extracted value for *path*, or "" when nothing matched.
            values = row.xpath(path).extract()
            return values.pop() if values else ""

        tr_list_1 = response.xpath(".//*[@id='content2']/div[4]/table[1]/tr[position()>1]")
        tr_list_2 = response.xpath(".//*[@id='content2']/div[4]/table[2]/tr[position()>1]")
        tr_list = tr_list_1 + tr_list_2

        for tr in tr_list:
            hrefs = tr.xpath("./td[2]/a/@href").extract()
            if not hrefs:
                # Nothing to follow for this row.
                continue
            url = urlparse.urljoin(self.headurl, hrefs.pop())

            productVersion = last_text(tr, "./td[3]/text()")
            productModel = last_text(tr, "./td[1]/text()")
            desc = last_text(tr, "./td[2]/a/text()")

            # Unlike the meta-based variant, the values are bound as lambda
            # default arguments. Defaults are evaluated when the lambda is
            # created, so each request keeps the values of its own row
            # rather than those of the last loop iteration.
            request = scrapy.FormRequest(
                url,
                callback=lambda response, pm=productModel, pv=productVersion, dc=desc:
                    self.parse_page(response, pm, pv, dc),
                dont_filter=True)
            yield request

    def parse_page(self, response, pm, pv , dc ):
        """Print the three values handed over by the lambda callback.

        The lambda built in ``parse`` calls this method with the response
        plus ``pm`` (bound to ``productModel``), ``pv`` (bound to
        ``productVersion``) and ``dc`` (bound to ``desc``).
        """
        # Python 2 print statement: writes the three values on one line.
        print pm,pv,dc

使用lambda進行引數傳遞,注意引數的對應關係,如:pm = productModel、pv = productVersion、dc = desc。

使用meta的時候,不需要在被呼叫的函式中新增引數;使用lambda的時候,則需要在被呼叫的函式中新增對應的引數(如 pm、pv、dc)。

request = scrapy.FormRequest(url, callback=lambda response, pm = productModel,pv= productVersion,dc = desc : self.parse_page(response, pm, pv, dc), dont_filter=True)
yield request

    def parse_page(self, response, pm, pv , dc ):
        """Receive ``pm``, ``pv`` and ``dc`` from the lambda callback and print them."""
        # Python 2 print statement: writes the three values on one line.
        print pm,pv,dc