1. 程式人生 > Scrapy框架自定義pipeline兩層下載路徑去下載圖片,關於item傳值的問題

Scrapy框架自定義pipeline兩層下載路徑去下載圖片,關於item傳值的問題

自定義兩層下載路徑時,兩層目錄的名稱(大分類名稱與小分類名稱)必須透過 Request 的 meta 在回呼函式之間逐層傳遞,最後寫入 item,pipeline 才能據此組出儲存路徑。爬蟲程式碼如下:

import scrapy
from urllib.request import urljoin
from ..items import OffmymindspiderItem
class OffmymindSpider(scrapy.Spider):
    """Crawl biaobaiju.com and yield one item per image found.

    The crawl walks three levels: top-level categories (``parse``), the image
    sets listed on a category page (``parse_img_type_info``) and the images
    inside one image set (``parse_every_small_type_info``).  The names of the
    first two levels travel through ``Request.meta`` and end up on the item as
    ``img_type_name`` and ``img_small_type_name``.
    """

    name = 'offmymind'
    allowed_domains = ['www.biaobaiju.com']
    start_urls = ['http://www.biaobaiju.com/']

    def parse(self, response):
        """Yield one request per top-level category link in the navigation bar.

        :param response: response for one of ``start_urls``.
        :return: generator of requests handled by ``parse_img_type_info``;
            each carries the category's link text in ``meta["img_type_name"]``.
        """
        for anchor in response.xpath("//ul[@class='nav clearfix']/li/a"):
            href = anchor.xpath("@href").extract_first("")
            if not href:
                # An anchor without href would give Request an empty URL,
                # which it rejects with ValueError.
                continue
            yield scrapy.Request(
                # urljoin leaves absolute URLs untouched and resolves
                # relative ones against the current page.
                url=response.urljoin(href),
                dont_filter=False,
                callback=self.parse_img_type_info,
                meta={"img_type_name": anchor.xpath("text()").extract_first("")},
            )

    def parse_img_type_info(self, response):
        """Yield one request per image set listed on a category page.

        :param response: category page; ``meta["img_type_name"]`` holds the
            category name set by ``parse``.
        :return: generator of requests handled by
            ``parse_every_small_type_info``; each carries both
            ``img_type_name`` and ``img_small_type_name`` in ``meta``.
        """
        img_type_name = response.meta.get("img_type_name")
        for div in response.xpath("//ul[@id='container']/li/div[2]"):
            href = div.xpath("a/@href").extract_first("")
            if not href:
                continue
            yield scrapy.Request(
                url=response.urljoin(href),
                dont_filter=True,
                callback=self.parse_every_small_type_info,
                # Build a fresh dict with only our own keys.  Forwarding the
                # whole response.meta would also copy Scrapy's bookkeeping
                # keys (retry/redirect counters, etc.) onto the new request.
                meta={
                    "img_type_name": img_type_name,
                    "img_small_type_name": div.xpath("a/text()").extract_first(""),
                },
            )
        # Pagination is intentionally disabled: there are many pages, so only
        # the first page of every category is downloaded.  To enable it, yield
        # a request for the next page that calls this method again:
        # href = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").extract_first("")
        # if href:
        #     yield scrapy.Request(url=response.urljoin(href), dont_filter=True,
        #                          callback=self.parse_img_type_info,
        #                          meta={"img_type_name": img_type_name})

    def parse_every_small_type_info(self, response):
        """Yield one item per image on the first page of an image set.

        :param response: image-set page; ``meta`` holds ``img_type_name`` and
            ``img_small_type_name`` set by the previous callbacks.
        :return: generator of ``OffmymindspiderItem`` objects whose
            ``img_url`` is a one-element list.
        """
        for paragraph in response.xpath("//div[@class='content tag_blue']/p"):
            img_src = paragraph.xpath("img/@src").extract_first("")
            if not img_src:
                # Paragraph without an image: nothing to download.
                continue
            item = OffmymindspiderItem()
            # Stored as a list because the pipeline iterates over img_url.
            # Made absolute so the pipeline can build a valid Request from it.
            item["img_url"] = [response.urljoin(img_src)]
            item["img_small_type_name"] = response.meta.get("img_small_type_name")
            item["img_type_name"] = response.meta.get("img_type_name")
            yield item

items.py中的程式碼

import scrapy
class ZhanzhangsucaispiderItem(scrapy.Item):
    """Item describing one image to download and where it was stored.

    Every key that the spider or the pipeline assigns must be declared here;
    ``scrapy.Item`` raises ``KeyError`` when an undeclared field is set.
    """

    name = scrapy.Field()
    # List of image URLs to download (the spider stores a one-element list).
    img_url = scrapy.Field()
    # Storage path of the downloaded image, filled in by the pipeline.
    img_path = scrapy.Field()
    # Name of the top-level category; first directory level of the path.
    img_type_name = scrapy.Field()
    # Name of the image set; second directory level of the path.
    img_small_type_name = scrapy.Field()


# The spider imports the item class under this name; keep both names valid.
OffmymindspiderItem = ZhanzhangsucaispiderItem

settings.py中修改的內容

# Line 22 of the generated settings.py: must be changed to False.
ROBOTSTXT_OBEY = False

# Line 67 of the generated settings.py: enable the custom image pipeline.
ITEM_PIPELINES = {"OffMyMindSpider.pipelines.CustomImagesPipeline": 300}

# Directory name configured for image storage.
IMAGES_STORE = "imgs"

pipelines.py中的程式碼

from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
import scrapy
class CustomImagesPipeline(ImagesPipeline):
    """Image pipeline that stores files under ``<type>/<small type>/<file>``.

    The two directory levels come from the item's ``img_type_name`` and
    ``img_small_type_name`` fields.
    """

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item["img_url"]``.

        The item and the original URL are attached to ``meta`` so that
        ``file_path`` can build the storage path from them.
        """
        # `or []` tolerates an item whose img_url is missing or None.
        for img_download_url in item.get("img_url") or []:
            yield scrapy.Request(
                url=img_download_url,
                meta={"item": item, "img_download_url": img_download_url},
            )

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path, relative to IMAGES_STORE, for ``request``.

        :param request: download request created by ``get_media_requests``.
        :param item: the item being processed; newer Scrapy releases pass it
            as a keyword argument.  Falls back to ``request.meta["item"]``.
        :return: ``"<img_type_name>/<img_small_type_name>/<file name>"``.
        """
        if item is None:
            item = request.meta["item"]
        # Name the file after the URL of *this* request, not after the first
        # URL of the item, so items with several URLs do not overwrite the
        # same file.
        source_url = request.meta.get("img_download_url", request.url)
        file_name = source_url.split("/")[-1]
        return f"{item['img_type_name']}/{item['img_small_type_name']}/{file_name}"

    def item_completed(self, results, item, info):
        """Record the stored path on the item, or drop it if nothing downloaded.

        :param results: list of ``(success, value)`` tuples, one per request;
            ``value`` is a dict with a ``"path"`` key only when ``success`` is
            true, so failed entries must be filtered out before reading it.
        :raises DropItem: when no image of the item was downloaded.
        :return: the item with ``img_path`` set to the first stored path.
        """
        stored_paths = [value.get("path") for ok, value in results if ok]
        stored_paths = [path for path in stored_paths if path]
        if not stored_paths:
            raise DropItem("Image download failed, delete the corresponding item value, do not let it return out")
        item["img_path"] = stored_paths[0]
        return item

具體內容的註釋可以參考連結:https://mp.csdn.net/postedit/84668344