1. 程式人生 > 用scrapy爬取搜狗Lofter圖片

用scrapy爬取搜狗Lofter圖片

request index import rap .so 圖片 file loader clas

用scrapy爬取搜狗Lofter圖片

# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy.http import Request
from urllib import parse
from scrapy.loader import ItemLoader

from tutorial.items import LofterSpiderItem


class LofterSpider(scrapy.Spider):
    """Collect LOFTER picture URLs from Sogou's paged recommendation endpoint.

    The crawl starts at the Sogou picture home page, then walks the JSON
    endpoint in ``start_answer_url`` page by page, emitting one
    ``LofterSpiderItem`` per entry found under ``all_items``.
    """

    name = "lofter"
    allowed_domains = ["pic.sogou.com"]
    start_urls = ["http://pic.sogou.com/"]

    # URL template of the JSON endpoint; ``{0}`` is the start offset and the
    # ``len=15`` query parameter asks for 15 entries per request.
    start_answer_url = (
        "http://pic.sogou.com/pics/channel/getAllRecomPicByTag.jsp"
        "?category=LOFTER&tag=%E5%85%A8%E9%83%A8&start={0}&len=15"
    )

    # Headers sent with every request to the JSON endpoint.
    headers = {
        "HOST": "pic.sogou.com",
        "Referer": "http://pic.sogou.com",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) "
            "Gecko/20100101 Firefox/51.0"
        ),
    }

    def parse(self, response):
        """Request the first page (offset 0) of the JSON endpoint.

        The response of the start URL itself is not inspected.
        """
        yield scrapy.Request(
            self.start_answer_url.format(0),
            headers=self.headers,
            callback=self.parse_url,
        )

    def parse_url(self, response):
        """Yield one item per picture on this page, then request the next page.

        The response body is parsed as JSON. Each entry of ``all_items``
        contributes its ``ori_pic_url`` to the ``lofter_image_url`` field of
        a new ``LofterSpiderItem``.
        """
        ans_json = json.loads(response.text)
        entries = ans_json.get("all_items") or []

        for ans in entries:
            item_loader = ItemLoader(item=LofterSpiderItem(), response=response)
            item_loader.add_value("lofter_image_url", ans["ori_pic_url"])
            yield item_loader.load_item()

        # Stop paging once a page comes back without entries; otherwise the
        # spider would keep requesting ever larger offsets indefinitely.
        if not entries:
            return

        # Advance by 15 to match the ``len=15`` page size in the URL template.
        yield scrapy.Request(
            self.start_answer_url.format(ans_json["startIndex"] + 15),
            headers=self.headers,
            callback=self.parse_url,
        )

settings.py

import os

# Pipelines enabled for the project, mapped to their order value.
ITEM_PIPELINES = {
    "tutorial.pipelines.TutorialPipeline": 300,
    "tutorial.pipelines.TutorialImagePipeline": 1,
}
# IMAGES_URLS_FIELD = "front_image_url"
# NOTE(review): the setting above is disabled; if the image pipeline relies on
# it, it presumably has to name the item field holding the URLs -- confirm.

# Directory containing this settings module.
project_dir = os.path.abspath(os.path.dirname(__file__))
# Downloaded images are stored in an "image" folder next to this module.
IMAGES_STORE = os.path.join(project_dir, "image")

items.py

class LofterSpiderItem(scrapy.Item):
    """Item with a single field, ``lofter_image_url``, for picture URLs."""

    # ``MapCompose`` and ``return_value`` are not defined in this snippet;
    # presumably ``return_value`` passes values through unchanged -- confirm
    # against the rest of items.py.
    lofter_image_url = scrapy.Field(output_processor=MapCompose(return_value))

用scrapy爬取搜狗Lofter圖片