1. 程式人生 > >Python爬蟲 —— 抓取美女圖片(Scrapy篇)

Python爬蟲 —— 抓取美女圖片(Scrapy篇)

parse color 爬蟲 select 尺度 dex -i www 模塊

雜談:

之前用requests模塊爬取了美女圖片,今天用scrapy框架實現了一遍。

(圖片尺度確實大了點,但老衲早已無戀紅塵,權當觀賞哈哈哈)

Item:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy

class GirlpicItem(scrapy.Item):
    """One downloaded picture together with the gallery it belongs to.

    The spider fills the fields and the pipeline reads them back to decide
    where the picture is stored on disk.
    """

    # Gallery title (text of the gallery link on the archive page).
    title = scrapy.Field()
    # Raw body of the image response.
    image = scrapy.Field()
    # 1-based page number of the picture inside its gallery.
    index = scrapy.Field()

Spider:

#coding:utf-8
from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.selector import Selector
from girlpic.items import GirlpicItem
import scrapy
import sys
# Python 2 only: make implicit str <-> unicode conversions use UTF-8.
# ``reload(sys)`` is needed because ``site`` removes ``setdefaultencoding``
# from the ``sys`` module at start-up.  Python 3 already defaults to UTF-8
# and has no builtin ``reload``, so the hack is skipped there.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')

class GirlpicSipder(Spider):
    """Crawl the mzitu.com archive and yield one item per gallery picture.

    Callback chain: ``parse`` (archive page) -> ``getGroup`` (gallery page,
    discovers the page count) -> ``getPage`` (single picture page) ->
    ``FormItem`` (image response, builds the item).
    """

    name = 'girlpic'
    allowed_domains = []  # allowed domains (left empty)
    start_urls = ["http://www.mzitu.com/all/"]

    def parse(self, response):
        """Request the first five galleries linked from the archive page.

        The gallery title and URL travel with the request in ``meta`` so the
        later callbacks can use them.
        """
        groups = response.xpath(
            "//div[@class='main-content']//ul[@class='archives']//a")
        # Only the first five galleries are crawled.  Careful here: limit the
        # loop rather than calling os.exit(0) from inside the callback.
        for group in groups[:5]:
            groupUrl = group.xpath('@href').extract_first()
            title = group.xpath("text()").extract_first()
            if not groupUrl or title is None:
                # A link without href or text cannot be crawled or stored.
                continue
            yield scrapy.Request(
                url=groupUrl,
                callback=self.getGroup,
                meta={'title': title, 'groupUrl': groupUrl},
                dont_filter=True,
            )

    def getGroup(self, response):
        """Request every picture page ``<groupUrl>/<n>`` of one gallery.

        The page count is read from the second-to-last ``<span>`` of the
        ``pagenavi`` block.
        """
        labels = response.xpath(
            "//div[@class='pagenavi']//span/text()").extract()
        try:
            maxIndex = int(labels[-2])
        except (IndexError, ValueError):
            self.logger.warning("No page count found on %s", response.url)
            return
        for index in range(1, maxIndex + 1):
            pageUrl = response.meta['groupUrl'] + '/' + str(index)
            # Copy the incoming meta so each request carries its own index.
            meta = dict(response.meta)
            meta['index'] = index
            yield scrapy.Request(
                url=pageUrl,
                callback=self.getPage,
                meta=meta,
                dont_filter=True,
            )

    def getPage(self, response):
        """Request the image file shown on a single picture page."""
        # Get the image URL.
        imageurl = response.xpath(
            "//div[@class='main-image']//img/@src").extract_first()
        if not imageurl:
            self.logger.warning("No image found on %s", response.url)
            return
        yield scrapy.Request(
            url=imageurl,
            callback=self.FormItem,
            meta=response.meta,
            dont_filter=True,
        )

    def FormItem(self, response):
        """Build the item from the image response body and the carried meta."""
        title = response.meta['title']
        index = response.meta['index']
        image = response.body
        yield GirlpicItem(title=title, index=index, image=image)

PipeLine:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os
import codecs
import sys
# Python 2 only: make implicit str <-> unicode conversions use UTF-8.
# ``reload(sys)`` is needed because ``site`` removes ``setdefaultencoding``
# from the ``sys`` module at start-up.  Python 3 already defaults to UTF-8
# and has no builtin ``reload``, so the hack is skipped there.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')

class GirlpicPipeline(object):
    """Store every scraped picture as ``<dirpath>/<title>/<index>.jpg``."""

    def __init__(self, dirpath=u'D:\\學習資料'):
        """Remember the output root and create it when it does not exist.

        :param dirpath: root directory that receives one sub-directory per
            gallery title.  Defaults to the previously hard-coded location.
        """
        self.dirpath = dirpath
        if not os.path.exists(self.dirpath):
            os.makedirs(self.dirpath)

    def process_item(self, item, spider):
        """Write the item's image bytes to disk and pass the item on.

        :param item: mapping with the keys ``title``, ``index`` and ``image``.
        :param spider: the spider that produced the item (unused).
        :returns: the unchanged ``item``.
        """
        title = item['title']
        index = item['index']
        image = item['image']
        groupdir = os.path.join(self.dirpath, title)
        if not os.path.exists(groupdir):
            os.makedirs(groupdir)
        imagepath = os.path.join(groupdir, str(index) + u'.jpg')
        # The ``with`` block closes the handle even if the write fails.
        with open(imagepath, 'wb') as image_file:
            image_file.write(image)
        return item

Python爬蟲 —— 抓取美女圖片(Scrapy篇)