Python爬蟲 —— 抓取美女圖片(Scrapy篇)
阿新 • • 發佈:2018-06-30
parse color 爬蟲 select 尺度 dex -i www 模塊
雜談:
之前用 requests 模塊爬取了美女圖片，今天改用 Scrapy 框架重新實現了一遍。
（圖片尺度確實大了點，但老衲早已無戀紅塵，權當觀賞，哈哈哈）
Item:
# -*- coding: utf-8 -*-

# Models for the items scraped by this project.
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class GirlpicItem(scrapy.Item):
    """A single downloaded picture plus the data needed to file it on disk."""

    # Title of the gallery the picture belongs to.
    title = scrapy.Field()
    # Raw body of the image response.
    image = scrapy.Field()
    # Page number of the picture inside its gallery.
    index = scrapy.Field()
Spider:
# coding: utf-8
"""Scrapy spider that downloads the picture galleries listed on mzitu.com."""
import sys

import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spiders import Spider

from girlpic.items import GirlpicItem

if sys.version_info[0] == 2:
    # Python 2 only: make UTF-8 the default codec for implicit str/unicode
    # conversions. Neither name exists in this form on Python 3.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')


class GirlpicSipder(Spider):
    """Walk archive page -> gallery -> gallery page -> image and emit items.

    Each callback hands the gallery title (and later the page index) to the
    next one through ``Request.meta``; ``FormItem`` finally wraps the raw
    image bytes in a ``GirlpicItem``.
    """

    name = 'girlpic'
    allowed_domains = []  # allowed domains
    start_urls = ["http://www.mzitu.com/all/"]

    # Number of galleries taken from the archive page per run.
    max_groups = 5

    def parse(self, response):
        """Yield one request per gallery link found on the archive page.

        Only the first ``max_groups`` links are followed. Links without an
        ``href`` or without link text are skipped.
        """
        links = response.xpath(
            "//div[@class='main-content']//ul[@class='archives']//a")
        # Slicing ends the generator normally once the limit is reached;
        # never terminate the process (e.g. os.exit) from inside a callback.
        for link in links[:self.max_groups]:
            group_url = link.xpath('@href').extract_first()
            title = link.xpath('text()').extract_first()
            if not group_url or title is None:
                continue
            yield scrapy.Request(
                url=group_url,
                callback=self.getGroup,
                meta={'title': title, 'groupUrl': group_url},
                dont_filter=True,
            )

    def getGroup(self, response):
        """Yield one request per page of the gallery in ``response``.

        The page count is read from the second-to-last ``<span>`` of the
        ``pagenavi`` block (presumably the last one is the "next" link --
        confirm against the live markup). If it is missing or not a number
        the gallery is skipped with a warning.
        """
        labels = response.xpath(
            "//div[@class='pagenavi']//span/text()").extract()
        try:
            max_index = int(labels[-2])
        except (IndexError, ValueError):
            self.logger.warning('No page count found on %s', response.url)
            return

        # Avoid a double slash when the gallery URL already ends with one.
        base_url = response.meta['groupUrl'].rstrip('/')
        for index in range(1, max_index + 1):
            yield scrapy.Request(
                url=base_url + '/' + str(index),
                callback=self.getPage,
                # Fresh dict per request; response.meta itself stays untouched.
                meta=dict(response.meta, index=index),
                dont_filter=True,
            )

    def getPage(self, response):
        """Yield a request for the main image of a single gallery page."""
        image_url = response.xpath(
            "//div[@class='main-image']//img/@src").extract_first()
        if not image_url:
            self.logger.warning('No main image found on %s', response.url)
            return
        yield scrapy.Request(
            # urljoin leaves absolute URLs alone and resolves relative ones.
            url=response.urljoin(image_url),
            callback=self.FormItem,
            meta=response.meta,
            dont_filter=True,
        )

    def FormItem(self, response):
        """Wrap the downloaded image bytes and their metadata in an item."""
        yield GirlpicItem(
            title=response.meta['title'],
            index=response.meta['index'],
            image=response.body,
        )
Pipeline:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os
import codecs
import sys

if sys.version_info[0] == 2:
    # Python 2 only: make UTF-8 the default codec for implicit str/unicode
    # conversions. Neither name exists in this form on Python 3.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')


class GirlpicPipeline(object):
    """Write every item's image bytes to ``<dirpath>/<title>/<index>.jpg``."""

    # Characters that Windows does not allow in file or directory names.
    _FORBIDDEN_CHARS = u'\\/:*?"<>|'

    def __init__(self, dirpath=u'D:\\學習資料'):
        """Create the output root directory if it does not exist yet.

        Args:
            dirpath: Root directory that receives one sub-directory per
                gallery title. Defaults to the original hard-coded location.
        """
        self.dirpath = dirpath
        if not os.path.isdir(self.dirpath):
            os.makedirs(self.dirpath)

    @classmethod
    def _safe_name(cls, name):
        """Return ``name`` with forbidden path characters replaced by ``_``."""
        cleaned = u''.join(
            u'_' if ch in cls._FORBIDDEN_CHARS else ch for ch in name)
        return cleaned.strip()

    def process_item(self, item, spider):
        """Save ``item['image']`` to disk and pass the item on unchanged.

        Args:
            item: Mapping with ``title`` (gallery name), ``index`` (page
                number) and ``image`` (raw bytes of the picture).
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        # Titles come from scraped link text, so they are sanitised before
        # being used as a directory name.
        group_dir = os.path.join(self.dirpath, self._safe_name(item['title']))
        if not os.path.isdir(group_dir):
            os.makedirs(group_dir)

        image_path = os.path.join(group_dir, str(item['index']) + u'.jpg')
        # The context manager closes the handle even if the write fails.
        with open(image_path, 'wb') as handle:
            handle.write(item['image'])
        return item
Python爬蟲 —— 抓取美女圖片(Scrapy篇)