Scrapy爬取360圖片(image.so.com)
阿新 • • 發佈:2019-01-04
1.新建專案
# Create the project skeleton.
scrapy startproject images360
# genspider only places the spider inside the project's spiders package
# when it is run from within the project directory.
cd images360
# Use the host the spider actually requests (image.so.com) so the generated
# allowed_domains agrees with the URLs built in start_requests.
scrapy genspider images image.so.com
2.構造請求
在settings.py中定義要爬取的頁數
# Number of listing pages the spider requests (read in start_requests).
MAX_PAGE = 50
定義start_requests()方法,用來生成50次請求
def start_requests(self):
    """Yield one listing request per result page.

    The page count comes from the ``MAX_PAGE`` setting. It is read with
    ``getint`` so that a value supplied as a string (for example
    ``-s MAX_PAGE=10`` on the command line) still works; 50 is used when
    the setting is absent.

    Yields:
        Request: one request per page, handled by ``self.parse``.
    """
    base_url = 'http://image.so.com/zj?'
    max_page = self.settings.getint('MAX_PAGE', 50)
    for page in range(1, max_page + 1):
        # ``sn`` advances by 30 per page (presumably the result offset
        # expected by the listing endpoint -- confirm against the API).
        query = {'ch': 'photography', 'listtype': 'new', 'sn': page * 30}
        yield Request(base_url + urlencode(query), self.parse)
再修改settings.py中的ROBOTSTXT_OBEY,將其設定為False
3.提取資訊
定義ImageItem
from scrapy import Item, Field
class ImageItem(Item):
    """Container for one scraped image record."""

    # Storage target name. The MongoDB pipeline reads ``collection`` and
    # the MySQL pipeline reads ``table``; both use the same value.
    collection = 'images'
    table = collection

    # Filled by the spider's parse() from the JSON listing entry.
    id = Field()      # from 'imageid'
    url = Field()     # from 'qhimg_url'
    title = Field()   # from 'group_title'
    thumb = Field()   # from 'qhimg_thumb_url'
提取Spider的有關資訊,將parse()方法改為:
def parse(self, response):
    """Convert one JSON listing response into ``ImageItem`` objects.

    Args:
        response: object whose ``text`` attribute holds the JSON body.

    Yields:
        ImageItem: one item per entry of the payload's ``'list'`` array.
        Nothing is yielded when ``'list'`` is missing, null or empty.
    """
    result = json.loads(response.text)
    # A payload without a usable 'list' must not crash the callback with
    # "NoneType is not iterable"; treat it as a page with no images.
    for image in result.get('list') or []:
        item = ImageItem()
        item['id'] = image.get('imageid')
        item['url'] = image.get('qhimg_url')
        item['title'] = image.get('group_title')
        item['thumb'] = image.get('qhimg_thumb_url')
        yield item
4.Mongo
# MongoDB connection target and database name; both are read by
# MongoPipeline.from_crawler.
MONGO_URI = 'localhost'
MONGO_DB = 'images360'
import pymongo


class MongoPipeline(object):
    """Item pipeline that stores every item as a MongoDB document."""

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection target; nothing is opened yet."""
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MONGO_URI / MONGO_DB settings."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        """Open the client and select the database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert the item into the collection named by ``item.collection``.

        Returns:
            The unchanged item, so later pipelines still receive it.
        """
        # Collection.insert() was deprecated in PyMongo 3.0 and removed in
        # 4.0; insert_one() is the supported single-document call.
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the client opened in open_spider."""
        self.client.close()
5.Mysql
import os

# MySQL connection parameters; all are read by MysqlPipeline.from_crawler.
MYSQL_HOST = '127.0.0.1'
MYSQL_DATABASE = 'images360'
MYSQL_USER = 'root'
# Prefer the MYSQL_PASSWORD environment variable so the real credential does
# not have to live in the settings file; the literal is only the fallback
# that keeps the previous behaviour when the variable is unset.
MYSQL_PASSWORD = os.environ.get('MYSQL_PASSWORD', 'qaz123')
MYSQL_PORT = 3306
import pymysql
class MysqlPipeline:
    """Item pipeline that writes each item as one row into MySQL."""

    def __init__(self, host, database, user, password, port):
        """Remember the connection parameters; nothing is opened yet."""
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MYSQL_* settings."""
        settings = crawler.settings
        return cls(
            host=settings.get('MYSQL_HOST'),
            database=settings.get('MYSQL_DATABASE'),
            user=settings.get('MYSQL_USER'),
            password=settings.get('MYSQL_PASSWORD'),
            port=settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        """Open the connection and the cursor used for all inserts."""
        # Keyword arguments only: PyMySQL 1.0+ no longer accepts the
        # connection parameters positionally.
        self.db = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            charset='utf8',
            port=self.port,
        )
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        """Release the cursor, then the connection."""
        self.cursor.close()
        self.db.close()

    def process_item(self, item, spider):
        """Insert the item into the table named by ``item.table``.

        Column names come from the item's field names; the values are
        passed as query parameters rather than formatted into the SQL.

        Returns:
            The unchanged item, so later pipelines still receive it.

        Raises:
            Whatever the driver raises; the transaction is rolled back
            first so the connection stays usable for the next item.
        """
        data = dict(item)
        keys = ', '.join(data)
        values = ', '.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        try:
            self.cursor.execute(sql, tuple(data.values()))
            self.db.commit()
        except Exception:
            # Undo the partial transaction, then let the error propagate.
            self.db.rollback()
            raise
        return item
6.儲存圖片到本地
在settings.py中加入:
# Directory in which the images pipeline stores downloaded files.
IMAGES_STORE = './images'
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class ImagePipeline(ImagesPipeline):
    """Download each item's image and drop items whose download failed."""

    def get_media_requests(self, item, info):
        """Request the single image referenced by ``item['url']``."""
        yield Request(item['url'])

    def file_path(self, request, response=None, info=None, *, item=None):
        """Name the stored file after the last path segment of the URL.

        ``item`` is keyword-only with a default: newer Scrapy releases
        pass it to this hook, while older ones simply omit it, so the
        override works with both.
        """
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        """Pass the item on only if at least one download succeeded.

        Raises:
            DropItem: when no entry in ``results`` is marked successful.
        """
        # ``results`` is a sequence of (success, info) pairs.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item
最後在settings.py中啟用上述三個Pipeline:
# Enabled pipelines. Items pass through them in ascending order of the
# number, so the image download (which may drop an item) runs before the
# two storage pipelines.
ITEM_PIPELINES = {
    'images360.pipelines.ImagePipeline': 300,
    'images360.pipelines.MongoPipeline': 301,
    'images360.pipelines.MysqlPipeline': 302,
}