
Python web scraping: using Scrapy's pipelines


Scrapy's pipeline is an important component: its job is to take the items returned (yielded) by a spider and persist them to a database, a file, or some other storage. Below is a quick look at how pipelines are used.
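Before the MongoDB case below, here is a minimal sketch of the pipeline interface itself (the class name and output file are illustrative, not part of the original project): every pipeline implements process_item(), and may optionally implement open_spider()/close_spider() for one-time setup and teardown.

import json

class JsonLinesPipeline(object):
    """Minimal pipeline sketch: append each item to a JSON-lines file."""

    def open_spider(self, spider):
        # Called once when the spider starts.
        self.file = open('items.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Called once per item yielded by the spider.
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item  # pass the item on to any later pipelines

    def close_spider(self, spider):
        # Called once when the spider finishes.
        self.file.close()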

Case 1:


Item definitions

import scrapy

class ZhihuuserItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    id = scrapy.Field()
    name = scrapy.Field()
    avatar_url = scrapy.Field()
    headline = scrapy.Field()
    description = scrapy.Field()
    url = scrapy.Field()
    url_token = scrapy.Field()
    gender = scrapy.Field()
    cover_url = scrapy.Field()
    type = scrapy.Field()
    badge = scrapy.Field()
    answer_count = scrapy.Field()
    articles_count = scrapy.Field()
    commercial_question = scrapy.Field()
    favorite_count = scrapy.Field()
    favorited_count = scrapy.Field()
    follower_count = scrapy.Field()
    following_columns_count = scrapy.Field()
    following_count = scrapy.Field()
    pins_count = scrapy.Field()
    question_count = scrapy.Field()
    thank_from_count = scrapy.Field()
    thank_to_count = scrapy.Field()
    thanked_count = scrapy.Field()
    vote_from_count = scrapy.Field()
    vote_to_count = scrapy.Field()
    voteup_count = scrapy.Field()
    following_favlists_count = scrapy.Field()
    following_question_count = scrapy.Field()
    following_topic_count = scrapy.Field()
    marked_answers_count = scrapy.Field()
    mutual_followees_count = scrapy.Field()
    participated_live_count = scrapy.Field()
    locations = scrapy.Field()
    educations = scrapy.Field()
    employments = scrapy.Field()
items.py
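For context, a spider might fill this item from a JSON API response roughly as in the hypothetical sketch below; the spider name, start URL, and response layout are placeholders and not part of the original project, only the item fields come from items.py above.

import json
import scrapy
from zhihuuser.items import ZhihuuserItem

class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    # Placeholder URL: the real project crawls Zhihu's user API.
    start_urls = ['https://www.zhihu.com/api/v4/members/example-user']

    def parse(self, response):
        data = json.loads(response.text)
        item = ZhihuuserItem()
        # Copy only the keys that are actually defined on the item.
        for field in item.fields:
            if field in data:
                item[field] = data[field]
        yield item  # the yielded item is handed to the enabled pipelines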

Basic settings for writing to MongoDB

# MongoDB connection settings
MONGO_URL = '172.16.5.239'
MONGO_PORT = 27017
MONGO_DB = 'zhihuuser'

# When set to False, Scrapy ignores each site's robots.txt
# (e.g. www.baidu.com/robots.txt) and crawls whatever you request.
ROBOTSTXT_OBEY = False


# Enable the pipeline so its write operations are executed
ITEM_PIPELINES = {
   'zhihuuser.pipelines.MongoDBPipeline': 300,
}
settings.py
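The value 300 is the pipeline's priority: valid values run from 0 to 1000 and lower numbers run earlier, so several pipelines can be chained. The second entry below is purely hypothetical, only there to illustrate the ordering.

ITEM_PIPELINES = {
   'zhihuuser.pipelines.MongoDBPipeline': 300,
   # A hypothetical second pipeline that would run after the MongoDB one:
   # 'zhihuuser.pipelines.DuplicatesPipeline': 400,
}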

pipelines.py:
  1. Read the database host, port, and database name from settings (the database is created automatically if it does not exist).
  2. Connect to the database with that information.
  3. Write the data into the database.
  4. Close the database connection.
  Note: opening and closing each happen only once, while the write operation runs once per item.
import pymongo

class MongoDBPipeline(object):
    """
    Connects to MongoDB and upserts each item.
    """
    def __init__(self, mongourl, mongoport, mongodb):
        '''
        Store the MongoDB host, port, and database name.
        :param mongourl:
        :param mongoport:
        :param mongodb:
        '''
        self.mongourl = mongourl
        self.mongoport = mongoport
        self.mongodb = mongodb

    @classmethod
    def from_crawler(cls, crawler):
        """
        Read MONGO_URL, MONGO_PORT and MONGO_DB from settings.
        :param crawler:
        :return:
        """
        return cls(
            mongourl = crawler.settings.get("MONGO_URL"),
            mongoport = crawler.settings.get("MONGO_PORT"),
            mongodb = crawler.settings.get("MONGO_DB")
        )

    def open_spider(self, spider):
        '''
        Connect to MongoDB (called once when the spider starts).
        :param spider:
        :return:
        '''
        self.client = pymongo.MongoClient(self.mongourl, self.mongoport)
        self.db = self.client[self.mongodb]

    def process_item(self, item, spider):
        '''
        Write the item into the database (called once per item).
        :param item:
        :param spider:
        :return:
        '''
        name = item.__class__.__name__
        # self.db[name].insert_one(dict(item))
        # Upsert on url_token so re-crawled users update the existing document.
        self.db['user'].update_one(
            {'url_token': item['url_token']},
            {'$set': dict(item)},
            upsert=True
        )
        return item

    def close_spider(self, spider):
        '''
        Close the database connection (called once when the spider finishes).
        :param spider:
        :return:
        '''
        self.client.close()
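Once everything is wired up, running the spider (e.g. scrapy crawl zhihu, assuming the spider is named zhihu) fills the user collection. A quick way to check the upserts from a Python shell, reusing the connection values from settings.py above:

import pymongo

client = pymongo.MongoClient('172.16.5.239', 27017)
db = client['zhihuuser']

# Number of distinct users stored so far (each url_token is upserted once).
print(db['user'].count_documents({}))

# Peek at one stored document.
print(db['user'].find_one({}, {'_id': 0, 'name': 1, 'url_token': 1}))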

  
