
Learning the Scrapy Framework

Overview

This exercise is mainly about practicing how to define multiple Rule objects in the rules attribute of the CrawlSpider class, and about getting a feel for how powerful and flexible the Scrapy framework is.

Accordingly, the scraped content is simply saved to a JSON file; no further processing is done.
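
The mechanism being exercised works like this: a CrawlSpider runs every response through its rules; each Rule's link extractor pulls matching links out of the page, follow decides whether those links are scheduled for further crawling, and callback names the method that parses the resulting responses. Here is a minimal sketch of the pattern, with hypothetical spider and URL names that are not part of the project below:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class SketchSpider(CrawlSpider):
    # Hypothetical spider, only to illustrate how rules are dispatched
    name = 'sketch'
    start_urls = ['https://example.com/list/1/']

    rules = [
        # Pagination links: keep following them, no callback needed
        Rule(LinkExtractor(allow=r'list/\d+'), follow=True),
        # Detail links: hand responses to parse_item, stop following
        Rule(LinkExtractor(allow=r'item/\d+'), callback='parse_item', follow=False),
    ]

    def parse_item(self, response):
        yield {'url': response.url}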

Source Code

items.py

import scrapy


class CnblogNewsItem(scrapy.Item):
    # News title
    title = scrapy.Field()
    # Submitter of the news
    postor = scrapy.Field()
    # Publication time
    pubtime = scrapy.Field()
    # News content
    content = scrapy.Field()
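
Since the pipeline below serializes each item with dict(item), it is worth remembering that a scrapy.Item behaves like a dict keyed by its declared fields; assigning to an undeclared key raises a KeyError. A quick interactive check, with made-up values:

>>> from myscrapy.items import CnblogNewsItem
>>> item = CnblogNewsItem(title='Example headline')
>>> item['postor'] = 'someone'
>>> dict(item)
{'title': 'Example headline', 'postor': 'someone'}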

spiders/cnblognews_spider.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from myscrapy.items import CnblogNewsItem


class CnblogNewsSpider(CrawlSpider):
    """
    Cnblogs news spider:
        crawls the links on the news list pages
        crawls the detail page of each news item
    """

    name = 'cnblognews'
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['https://news.cnblogs.com/n/page/1/']

    # LinkExtractor for the list pages, matched by a regular expression
    page_link_extractor = LinkExtractor(allow=(r'page/\d+',))
    # LinkExtractor for individual news items, matched by an XPath rule
    detail_link_extractor = LinkExtractor(restrict_xpaths=('//h2[@class="news_entry"]',))

    rules = [
        # Rule for list pages: follow=True, keep following extracted links
        Rule(link_extractor=page_link_extractor, follow=True),
        # Rule for detail pages: follow=False, do not follow further
        Rule(link_extractor=detail_link_extractor, callback='parse_detail', follow=False),
    ]

    def parse_detail(self, response):
        """Callback that parses a news detail page"""

        title = response.xpath('//div[@id="news_title"]/a/text()')[0].extract()
        postor = response.xpath('//span[@class="news_poster"]/a/text()')[0].extract()
        pubtime = response.xpath('//span[@class="time"]/text()')[0].extract()
        content = response.xpath('//div[@id="news_body"]/p/text()').extract()

        item = CnblogNewsItem()
        item['title'] = title
        item['postor'] = postor
        item['pubtime'] = pubtime
        item['content'] = content

        yield item
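
If the XPath expressions in parse_detail ever stop matching (the site's markup does change over time), scrapy shell is a convenient way to test them interactively; the URL below is a placeholder, substitute any real detail page:

scrapy shell 'https://news.cnblogs.com/n/123456/'
>>> response.xpath('//div[@id="news_title"]/a/text()')[0].extract()

Note that indexing with [0] raises an IndexError when nothing matches; extract_first() is a safer alternative that returns None instead.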

pipelines.py

import json


class CnblognewsPipeline(object):
    """Item pipeline for Cnblogs news"""

    def __init__(self):
        self.f = open('cnblognews.json', mode='w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese text readable in the output file
        news = json.dumps(dict(item), ensure_ascii=False, indent=4)
        self.f.write(news + ',\n')
        return item

    def close_spider(self, spider):
        self.f.close()
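
One caveat about this pipeline: the file it produces is not strictly valid JSON, since every object is followed by a trailing comma and there is no enclosing array. That is fine for an exercise; when valid output matters, Scrapy's bundled JsonLinesItemExporter handles the formatting, as in this sketch (the class name here is made up):

from scrapy.exporters import JsonLinesItemExporter


class CnblognewsJsonLinesPipeline(object):
    """Alternative pipeline: writes one valid JSON object per line"""

    def open_spider(self, spider):
        # The exporter expects a file opened in binary mode
        self.f = open('cnblognews.jl', 'wb')
        self.exporter = JsonLinesItemExporter(self.f, ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.f.close()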

settings.py

ITEM_PIPELINES = {
   'myscrapy.pipelines.CnblognewsPipeline': 1,
}
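
With the pipeline registered, the crawl is started from the project root as usual (assuming the project is named myscrapy, as the import paths above suggest):

scrapy crawl cnblognews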

Run Results