1. 程式人生 > >爬取陽光問政平臺

爬取陽光問政平臺

鏈接 import ML ont con spa sta http type

創建項目

scrapy startproject dongguan

items.py

import scrapy


class DongguanItem(scrapy.Item):
    """Container for one scraped post: its title, body text, URL and number."""

    # Declared fields; each is a plain scrapy.Field with no extra metadata.
    title = scrapy.Field()    # post title text
    content = scrapy.Field()  # post body text
    url = scrapy.Field()      # URL of the page the post was scraped from
    number = scrapy.Field()   # post number (the spider derives it from the title)

創建CrawlSpider,使用模版crawl

scrapy genspider -t crawl sun wz.sun0769.com

sun.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dongguan.items import DongguanItem

class SunSpider(CrawlSpider):
    """First version of the spider: walk the listing pages and scrape each post.

    Listing pages are matched by ``type=4&page=<n>`` and are only followed;
    post pages (``/html/question/<id>/<id>.shtml``) are handed to
    :meth:`parse_item`.
    """

    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    rules = (
        # Listing pages: no callback, so the links are just followed.
        Rule(LinkExtractor(allow=r'type=4&page=\d+')),
        # Post pages: parsed into items.
        Rule(LinkExtractor(allow=r'/html/question/\d+/\d+.shtml'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Build one ``DongguanItem`` from a post page and yield it.

        Raises:
            IndexError: if the title or content XPath matches nothing.
        """
        item = DongguanItem()
        item['title'] = response.xpath(
            '//div[contains(@class, "pagecenter p3")]//strong/text()'
        ).extract()[0]
        # Number: last space-separated token of the title, text after the last ":".
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # Content
        item['content'] = response.xpath('//div[@class="c1 text14_2"]/text()').extract()[0]
        # Link
        item['url'] = response.url
        yield item

pipelines.py

import json

class DongguanPipeline(object):
    """Item pipeline that appends every item to ``dongguan.json`` as JSON text.

    Each item is written as one JSON object followed by ``",\\n"``.
    """

    def __init__(self):
        # NOTE: despite its name, ``self.filename`` holds the open file object.
        # Text mode with an explicit UTF-8 encoding: the JSON is written with
        # ensure_ascii=False, so non-ASCII characters reach the file verbatim
        # and must not depend on the platform's default encoding.
        self.filename = open("dongguan.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize ``item`` to JSON, append it to the file, and return the item.

        Args:
            item: the scraped item; anything ``dict()`` accepts.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        # Python 3: write ``str`` directly; the file object does the encoding.
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.filename.close()
settings.py
# Name of the bot / Scrapy project.
BOT_NAME = 'dongguan'

# Where Scrapy looks for spiders, and where `genspider` creates new ones.
SPIDER_MODULES = ['dongguan.spiders']
NEWSPIDER_MODULE = 'dongguan.spiders'

# Respect the target site's robots.txt.
ROBOTSTXT_OBEY = True

# Enabled item pipelines, keyed by dotted import path; the value is the order.
ITEM_PIPELINES = {
    'dongguan.pipelines.DongguanPipeline': 300,
}

# Write the crawl log to a file at DEBUG verbosity.
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"

 

執行

scrapy crawl sun

發現爬取內容有缺失

問題分析:

通過 print(response.url)分析:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dongguan.items import DongguanItem

class SunSpider(CrawlSpider):
    """Debugging variant of the spider: print the URL of every listing page.

    The post-page rule and the item extraction are disabled so that only the
    listing-page responses reach :meth:`parse_item`.
    """

    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    rules = (
        Rule(LinkExtractor(allow=r'type=4&page=\d+'), callback='parse_item'),
        # Rule(LinkExtractor(allow=r'/html/question/\d+/\d+.shtml'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Print the URL of the response that was actually received."""
        print(response.url)
        # Item extraction, disabled while debugging the listing-page URLs:
        #
        # item = DongguanItem()
        #
        # item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
        # # Number
        # item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # # Content
        # item['content'] = response.xpath('//div[@class="c1 text14_2"]/text()').extract()[0]
        # # Link
        # item['url'] = response.url
        #
        # yield item

技術分享圖片

更改匹配規則:

    rules = (
        # Match any link containing "type=4"; no page number is required.
        Rule(LinkExtractor(allow=r'type=4'), callback='parse_item'),
    )

技術分享圖片

設置

follow=True

修改sun.py

技術分享圖片

響應內容不一定是發送的url,後面的URL無效。

改寫sun.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from newdongguan.items import NewdongguanItem

class DongdongSpider(CrawlSpider):
    """Crawl the listing pages, repair their rewritten URLs, and scrape each post."""

    name = 'dongdong'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Match rule for every listing page.
    pagelink = LinkExtractor(allow=r"type=4")
    # Match rule for every post linked from a listing page
    # (the dot before "shtml" is escaped so it only matches a literal ".").
    contentlink = LinkExtractor(allow=r"/html/question/\d+/\d+\.shtml")

    rules = (
        # In this case the web server tampers with the URLs, so process_links
        # is used to repair the extracted links before they are requested.
        Rule(pagelink, process_links="deal_links"),
        Rule(contentlink, callback="parse_item"),
    )

    def deal_links(self, links):
        """Repair the listing-page links extracted from the current response.

        Every "?" becomes "&", then the "Type&" that results is turned back
        into "Type?" so the query string starts right after "questionType".

        Args:
            links: list of link objects extracted from the current response.

        Returns:
            The same list, with each link's ``url`` rewritten in place.
        """
        for each in links:
            each.url = each.url.replace("?", "&").replace("Type&", "Type?")
        return links

    def parse_item(self, response):
        """Build one ``NewdongguanItem`` from a post page and yield it.

        Raises:
            IndexError: if the title XPath matches nothing.
        """
        item = NewdongguanItem()
        # Title
        item['title'] = response.xpath(
            '//div[contains(@class, "pagecenter p3")]//strong/text()'
        ).extract()[0]
        # Number: last space-separated token of the title, text after the last ":".
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # Content: try the layout used by posts with images first; it yields
        # the list of all text fragments when it matches.
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        if not content:
            # Empty list: fall back to the layout used by posts without images.
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
        item['content'] = "".join(content).strip()
        # Link
        item['url'] = response.url

        yield item

爬取陽光問政平臺