
Scrapy crawler series, part 4: crawling list and detail pages


Feature point: how do you crawl a list page, and then fetch the detail-page information for each entry on it? (A stripped-down sketch of the pattern follows the links below.)

Target site: 東莞陽光政務網 (the Dongguan "Sunshine" government affairs portal)

Full code: https://files.cnblogs.com/files/bookwed/yangguang.zip
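The core idea: parse() pulls the title and link out of each row of the list page, then yields a Request for that link with callback=parse_detail, carrying the half-filled item along in meta; parse_detail() fills in the remaining fields and yields the finished item. Here is a stripped-down sketch of just that pattern; the spider name, URL and XPath strings are placeholders, not the real ones used for 東莞陽光政務網 (those are in yg.py below).

import scrapy


class ListDetailSpider(scrapy.Spider):
    # Hypothetical minimal spider, only to illustrate the list -> detail flow
    name = "list_detail_demo"
    start_urls = ["http://example.com/list"]  # placeholder URL

    def parse(self, response):
        for row in response.xpath("//table//tr"):  # placeholder selector
            item = {"title": row.xpath("./td[1]/a/text()").extract_first()}
            href = row.xpath("./td[1]/a/@href").extract_first()
            if href:
                # pass the partially built item to the detail callback via meta
                yield scrapy.Request(href, callback=self.parse_detail, meta={"item": item})

    def parse_detail(self, response):
        item = response.meta["item"]
        item["content"] = response.xpath("//div[@id='content']//text()").extract()  # placeholder
        yield item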

Main code:

yg.py

import scrapy
from yangguang.items import YangguangItem


class YgSpider(scrapy.Spider):
    name = "yg"
    allowed_domains = ["sun0769.com"]
    start_urls = ["http://wz.sun0769.com/index.php/question/report"]

    def parse(self, response):
        tr_list = response.xpath("//div[@class='greyframe']/table[2]//tr")
        for tr in tr_list:
            item = YangguangItem()
            item["title"] = tr.xpath("./td[2]/a[2]/text()").extract_first()
            item["href"] = tr.xpath("./td[2]/a[2]/@href").extract_first()
            item["status"] = tr.xpath("./td[3]/span/text()").extract_first()
            item["publish_time"] = tr.xpath("./td[last()]/text()").extract_first()
            if type(item["href"]) == str:
                # Request the detail page, passing the half-filled item along via meta
                yield scrapy.Request(
                    item["href"],
                    callback=self.parse_detail,
                    meta={"item": item}
                )
        # Pagination: follow the "next page" link
        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(next_url, callback=self.parse)

    # Parse the detail page
    def parse_detail(self, response):
        item = response.meta["item"]
        # Extract the detail page's text content and images
        item["content"] = response.xpath("//div[@class='wzy1']/table[2]//tr[1]/td[@class='txt16_3']/text()").extract()
        item["content_image"] = response.xpath("//div[@class='wzy1']/table[2]//tr[1]/td[@class='txt16_3']//img/@src").extract()
        item["content_image"] = ["http://wz.sun0769.com" + i for i in item["content_image"]]
        yield item  # the returned data is then processed by the pipeline

pipelines.py

import json
import re


class YangguangPipeline(object):
    def __init__(self):
        self.f = open("yangguang.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        item["content"] = self.process_content(item["content"])
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + ",\n")
        return item

    def process_content(self, content):
        # Replace \xa0 and other whitespace characters in each content fragment
        content = [re.sub(r"\xa0|\s", "", i) for i in content]
        # Drop the fragments that became empty after the replacement
        content = [i for i in content if len(i) > 0]
        return content
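For the pipeline to run it must be registered in the project's settings.py. Assuming the default project layout implied by the imports (project name yangguang), the registration looks like this; and since the pipeline opens a file in __init__, adding Scrapy's close_spider hook is a reasonable way to close it when the spider finishes (neither snippet is shown in the original post).

# settings.py -- enable the pipeline (the number is its order; lower runs first)
ITEM_PIPELINES = {
    "yangguang.pipelines.YangguangPipeline": 300,
}

# optional addition to YangguangPipeline in pipelines.py
    def close_spider(self, spider):
        # called once when the spider closes; release the JSON file handle
        self.f.close()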
