
Scrapy case study: crawling the 翼蜂網絡 (cnyifeng.net) news list and detail pages


# -*- coding: utf-8 -*-
import scrapy
from Demo.items import DemoItem


class AbcSpider(scrapy.Spider):
    name = "abcd"
    allowed_domains = ["www.cnyifeng.net"]
    # start_urls = ['http://abc.com/']

    # Build the paginated news-list URL from a page offset
    baseURL = "http://www.cnyifeng.net/news/1/{}.html"

    offset = 1

    start_urls = [baseURL.format(offset)]

    
    def parse(self, response):
        node_list = response.xpath("//div[@class='news_con']/dl[@class='news_dl']")
        for node in node_list:
            item = DemoItem()
            # Title and summary may be missing, so guard each extract()
            if len(node.xpath(".//a[@class='dt_1']//text()")):
                item["title"] = node.xpath(".//a[@class='dt_1']//text()").extract()[0]
            else:
                item["title"] = ""
            if len(node.xpath("./dd//text()")):
                item["zhaiyao"] = node.xpath("./dd//text()").extract()[0]
            else:
                item["zhaiyao"] = ""
            item["times"] = node.xpath(".//span//text()").extract()[0]

            mainUrl = "http://www.cnyifeng.net"
            erUrl = mainUrl + node.xpath(".//a[@class='dt_1']/@href").extract()[0]
            # Pass the partially filled item on to the detail-page callback via meta
            yield scrapy.Request(erUrl, callback=self.parse_detail_info, meta={"item": item})

        # Pagination: keep following the last pager link until the "next page" label is disabled
        if len(response.xpath("//div[@class='flickr']//span[@class='disabled']")) == 0:
            url = response.xpath("//div[@class='flickr']/a[last()]/@href").extract()[0]
            yield scrapy.Request("http://www.cnyifeng.net" + url, callback=self.parse)
        else:
            ToNext = response.xpath("//div[@class='flickr']//span[@class='disabled']//text()").extract()[0]
            if ToNext != u"下一頁?":  # not the disabled "next page" label, so keep paginating
                url = response.xpath("//div[@class='flickr']/a[last()]/@href").extract()[0]
                yield scrapy.Request("http://www.cnyifeng.net" + url, callback=self.parse)

    def parse_detail_info(self, response):
        item = response.meta["item"]  # receive the item started on the list page
        item["viewcount"] = 90  # placeholder view count
        if len(response.xpath("//div[@id='left']/div[@class='content_arc']/span/text()")):
            content_list = response.xpath("//div[@id='left']/div[@class='content_arc']/span/text()").extract()
            content_str = ""
            for model in content_list:
                content_str = content_str + model.strip()
            item["content"] = content_str
        yield item
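The spider imports DemoItem from Demo.items, but the original post does not show that file. Below is a minimal sketch of what Demo/items.py would need to contain, assuming only the five fields the spider assigns (title, zhaiyao, times, viewcount, content):

# Demo/items.py -- minimal sketch inferred from the spider above; not shown in the original post
import scrapy


class DemoItem(scrapy.Item):
    title = scrapy.Field()      # news headline
    zhaiyao = scrapy.Field()    # summary text
    times = scrapy.Field()      # publication date string
    viewcount = scrapy.Field()  # view count (hard-coded to 90 in the spider)
    content = scrapy.Field()    # full article body

With the project in place, the spider is started with the standard Scrapy command, for example scrapy crawl abcd -o news.json, which runs the abcd spider and exports the yielded items to a JSON file.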
