1. 程式人生 > 爬取多個url頁面資料--手動實現

爬取多個url頁面資料--手動實現

# -*- coding: utf-8 -*-
import scrapy
from qiubaiByPages.items import QiubaibypagesItem

class QiubaiSpider(scrapy.Spider):
    """Crawl the paginated text section of qiushibaike.com.

    The first page is fetched through ``start_urls``. Each later page is
    requested manually from :meth:`parse` by filling the ``url`` template
    with the next page number, until ``max_page`` has been parsed.
    """

    name = 'qiubai'
    # NOTE: left disabled as in the original. If re-enabled, the entries must
    # be bare domain names (e.g. 'www.qiushibaike.com'), not URLs with a path.
    # allowed_domains = ['www.qiushibaike.com/text']
    start_urls = ['https://www.qiushibaike.com/text/']

    # Generic URL template; %d is replaced with the page number.
    url = 'https://www.qiushibaike.com/text/page/%d/'
    # Number of the page whose response is currently being parsed.
    pageNum = 1
    # Number of the last page to crawl.
    max_page = 13

    def parse(self, response):
        """Yield one item per post on the page, then request the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``QiubaibypagesItem`` objects with ``author`` and ``content``
            set, followed by at most one ``scrapy.Request`` for the next
            page (none once ``max_page`` has been parsed).
        """
        for div in response.xpath('//*[@id="content-left"]/div'):
            author = div.xpath(
                './div[@class="author clearfix"]/a[2]/h2/text()'
            ).extract_first()
            content = div.xpath(
                './/div[@class="content"]/span/text()'
            ).extract_first()

            # Store the parsed values in an item and hand it to the pipeline.
            item = QiubaibypagesItem()
            item['author'] = author
            item['content'] = content
            yield item

        print('爬取到了第%d頁的頁面資料'%self.pageNum)

        # Send the follow-up request manually. Only do so while the page just
        # parsed is before the last one; otherwise a request for the
        # non-existent page ``max_page + 1`` would be issued.
        if self.pageNum < self.max_page:
            self.pageNum += 1
            new_url = self.url % self.pageNum
            yield scrapy.Request(url=new_url, callback=self.parse)