1. 程式人生 > python scrapy 基本操作演示程式碼

python scrapy 基本操作演示程式碼


# -*- coding: utf-8 -*-
import scrapy
# from quotetutorial.items import QuoteItem
from quotetutorial.items import QuotetutorialItem

# Most of the project's scraping logic is written in this spider module.

class QuotesSpider(scrapy.Spider):
    """Spider that collects quotes from quotes.toscrape.com page by page.

    Each quote block on a page is turned into a ``QuotetutorialItem`` with
    ``text``, ``author`` and ``tags`` fields; the pager's "next" link is then
    followed and handled by the same ``parse`` callback.
    """

    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Yield one item per quote on the page, then a request for the next page.

        Args:
            response: The downloaded page to extract quotes from.

        Yields:
            ``QuotetutorialItem`` objects for every ``.quote`` element found,
            followed by a ``scrapy.Request`` for the next page when the pager
            contains a "next" link.
        """
        for quote in response.css('.col-md-8 .quote'):
            item = QuotetutorialItem()
            # extract_first() returns a single value (or None), like "find one".
            item['text'] = quote.css('.text::text').extract_first()
            item['author'] = quote.css('.author::text').extract_first()
            # extract() returns every match as a list, like "find all".
            item['tags'] = quote.css('.tags .tag::text').extract()
            yield item

        # Select the href of the pager's "next" link.
        next_href = response.css('.pager .next a::attr(href)').extract_first()
        if next_href is not None:
            # The href is relative, so resolve it against the current page URL.
            next_url = response.urljoin(next_href)
            # Pass the method itself as the callback; calling it here would
            # raise because no response is available yet.
            yield scrapy.Request(url=next_url, callback=self.parse)
# Saving the scraped items to a file (the output format follows the file extension):
# scrapy crawl quotes -o quotes.json
# scrapy crawl quotes -o quotes.jl
# scrapy crawl quotes -o quotes.csv
# scrapy crawl quotes -o quotes.xml
# scrapy crawl quotes -o ftp://user:pass@ftp.example.com/path/quotes.csv