1. 程式人生 > >爬蟲之Scrapy遞迴爬取網頁資訊

爬蟲之Scrapy遞迴爬取網頁資訊

# -*- coding: utf-8 -*-
import re

import scrapy

from zhipin.items import ZhipinItem


class BossZhipinSpider(scrapy.Spider):
    """Crawl Python job listings from zhipin.com search result pages.

    The spider starts at page ``offset`` of the search URL and, after
    parsing each page, requests the next one until ``max_page`` has been
    requested. Each job found on a page is emitted as a ``ZhipinItem``.
    """

    name = 'boss_zhipin'
    # Scrapy expects bare host names here, not URLs. An entry that includes
    # the scheme ('https://...') never matches a request's host, so the
    # offsite middleware drops every follow-up request.
    allowed_domains = ['www.zhipin.com']
    url = 'https://www.zhipin.com/c101020100/h_101020100/?query=python&page=%s'
    offset = 1
    # Last page number to request; pages run from ?page=1 to ?page=10.
    max_page = 10
    start_urls = [url % offset]

    def parse(self, response):
        """Yield one ``ZhipinItem`` per job on the page, then the next page.

        Args:
            response: The Scrapy response for one search result page.

        Yields:
            ``ZhipinItem`` objects with ``job_title``, ``job_salary``,
            ``job_company``, ``company_size`` and ``publish_date`` set,
            followed by at most one ``scrapy.Request`` for the next page.
        """
        for response_part in response.css('#main > div > div.job-list > ul').extract():
            company_info = re.findall(
                r'<em class="vline"></em>(.*?)</p>', response_part)

            # Only every second match (odd 0-based index) is used as a
            # company-size entry; the matches in between are skipped.
            company_size = []
            for entry in company_info[1::2]:
                if '<em class="vline"></em>' in entry:
                    # Keep only the text after the last closing </em> tag.
                    company_size.append(entry.rsplit('</em>')[-1])
                else:
                    # No separator tag inside: use the match as-is.
                    company_size.append(entry)

            # NOTE(review): the page is presumably Simplified Chinese, so
            # the '釋出於' literal may be an artefact of text conversion --
            # confirm against the live markup before relying on it.
            result = zip(
                re.findall(r'title">(.*?)</div>', response_part),
                re.findall(r'<span class="red">(.*?)</span>', response_part),
                re.findall(
                    r'ka="search_list_company_\d+_custompage" target="_blank">(.*?)</a>',
                    response_part),
                company_size,
                re.findall(r'釋出於(.*?)</p>', response_part),
            )

            for title, salary, company, size, published in result:
                # Build a fresh item per job so that items already yielded
                # are not overwritten by later rows.
                item = ZhipinItem()
                item['job_title'] = title
                item['job_salary'] = salary
                item['job_company'] = company
                item['company_size'] = size
                item['publish_date'] = published
                yield item

        # Pagination is scheduled once per parse() call, outside the listing
        # loop, and only while the page limit has not been reached; this
        # condition is what terminates the recursive crawl.
        if self.offset < self.max_page:
            self.offset += 1
            # dont_filter=True is retained from the original code; it makes
            # this request bypass Scrapy's request filtering.
            yield scrapy.Request(self.url % self.offset,
                                 callback=self.parse,
                                 dont_filter=True)

在遞迴爬取過程中,日誌出現以下訊息(等級為 DEBUG,表示後續請求被 offsite 中介軟體過濾掉,並未真正發出):

2018-07-24 16:38:09 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.zhipin.com':

問題出現原因:

allowed_domains 只能填寫不含協定的域名(例如 'www.zhipin.com')。若寫成帶協定的 URL(例如 'https://www.zhipin.com'),Request 中請求的 URL 的主機名就無法與它匹配,Scrapy 會把這個請求視為站外請求而過濾掉,導致無法請求。

問題解決方案:在 Request 中加上引數 dont_filter=True,讓該請求略過過濾(更根本的做法是把 allowed_domains 改成不含協定的純域名):

yield scrapy.Request(self.url % self.offset, callback=self.parse, dont_filter=True)

遞迴無法停止?請檢查終止條件(self.offset < 10)以及程式碼的縮排:確認 self.offset += 1 與 yield scrapy.Request(...) 都位於該 if 區塊之內,這樣到達最後一頁後就不會再發出新的請求。