
Scraping Zhilian Zhaopin (智聯招聘) with Scrapy

 

Preparation

  1. scrapy startproject Jobs  # create the Scrapy project
  2. cd Jobs
  3. scrapy genspider ZhaopinSpider www.zhaopin.com  # generate the spider skeleton
  4. scrapy crawl ZhaopinSpider  # run the spider
  5. pip install diskcache
  6. pip install tinydb  # used to cache the city list; see the sketch after this list
  7. scrapy crawl ZhaopinSpider -o chongqing.json  # run again and export the scraped items to JSON

The spider below also imports furl, so install it as well (pip install furl) if it is not already available.

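Step 6 installs TinyDB, which the spider uses as a small on-disk cache for the city list scraped from the citymap page, so that list only has to be fetched once. Here is a minimal standalone sketch of that caching pattern; the file name matches the spider below, but the inserted record is only a placeholder, not a real city code:

from tinydb import TinyDB, Query

db = TinyDB('ZhaopinSpider-cache.json')        # the JSON file is created on first use
if not db.all():                               # cache is still empty -> populate it
    db.insert({'name': '重慶', 'code': '000'})  # placeholder record; the real data comes from the citymap page

Q = Query()
city = db.get(Q.name.search('重慶'))           # regex search on the 'name' field
print(city['code'] if city else 'not cached yet')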
 

ZhaopinSpider

 

# -*- coding: utf-8 -*-
import os
import json

from tinydb import TinyDB, Query
from furl import furl

import scrapy


class ZhaopinspiderSpider(scrapy.Spider):
    name = 'ZhaopinSpider'
    allowed_domains = ['www.zhaopin.com', 'sou.zhaopin.com', 'fe-api.zhaopin.com']
    start_urls = ['https://www.zhaopin.com/citymap']
    cache_db = TinyDB('ZhaopinSpider-cache.json')  # cache database for city info
    # Cities to crawl (the remaining ones are commented out for now)
    allowed_cities = ['重慶', ]  # '成都', '上海', '深圳', '昆明', '杭州', '貴陽', '寧波'
    F = furl('https://fe-api.zhaopin.com/c/i/sou?pageSize=90&kt=3')  # URL template for the search API
    PAGE_SIZE = 90  # page size of the search API

    def get_city_code(self, city_name):
        '''Look up the city code for the given city name from the cache.'''
        Q = Query()
        city = self.cache_db.get(Q.name.search(city_name))
        if isinstance(city, dict):
            return city['code']
        else:
            print('@' * 100)
            print(type(city))

    def init_city_info(self, response):
        '''Populate the city-info cache from the citymap page.'''
        # Grab the <script> element that defines __INITIAL_STATE__
        script_text = response.xpath('//script[text()[contains(., "__INITIAL_STATE__")]]/text()').extract_first()
        # Strip leading/trailing whitespace
        script_text = script_text.strip()
        # Keep only the JSON part (everything after the first '=')
        script_json = script_text[script_text.index('=') + 1:]
        # Parse the JSON string into a dict
        script_dict = json.loads(script_json)
        '''
        # Dump the JSON to a file, handy for debugging
        with open('text.json', 'wt', encoding='utf-8') as f:
            json.dump(script_dict, f, indent=4, ensure_ascii=False)
        '''
        '''
        city_list = []  # flat list of cities
        # Flatten the cities in the dict into one list so they are easier to search
        for ch in script_dict['cityList']['cityMapList']:
            city_list.extend(script_dict['cityList']['cityMapList'][ch])
        # Pick out 重慶 and read its city code
        city_code = (list(filter(lambda city: city['name'] == '重慶', city_list)) or [{'code': None}])[0]['code']
        '''
        for ch in script_dict['cityList']['cityMapList']:
            for city in script_dict['cityList']['cityMapList'][ch]:
                self.cache_db.insert(city)

    def parse(self, response):
        # if not os.path.exists('ZhaopinSpider-cache.json'):
        if not bool(self.cache_db.all()):
            self.init_city_info(response)
        # Iterate over every city we want to crawl
        for city_name in self.allowed_cities:
            # Kick off the first request for this city
            # import ipdb; ipdb.set_trace()
            yield self.request_city(city_name)

    def request_city(self, city_name, page_start=0):
        '''Build the request object for crawling one specific city.'''
        city_code = self.get_city_code(city_name)
        url_data = {
            'cityId': city_code,
            'kw': 'python',
            'start': page_start
        }
        # URL of the page to crawl
        url = self.F.copy().add(url_data).url
        # import ipdb; ipdb.set_trace()
        req = scrapy.Request(url, callback=self.parse_city, dont_filter=False)
        # Pass extra data via meta; the callback can read it from response.meta
        req.meta['city_name'] = city_name
        req.meta['page_start'] = page_start
        return req

    def parse_city(self, response):
        '''Parse one page of search results.'''
        # Parse the JSON response
        resp_dict = json.loads(response.body_as_unicode())
        # Total number of records available
        num_found = resp_dict['data']['numFound']
        # page_start of the current request
        page_start = response.meta['page_start']
        # start parameter for the next request
        next_start = page_start + self.PAGE_SIZE
        # import ipdb; ipdb.set_trace()
        # Is there a next page?
        if next_start < num_found:
            # City name of the current request
            city_name = response.meta['city_name']
            # Request the next page
            yield self.request_city(city_name, page_start=next_start)
        # Parse the data
        for item in resp_dict['data']['results']:
            # TODO: keep only the fields we actually need
            item['spiderName'] = self.name
            # Yield each record
            yield item
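The TODO in parse_city is left open: each element of resp_dict['data']['results'] is a full job-posting dict, and usually only a few fields are worth keeping. Below is a minimal sketch of that filtering step, written as an extra method on ZhaopinspiderSpider; the key names (jobName, salary, company, updateDate) are assumptions about the fe-api payload and should be checked against a real response (for example the chongqing.json export) before relying on them.

    def parse_job(self, raw, city_name):
        '''Reduce one raw result dict to the fields we care about.
        NOTE: the key names below are assumed; verify them against the actual JSON.'''
        return {
            'spiderName': self.name,
            'city': city_name,
            'jobName': raw.get('jobName'),
            'salary': raw.get('salary'),
            'company': (raw.get('company') or {}).get('name'),
            'updateDate': raw.get('updateDate'),
        }

With this helper in place, the loop at the end of parse_city would yield self.parse_job(item, response.meta['city_name']) instead of the raw item.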