1. 程式人生 > scrapy 抓取拉鉤 ajax

scrapy 抓取拉鉤 ajax

# -*- coding: utf-8 -*-
import scrapy
from LagouSpider.items import LagouspiderItem
import json


class LagouSpider(scrapy.Spider):
    """Spider that queries Lagou's position-search AJAX endpoint.

    A single form request is sent to ``url`` asking for page ``page`` of the
    keyword ``python`` in the city ``北京``. Every entry in the JSON result
    list is turned into one ``LagouspiderItem`` carrying ``city`` and
    ``money``.
    """

    name = 'lagou'
    # allowed_domains = ['lagou.com']
    url = 'https://www.lagou.com/jobs/positionAjax.json?'
    # Page number submitted as the ``pn`` form field.
    page = 2
    # Not read anywhere in this class; presumably reserved for a total page
    # count -- TODO confirm before relying on it.
    allpage = 0

    def start_requests(self):
        """Yield the initial form request for the configured page."""
        # Passing ``formdata`` makes FormRequest submit the fields as a
        # URL-encoded form body (POST by default per the Scrapy docs).
        yield scrapy.FormRequest(self.url, formdata={
            'first': 'false',
            'pn': str(self.page),
            'kd': 'python',
            'city': '北京'
        }, callback=self.parse)

    def parse(self, response):
        """Yield one item per job posting found in the JSON response.

        Reads ``content -> positionResult -> result`` from the decoded body
        and copies each entry's ``city`` and ``salary`` into a new item.
        A ``KeyError`` propagates if the payload lacks any of those keys.
        """
        data = json.loads(response.body)
        result = data['content']['positionResult']['result']

        for each in result:
            # Build a fresh item per entry: reusing one instance would make
            # every yielded item alias the same object, so later assignments
            # would overwrite values of items already handed downstream.
            item = LagouspiderItem()
            item['city'] = each['city']
            item['money'] = each['salary']
            yield item

需在 settings.py 中設定請求 headers(DEFAULT_REQUEST_HEADERS):

# Default headers attached to every request; this belongs in the project's
# settings module.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Referer': 'https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput=1',
    # Adjacent string literals are concatenated at compile time. A backslash
    # continuation inside the quotes would instead embed the next line's
    # leading indentation in the header value.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    ),
}