Scraping Lagou Job Postings with Python and Saving Them to a Database
阿新 · Published 2018-12-27
First, capture the traffic and analyze the data we want. It is clearly loaded dynamically, so go straight to the XHR requests under the Network tab. Once we locate the response carrying the actual data, we need to work out the request URL and the request headers.
We also need the form data that gets submitted with the request.
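Before committing to a full Scrapy project, it can help to replay the captured request once by hand. Below is a minimal sketch using the `requests` library, with the endpoint, headers, and form fields taken from the XHR capture (the same values the spider uses later). Note that Lagou applies anti-crawling checks, so a bare request like this may be rejected without valid cookies; treat it as a quick sanity check only.

```python
# Minimal sketch: replay the captured XHR request once to confirm the
# endpoint and form fields. Lagou may reject requests without valid
# cookies, so this is only a sanity check, not production code.
import requests

url = 'https://www.lagou.com/jobs/positionAjax.json'
headers = {
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer': 'https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput=1',
}
form = {'first': 'false', 'pn': '1', 'kd': 'Python', 'city': '廣州'}

resp = requests.post(url, headers=headers, data=form)
print(resp.json()['content']['positionResult']['totalCount'])
```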
With the analysis done, we can start writing our crawler project.
1. Writing the Item
The item definition is straightforward:
```python
import scrapy


# Lagou job posting item
class LagouItem(scrapy.Item):
    # city
    city = scrapy.Field()
    # company name
    companyFullName = scrapy.Field()
    # company size
    companySize = scrapy.Field()
    # district
    district = scrapy.Field()
    # education requirement
    education = scrapy.Field()
    # nearby subway lines/stations (field name matches Lagou's API)
    linestaion = scrapy.Field()
    # position title
    positionName = scrapy.Field()
    # job type (full-time, internship, ...)
    jobNature = scrapy.Field()
    # salary
    salary = scrapy.Field()
    # required work experience
    workYear = scrapy.Field()
    # posting time
    createTime = scrapy.Field()
```
2. Writing the Pipeline
Since I am storing the data in a database here, remember to create the database and table before writing the pipeline. If you are not sure how, see my earlier article; I will not repeat the steps here.
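For reference, here is a minimal, hypothetical sketch of creating a matching table with pymysql. The column names mirror the INSERT statement in the pipeline below; the connection credentials (`root`, `your_password`, `your_db`) and the column sizes are placeholders for your own setup.

```python
# Hypothetical table setup matching the columns the pipeline inserts.
# Credentials and column sizes are placeholders; adjust to your setup.
import pymysql

conn = pymysql.connect(host='localhost', user='root',
                       passwd='your_password', db='your_db',
                       charset='utf8mb4')
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS Lagou (
        id INT AUTO_INCREMENT PRIMARY KEY,
        city VARCHAR(32),
        companyName VARCHAR(128),
        companySize VARCHAR(32),
        district VARCHAR(64),
        linestaion VARCHAR(128),
        positionName VARCHAR(128),
        jobNature VARCHAR(32),
        education VARCHAR(32),
        salary VARCHAR(32),
        workYear VARCHAR(32),
        showTime VARCHAR(64)
    ) DEFAULT CHARSET=utf8mb4
""")
conn.commit()
conn.close()
```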
```python
import pymysql


class MycrawlPipeline(object):
    # The original snippet only showed process_item; the class wrapper
    # and connection setup are filled in here so it runs. Adjust the
    # class name and credentials to your own project.
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', user='root',
                                    passwd='your_password', db='your_db',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Only handle items coming from the 'lagou' spider
        if spider.name == 'lagou':
            try:
                self.cursor.execute(
                    "INSERT INTO Lagou (city, companyName, companySize, district, "
                    "linestaion, positionName, jobNature, education, salary, "
                    "workYear, showTime) "
                    "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (item['city'], item['companyFullName'], item['companySize'],
                     item['district'], item['linestaion'], item['positionName'],
                     item['jobNature'], item['education'], item['salary'],
                     item['workYear'], item['createTime']))
                self.conn.commit()
            except pymysql.Error:
                # Log the offending row instead of crashing the crawl
                print("Error %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s" % (
                    item['city'], item['companyFullName'], item['companySize'],
                    item['district'], item['linestaion'], item['positionName'],
                    item['jobNature'], item['education'], item['salary'],
                    item['workYear'], item['createTime']))
        return item
```
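Scrapy only calls a pipeline that is registered in the project settings. Assuming the `Mycrawl` project layout implied by the spider's `from Mycrawl.items import LagouItem` import and the class name used above, the registration would look like this; adjust the dotted path to wherever your pipeline class actually lives.

```python
# settings.py: enable the pipeline (the dotted path assumes the Mycrawl
# project layout and the MycrawlPipeline class name used above).
ITEM_PIPELINES = {
    'Mycrawl.pipelines.MycrawlPipeline': 300,
}
```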
3. Writing the Spider
Finally, we write the spider itself.

```python
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy import FormRequest
from Mycrawl.items import LagouItem
import random
import json
import time


class LagouSpider(Spider):
    # Spider name; important, since the pipeline filters on it
    name = 'lagou'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput=1',
    }
    allowed_domains = ['lagou.com']
    url = "https://www.lagou.com/jobs/positionAjax.json?"  # &needAddtionalResult=true&isSchoolJob=0"
    page = 1
    allpage = 0

    def start_requests(self):
        # POST the form data discovered during packet capture
        yield FormRequest(self.url, headers=self.headers,
                          formdata={'first': 'false',
                                    'pn': str(self.page),
                                    'kd': 'Python',
                                    'city': '廣州'},
                          callback=self.parse)

    def parse(self, response):
        item = LagouItem()
        data = json.loads(response.body.decode('utf-8'))
        result = data['content']['positionResult']['result']
        totalCount = data['content']['positionResult']['totalCount']
        resultSize = data['content']['positionResult']['resultSize']
        for each in result:
            item['city'] = each['city']
            item['companyFullName'] = each['companyFullName']
            item['companySize'] = each['companySize']
            item['district'] = each['district']
            item['education'] = each['education']
            item['linestaion'] = each['linestaion']
            item['positionName'] = each['positionName']
            item['jobNature'] = each['jobNature']
            item['salary'] = each['salary']
            item['createTime'] = each['createTime']
            item['workYear'] = each['workYear']
            yield item

        # Crude throttling between page requests; note that sleeping here
        # blocks Scrapy's event loop (see the settings alternative below)
        time.sleep(random.randint(5, 20))
        if int(resultSize):
            # Number of full pages, plus one for the remainder
            self.allpage = int(totalCount) // int(resultSize) + 1
            if self.page < self.allpage:
                self.page += 1
                yield FormRequest(self.url, headers=self.headers,
                                  formdata={'first': 'false',
                                            'pn': str(self.page),
                                            'kd': 'Python',
                                            'city': '廣州'},
                                  callback=self.parse)
```
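One caveat with the spider above: `time.sleep` inside `parse` blocks Scrapy's single-threaded event loop while it waits. A gentler, non-blocking way to pace requests (my suggestion, not part of the original post) is Scrapy's built-in delay settings:

```python
# settings.py: non-blocking alternative to time.sleep() in parse().
DOWNLOAD_DELAY = 10               # base delay between requests, in seconds
RANDOMIZE_DOWNLOAD_DELAY = True   # actual delay varies between 0.5x and 1.5x
```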
Once everything is written, run the spider to crawl the data.
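From the project root, the standard way to launch a spider named `lagou` is Scrapy's `crawl` command:

```bash
scrapy crawl lagou
```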