python爬取拉勾網資料儲存到mysql資料庫
阿新 • • 發佈:2019-01-02
環境:python3
相關包:requests , json , pymysql
思路:1.通過chrome F12找到拉鉤請求介面,分析request的各項引數
2.模擬瀏覽器請求拉鉤介面
3.預設返回的json不是標準格式 , 對返回的json資料進行處理轉換為標準格式
4.利用pymysql模組進行db操作
#coding:utf-8
import random
import urllib
import json
import pymysql
import requests
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" ,
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6" ,
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5" ,
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
]
#隨機模擬一個瀏覽器的UA
def get_random_userAgent():
userAgent = random.choice(USER_AGENTS)
return userAgent
#得到請求拉鉤介面返回的json資料
def get_job_all_json(pn=1,kd='python',city='上海'):
headers = {
'User-Agent': get_random_userAgent(),
'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
'Cookie': 'JSESSIONID=ABAAABAAADEAAFID589F81DDA4B135EA73D59382D94193B; _gat=1; user_trace_token=20170918201032-5e70e65e-9c6a-11e7-9196-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGUID=20170918201032-5e70e916-9c6a-11e7-9196-5254005c3644; index_location_city=%E5%8C%97%E4%BA%AC; TG-TRACK-CODE=index_search; _gid=GA1.2.1042499452.1505736518; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1505736518; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1505736559; _ga=GA1.2.2038003268.1505736518; LGSID=20170918201032-5e70e7a7-9c6a-11e7-9196-5254005c3644; LGRID=20170918201112-76a14753-9c6a-11e7-9196-5254005c3644; SEARCH_ID=23d97ca16048467a93241983f07b9f32'
}
data = {
'first': 'true',
'pn': pn, #page number
'kd': kd
}
city = urllib.parse.quote(city)
res = requests.post(
'https://www.lagou.com/jobs/positionAjax.json?'
'city={0}&'
'needAddtionalResult=false&'
'isSchoolJob=0'.format(city,0), data=data, headers=headers)
print('status_code:',res.status_code)
print('text:',res.text)
return res.text
#得到資料庫連線
def get_db_conn():
conn = pymysql.connect(host='localhost', user='root', passwd='admin', db='lagou', port=3306, charset='utf8')
return conn
#存入資料庫
def insert_into_db(conn,jobs):
cur = conn.cursor()
#cur.execute('truncate spider') #清空現有資料
for job in jobs:
positionName = job['positionName']
salary = job['salary']
education = job['education']
companyFullName = job['companyFullName']
workYear = job['workYear']
companyLabelList = str(job['companyLabelList']).replace('\'','')
companySize = job['companySize']
#print(positionName, salary, education, companyFullName, workYear, companyLabelList, companySize)
sql = 'insert into spider(positionName , salary , education , companyFullName , workYear , companyLabelList , companySize) ' \
'values(\''+positionName+'\',\''+salary+'\',\''+education+'\',\''+companyFullName+'\',\''+workYear+'\',\''+companyLabelList+'\',\''+companySize+'\')'
print('sql:',sql)
cur.execute(sql)
conn.commit()
cur.close()
conn.close()
#對返回的不標準json進行處理
def get_job_result_json(jsonString):
job_result = jsonString['content']['positionResult']['result'] # List
j1 = str(job_result).replace("'", "\"")
j2 = j1.replace("None", "\"None\"")
return j2
if __name__ =='__main__':
job = 'hadoop'
city = '北京'
for i in range(1,11):
pn = i
jsonString = json.loads(get_job_all_json(pn,job,city))
job_json = get_job_result_json(jsonString)
jobs = json.loads(job_json)
conn = get_db_conn()
insert_into_db(conn,jobs)
print("done ...")
資料庫中的資料如圖:
資料庫表結構:
/*
Navicat MySQL Data Transfer
Source Server : mysql
Source Server Version : 50022
Source Host : localhost:3306
Source Database : lagou
Target Server Type : MYSQL
Target Server Version : 50022
File Encoding : 65001
Date: 2017-10-05 10:34:57
*/
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for spider
-- ----------------------------
DROP TABLE IF EXISTS `spider`;
CREATE TABLE `spider` (
`id` int(11) NOT NULL auto_increment,
`positionName` varchar(255) collate utf8_bin default NULL,
`salary` varchar(255) collate utf8_bin default NULL,
`education` varchar(255) collate utf8_bin default NULL,
`companyFullName` varchar(255) collate utf8_bin default NULL,
`workYear` varchar(255) collate utf8_bin default NULL,
`companyLabelList` varchar(255) collate utf8_bin default NULL,
`companySize` varchar(255) collate utf8_bin default NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;