1. 程式人生 > >爬取拉勾熱門城市“資料分析”崗位,並進行視覺化分析

爬取拉勾熱門城市“資料分析”崗位,並進行視覺化分析

首先,寫一個爬取崗位的爬蟲,如下:

# -*- coding:utf-8 -*-
from json import JSONDecodeError
import requests
import time
import pandas as pd


# Module-level accumulators for the scraped job postings. get_json() appends
# one value per posting to every list, so index i across all of the lists
# describes the same posting; main() then gathers the lists into info_dict
# and exports them as a table.

# Company details
companyFullName, companySize, companyId = [], [], []

# Position identity and classification
positionId, positionName, secondType, positionLables = [], [], [], []

# Industry classification
industryField, industryLables = [], []

# Offer and requirements (salary, benefits, experience, education)
salary, positionAdvantage, workYear, education = [], [], [], []

# Location (city, nearest station, coordinates)
job_city, stationname, longitude, latitude = [], [], [], []

# Publish time of the posting
createTime = []

# Column name -> list of values; filled in by main() before the CSV export.
info_dict = {}


def get_json(url, datas):
    """Fetch one result page and append its postings to the module-level lists.

    POSTs ``datas`` to ``url`` up to three times. A response whose body is not
    valid JSON, or whose JSON has no ``'content'`` key, counts as a failed
    attempt: the function waits 60 seconds and tries again.

    Args:
        url: Endpoint the form is POSTed to.
        datas: Form fields sent as the request body.

    Returns:
        True if the page contained postings and they were recorded; False if
        the page was empty or every attempt failed.

    Raises:
        KeyError: If the JSON lacks the expected nested keys or a posting
            lacks one of the recorded fields.
    """
    my_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
    }
    # NOTE(review): `cookies=` takes cookie-name -> value pairs, so this sends
    # a single cookie literally named "Cookie". If the placeholder is meant to
    # hold a raw Cookie header copied from the browser, it probably belongs in
    # my_headers instead -- confirm before relying on it.
    cookies = {
        'Cookie': '你的cookie'
    }
    # (destination list, JSON key) pairs, in the order the fields are recorded.
    field_targets = (
        (job_city, 'city'),
        (companyId, 'companyId'),
        (companyFullName, 'companyFullName'),
        (companySize, 'companySize'),
        (positionId, 'positionId'),
        (positionName, 'positionName'),
        (secondType, 'secondType'),
        (positionLables, 'positionLables'),
        (industryField, 'industryField'),
        (industryLables, 'industryLables'),
        (salary, 'salary'),
        (positionAdvantage, 'positionAdvantage'),
        (workYear, 'workYear'),
        (stationname, 'stationname'),
        (education, 'education'),
        (createTime, 'createTime'),
        (longitude, 'longitude'),
        (latitude, 'latitude'),
    )

    for _ in range(3):
        content = requests.post(url=url, cookies=cookies, headers=my_headers, data=datas)
        try:
            result = content.json()
        except ValueError:
            # ValueError is the common base of json.JSONDecodeError and the
            # simplejson variant that requests may raise. Without a parsed
            # body there is nothing to inspect, so back off and retry instead
            # of falling through with `result` unbound or stale.
            print('=====================解析失敗==============================\n', content)
            time.sleep(60)
            continue

        if 'content' not in result:
            print("=====================沒有資料==============================")
            time.sleep(60)
            continue

        info = result['content']['positionResult']['result']
        print(info)
        if not info:
            # An empty page means there is nothing further to page through.
            return False

        for job in info:
            for target, key in field_targets:
                target.append(job[key])
        return True

    # All three attempts failed; report "stop" explicitly rather than
    # returning None implicitly.
    return False


def main():
    """Crawl every city in the module-level ``citys`` list and write LGTotal.csv.

    For each city, result pages 1 through 29 are requested via get_json()
    until it reports that there is nothing more to fetch. The lists filled by
    get_json() are then gathered into ``info_dict`` and written to
    ``LGTotal.csv`` with pandas.

    Sleeps 20 seconds after each page and 10 seconds after each city.
    """
    url = 'https://www.lagou.com/jobs/positionAjax.json?&needAddtionalResult=false'
    for city in citys:
        for page in range(1, 30):
            # NOTE(review): the Referer sent by get_json() URL-decodes to
            # "数据分析", which differs from this keyword -- confirm which
            # spelling the site expects.
            datas = {
                'first': True,
                'pn': page,
                'kd': '資料分析',
                'city': city
            }
            if not get_json(url, datas):
                break
            time.sleep(20)
        time.sleep(10)

    info_dict.update({
        'city': job_city,
        'companyId': companyId,
        'companyFullName': companyFullName,
        'companySize': companySize,
        'positionId': positionId,
        'positionName': positionName,
        'secondType': secondType,
        'positionLables': positionLables,
        'industryField': industryField,
        'industryLables': industryLables,
        'salary': salary,
        'positionAdvantage': positionAdvantage,
        'workYear': workYear,
        'stationname': stationname,
        'education': education,
        'longitude': longitude,
        'latitude': latitude,
        # get_json() collects createTime too; export it as the last column so
        # the positions of the existing columns do not shift.
        'createTime': createTime,
    })

    frame = pd.DataFrame(info_dict)
    frame.to_csv("LGTotal.csv")


if __name__ == '__main__':
    # Cities to crawl, each listed exactly once: a repeated entry would be
    # crawled again and its postings would appear twice in the CSV.
    citys = ['北京', '上海', '廣州', '深圳', '杭州', '廈門', '成都', '南京', '武漢', '西安', '長沙', '天津', '蘇州']
    main()

開啟儲存的csv檔案,部分資料如下:


通過清洗空資料、拆分薪資上下限等處理後,進一步進行分析並視覺化,視覺化工具為 Power BI。視覺化結果如下: