
Scraping Lagou job listings with Python and saving them to a CSV file

Scraping Lagou job listings

1. Searching the page's HTML source for the job information turns up nothing, which suggests the listings are loaded via Ajax.
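
You can confirm this by fetching the static HTML with requests and checking that a listing visible in the browser is absent from it. A minimal sketch (the company name is a placeholder you would replace with one you actually see on the rendered page):

import requests

# Fetch the static list page and search its HTML for a job entry visible in
# the browser. '示例公司' is a placeholder; substitute any company name shown
# on the rendered page.
url = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88'
html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
print('示例公司' in html)  # prints False: the listings are filled in by Ajax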

2. Inspecting the network request that carries the needed data shows that it comes back as JSON.
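
Once you have located the positionAjax.json request in the Network panel, you can replay it and walk the JSON to see where the useful fields live. A sketch, assuming html is the requests.post(...) response from the full script below:

import json

# Walk the structure of the JSON returned by positionAjax.json
# ('html' is assumed to be the POST response from the script further down).
json_data = json.loads(html.text)
print(json_data.keys())                               # top level, e.g. 'content'
print(json_data['content']['positionResult'].keys())  # 'totalCount', 'result', ...
print(json.dumps(json_data['content']['positionResult']['result'][0],
                 ensure_ascii=False, indent=2))       # first posting, pretty-printed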

3. With the filters Beijing + data analyst there are 40,087 matching companies in total, but Lagou actually displays only 15 per page × 30 pages = 450 results. So we need to check whether the desired results fit within 30 pages, and fetch just 30 pages when they need more (the arithmetic is worked through below).
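
The arithmetic behind that cap is easy to check in isolation; for the 40,087 results above:

import math

total_count = 40087                         # total matches reported by the API
pages_needed = math.ceil(total_count / 15)  # 2673 pages at 15 results per page
page_number = min(pages_needed, 30)         # Lagou serves at most 30 pages
print(pages_needed, page_number)            # 2673 30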

4. Capture the request headers and Form Data from the browser's developer tools. A missing or stale Cookie/Referer can make the endpoint reject the request instead of returning results, so a defensive parse helps (see the sketch below).
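
When the request is rejected, indexing straight into 'content' raises a KeyError. A defensive sketch (not part of the original script, and the error shape is an assumption) that guards the parse:

import json

# Hedged helper: if 'content' is absent, the request was most likely rejected
# by Lagou's anti-crawler check, so surface the payload instead of crashing.
def parse_result(text):
    data = json.loads(text)
    if 'content' not in data:
        print('Request rejected, response was:', data)
        return None
    return data['content']['positionResult']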

5. Save the data to a CSV file (create the file first and write the header row into it):

with open('lagou.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['businessZones', 'companyFullName', 'companyLabelList', 'companyShortName', 'companySize', 'district',
                  'education', 'financeStage', 'firstType', 'industryField', 'industryLables', 'linestaion',
                  'positionAdvantage', 'positionName', 'publisherId', 'salary', 'secondType', 'stationname', 'workYear']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
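
To sanity-check the output later, the rows can be read back with csv.DictReader, which keys each row by that header. A small sketch:

import csv

# Read the saved CSV back as dictionaries keyed by the header row.
with open('lagou.csv', encoding='utf-8', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        print(row['companyShortName'], row['salary'])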

6. Full source code

import json
import requests
import math
import time
import csv

headers = {
            'Cookie':'LGUID=20160325221916-8e713da1-f294-11e5-baa9-5254005c3644; __guid=237742470.2209186392686119200.1542463319285.1892; WEBTJ-ID=20181117220200-16721fa777529f-063fc0ee1ebcef-5768397b-1049088-16721fa77761fb; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542463323; _ga=GA1.2.1018218803.1542463324; _gid=GA1.2.545935771.1542463324; user_trace_token=20181117220345-9917d84f-ea71-11e8-892e-5254005c3644; LGSID=20181117220345-9917db98-ea71-11e8-892e-5254005c3644; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D1%26rsv_idx%3D1%26tn%3Dbaidu%26wd%3Dlagouwang%26oq%3D%252526lt%25253BSDN%2525E5%25258F%252591%2525E5%2525B8%252583%2525E7%25259A%252584%2525E5%25258D%25259A%2525E5%2525AE%2525A2%2525E5%25258F%2525AF%2525E4%2525BB%2525A5%2525E4%2525BF%2525AE%2525E6%252594%2525B9%26rsv_pq%3Ded43d71700033d83%26rsv_t%3D43b1GYsCSHSQp1N%252FIp1eR1J3VXskMjt44RcbJkSNM8%252BbE%252Fc4aKUjcI%252FhflA%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_sug3%3D10%26rsv_sug1%3D3%26rsv_sug7%3D100%26rsv_sug2%3D0%26inputT%3D25206156%26rsv_sug4%3D25206157; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; JSESSIONID=ABAAABAAADEAAFI6E3B7886F4C8194B687AAD66C7925F67; index_location_city=%E5%85%A8%E5%9B%BD; SEARCH_ID=96d9378ed73e4c278dc3c4b140ecebaf; LGRID=20181117220435-b693f2da-ea71-11e8-a49f-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542463375; monitor_count=7',
            'Host':'www.lagou.com',
            'Origin':'https://www.lagou.com',
            'Referer':'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88?labelWords=sug&fromSearch=true&suginput=shujufenxishi',
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'X-Anit-Forge-Code':'0',
            'X-Anit-Forge-Token':'None',
            'X-Requested-With':'XMLHttpRequest'
}
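# Note: the Cookie value above is tied to a browser session and expires;
# copy a fresh Cookie from your own browser's developer tools before running.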


# Create the CSV file and write the header row into it
with open('lagou.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['businessZones', 'companyFullName', 'companyLabelList', 'companyShortName', 'companySize', 'district',
                  'education', 'financeStage', 'firstType', 'industryField', 'industryLables', 'linestaion',
                  'positionAdvantage', 'positionName', 'publisherId', 'salary', 'secondType', 'stationname', 'workYear']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

# Check whether the query's results fit within 30 pages; crawl at most 30 pages
def get_page(url, params):
    html = requests.post(url, data=params, headers=headers)
    json_data = json.loads(html.text)
    total_count = json_data['content']['positionResult']['totalCount'] # total number of matching positions
    page_number = min(math.ceil(total_count / 15), 30) # 15 results per page, capped at Lagou's 30-page limit
    get_info(url, page_number)


def get_info(url, page):
    for pn in range(1, page + 1):
        params = {
            'first': 'true' if pn == 1 else 'false',  # 'true' for the first page, 'false' for all others
            'pn': str(pn), # page number, passed as a string
            'kd': '数据分析师' # the position keyword to search for (data analyst)
        }
        try:
            html = requests.post(url, data=params, headers=headers)
            json_data = json.loads(html.text)
            results = json_data['content']['positionResult']['result'] # the list of job postings in the JSON
            for result in results: # collect each posting's fields into a dict
                infos = {
                    'businessZones': result['businessZones'],
                    'companyFullName': result['companyFullName'],
                    'companyLabelList': result['companyLabelList'],
                    'companyShortName': result['companyShortName'],
                    'companySize': result['companySize'],
                    'district': result['district'],
                    'education': result['education'],
                    'financeStage': result['financeStage'],
                    'firstType': result['firstType'],
                    'industryField': result['industryField'],
                    'industryLables': result['industryLables'],
                    'linestaion': result['linestaion'],
                    'positionAdvantage': result['positionAdvantage'],
                    'positionName': result['positionName'],
                    'publisherId': result['publisherId'],
                    'salary': result['salary'],
                    'secondType': result['secondType'],
                    'stationname': result['stationname'],
                    'workYear': result['workYear']
                }
                print('-------------')
                print(infos)
                write_to_file(infos) # append this posting to the CSV
            time.sleep(2) # pause between pages to avoid hammering the server
        except requests.RequestException:
            pass # skip pages whose request fails


# Append each row of data to the lagou.csv file created above
def write_to_file(content):
    with open('lagou.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow(content) # the with block closes the file automatically

# Build the URL (its query string fixes the city to Beijing) and call get_page
if __name__ == '__main__':
    url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false'
    params = {
        'first': 'true',
        'pn': '1',
        'kd': '数据分析师'
    }
    get_page(url, params)
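
The %E5%8C%97%E4%BA%AC in the URL is simply the URL-encoded form of 北京 (Beijing). To point the crawler at another city, the URL can be built with urllib.parse.quote instead of hand-encoding; a sketch:

from urllib.parse import quote

# Build the Ajax URL for an arbitrary city; quote('北京') gives the
# %E5%8C%97%E4%BA%AC seen in the hard-coded URL above.
city = '上海'  # example: Shanghai
url = ('https://www.lagou.com/jobs/positionAjax.json'
       '?px=default&city={}&needAddtionalResult=false'.format(quote(city)))
print(url)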