用python爬取拉勾網招聘資訊並以CSV檔案儲存
阿新 • 發佈:2018-11-19
爬取拉勾網招聘資訊
1、在網頁原始碼中搜索資訊,並沒有搜到,判斷網頁資訊是使用Ajax來實現的
2、檢視網頁中所需的資料資訊,返回的是JSON資料;
3、條件為北京+資料分析師的公司一共40087家,而實際拉勾網展示的資料只有
15條/頁 * 30頁 = 450條,所以需要判斷想要展示的資料是否能在30頁展示完成,超過30頁則獲取30頁資料
4、獲取請求頭與Form Data資料
5、將資料以CSV檔案儲存(首先建立檔案,可以最後將表頭資訊寫入)
# Create lagou.csv up front and write the header row once.
# newline='' is required by the csv module so it does not insert
# blank lines between rows on Windows.
with open('lagou.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # Column order for the output CSV; these names match the keys of
    # each job record in Lagou's JSON response.
    fieldnames = ['businessZones', 'companyFullName', 'companyLabelList',
                  'companyShortName', 'companySize', 'district', 'education',
                  'financeStage', 'firstType', 'industryField',
                  'industryLables', 'linestaion', 'positionAdvantage',
                  'positionName', 'publisherId', 'salary', 'secondType',
                  'stationname', 'workYear']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
6、原始碼展示
import csv
import json
import math
import time

import requests

# Request headers captured from a logged-in browser session.
# NOTE(review): the Cookie and the X-Anit-Forge-* values are session-bound
# and expire quickly -- refresh them from the browser dev tools before running.
headers = {
    'Cookie': 'LGUID=20160325221916-8e713da1-f294-11e5-baa9-5254005c3644; __guid=237742470.2209186392686119200.1542463319285.1892; WEBTJ-ID=20181117220200-16721fa777529f-063fc0ee1ebcef-5768397b-1049088-16721fa77761fb; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542463323; _ga=GA1.2.1018218803.1542463324; _gid=GA1.2.545935771.1542463324; user_trace_token=20181117220345-9917d84f-ea71-11e8-892e-5254005c3644; LGSID=20181117220345-9917db98-ea71-11e8-892e-5254005c3644; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D1%26rsv_idx%3D1%26tn%3Dbaidu%26wd%3Dlagouwang%26oq%3D%252526lt%25253BSDN%2525E5%25258F%252591%2525E5%2525B8%252583%2525E7%25259A%252584%2525E5%25258D%25259A%2525E5%2525AE%2525A2%2525E5%25258F%2525AF%2525E4%2525BB%2525A5%2525E4%2525BF%2525AE%2525E6%252594%2525B9%26rsv_pq%3Ded43d71700033d83%26rsv_t%3D43b1GYsCSHSQp1N%252FIp1eR1J3VXskMjt44RcbJkSNM8%252BbE%252Fc4aKUjcI%252FhflA%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_sug3%3D10%26rsv_sug1%3D3%26rsv_sug7%3D100%26rsv_sug2%3D0%26inputT%3D25206156%26rsv_sug4%3D25206157; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; JSESSIONID=ABAAABAAADEAAFI6E3B7886F4C8194B687AAD66C7925F67; index_location_city=%E5%85%A8%E5%9B%BD; SEARCH_ID=96d9378ed73e4c278dc3c4b140ecebaf; LGRID=20181117220435-b693f2da-ea71-11e8-a49f-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542463375; monitor_count=7',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    # NOTE(review): the original Referer line contained a duplicated fragment
    # that terminated the string early (a syntax error) and was missing the
    # '?' before the query string; both are fixed here.
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88?labelWords=sug&fromSearch=true&suginput=shujufenxishi',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest'
}

# Column order for the output CSV; these names match the keys of each job
# record in Lagou's JSON response, so rows can be built straight from it.
fieldnames = ['businessZones', 'companyFullName', 'companyLabelList',
              'companyShortName', 'companySize', 'district', 'education',
              'financeStage', 'firstType', 'industryField', 'industryLables',
              'linestaion', 'positionAdvantage', 'positionName', 'publisherId',
              'salary', 'secondType', 'stationname', 'workYear']

# Create the CSV file and write the header row once, up front.
# newline='' keeps the csv module from emitting blank rows on Windows
# (the original omitted it here, inconsistently with the write-up above).
with open('lagou.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()


def get_page(url, params):
    """Fetch the first result page to learn the total hit count, then
    scrape the listings.

    Lagou shows 15 results per page and at most 30 browsable pages, so
    the page count is capped at 30 regardless of the reported total.
    """
    html = requests.post(url, data=params, headers=headers)
    json_data = json.loads(html.text)
    # Total number of matching positions reported by the API.
    total_count = json_data['content']['positionResult']['totalCount']
    page_number = min(math.ceil(total_count / 15), 30)
    get_info(url, page_number)


def get_info(url, page):
    """Scrape pages 1..page of job listings and append each record to the CSV."""
    for pn in range(1, page + 1):
        params = {
            'first': 'true' if pn == 1 else 'false',  # 'true' only on page 1
            'pn': str(pn),        # page number, sent as a string
            'kd': '資料分析師'      # search keyword (the target job title)
        }
        try:
            html = requests.post(url, data=params, headers=headers)
            json_data = json.loads(html.text)
            results = json_data['content']['positionResult']['result']
        except (requests.RequestException, ValueError, KeyError) as err:
            # Best-effort scrape: report the failed page and move on
            # instead of silently swallowing the error or crashing.
            # (Lagou presumably returns non-JSON when rate-limited -- verify.)
            print('page %s failed: %s' % (pn, err))
            continue
        for result in results:
            # Every CSV column name matches a key in the JSON record,
            # so the row dict can be built with a comprehension.
            infos = {field: result[field] for field in fieldnames}
            print('-------------')
            print(infos)
            write_to_file(infos)
        time.sleep(2)  # throttle requests to avoid anti-scraping measures


def write_to_file(content):
    """Append one record (a dict keyed by *fieldnames*) to lagou.csv."""
    # Same newline/encoding settings as the file was created with; the
    # original appended without encoding='utf-8', which can raise
    # UnicodeEncodeError on non-UTF-8 default locales.
    with open('lagou.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow(content)


if __name__ == '__main__':
    # City is fixed to Beijing (%E5%8C%97%E4%BA%AC) via the URL query string.
    url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false'
    params = {'first': 'true', 'pn': '1', 'kd': '資料分析師'}
    get_page(url, params)