1. 程式人生 > >爬蟲之拉勾網職位獲取

爬蟲之拉勾網職位獲取

重點在於演示 urllib.request.Request() 請求中各項引數(譬如 url、data、headers 等)的書寫格式。

Demo:
 1 import urllib.request
 2 import urllib.parse
 3 import json, jsonpath, csv
 4 
 5 url = "https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false"
 6 headers = {
 7     "Accept": "application/json, text/javascript, */*; q=0.單執行緒",
 8     "Accept-Encoding
": "gzip, deflate, br", 9 "Accept-Language": "zh-CN,zh;q=0.9", 10 "Connection": "keep-alive", 11 "Content-Length": "38", 12 "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", 13 "Cookie": "_ga=GA1.2.1963509933.1531996888; user_trace_token=20180719184127-4a8c7914-8b40-11e8-9eb6-525400f775ce; LGUID=20180719184127-4a8c7df2-8b40-11e8-9eb6-525400f775ce; JSESSIONID=ABAAABAAAIAACBI0F0B14254DA54E3CCF3B1F22FE32B179; _gid=GA1.2.1918046323.1536408617; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536408620; X_HTTP_TOKEN=339034308973d0bd323cc0b9b6b3203a; LG_LOGIN_USER_ID=24096d6ba723e146bd326de981ab924b23c1f21775136c3a8be953e855211e61; _putrc=95519B7FB60FCF58123F89F2B170EADC; login=true; unick=%E9%A9%AC%E7%BB%A7%E4%B8%9A; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=55; gate_login_token=3b4fa15daef090780ae377bbcd66dc83af9af0cc6a7f1dd697770790f3b9f9ef; index_location_city=%E4%B8%8A%E6%B5%B7; TG-TRACK-CODE=search_code; _gat=1; LGSID=20180908221639-cd6d2a72-b371-11e8-b62b-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_%25E7%2588%25AC%25E8%2599%25AB%3Fcity%3D%25E4%25B8%258A%25E6%25B5%25B7%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; SEARCH_ID=e559a417b4464fd9bc0b439a67ef0a5a; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536416580; LGRID=20180908222259-afed9b74-b372-11e8-b62b-5254005c3644
", 14 "Host": "www.lagou.com", 15 "Origin": "https://www.lagou.com", 16 "Referer": "https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=", 17 "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36
", 18 "X-Anit-Forge-Code": "0", 19 "X-Anit-Forge-Token": "None", 20 "X-Requested-With": "XMLHttpRequest"} 21 # params = {"city": "上海", "needAddtionalResult": "false"} 22 list_position = [] 23 for pn in range(1, 5): 24 data = { 25 "first": "false", 26 "pn": pn, 27 "kd": "爬蟲" 28 } 29 # params = urllib.parse.urlencode(params) 30 # url = url + params 31 data = urllib.parse.urlencode(data).encode('utf-8') 32 req = urllib.request.Request(url, data=data, headers=headers) 33 print('正在請求第%d頁' % pn) 34 str_data = urllib.request.urlopen(req).read() 35 with open('03.html', 'wb') as f: 36 f.write(str_data) 37 # 轉換成python物件 38 data_list = json.loads(str_data) 39 job_list = jsonpath.jsonpath(data_list, "$..result")[0] 40 41 for item in job_list: 42 position_dict = {} 43 position_dict['positionName'] = item.get('positionName') 44 position_dict['createTime'] = item.get('createTime') 45 position_dict['url'] = 'https://www.lagou.com/jobs/' + str(item.get('positionId')) + '.html' 46 47 position_dict['salary'] = item.get('salary') 48 position_dict['workYear'] = item.get('workYear') 49 position_dict['companySize'] = item.get('companySize') 50 list_position.append(position_dict) 51 52 # 儲存到json檔案 53 json.dump(list_position, open('03.json', 'w')) 54 55 # 儲存到csv檔案 'gbk' codec can't encode character '\u200b' in position 0: illegal multibyte seq 56 csv_writer = csv.writer(open('04.csv', 'w', encoding='utf-8')) 57 sheets = list_position[0].keys() # 表頭 58 row_content = [] 59 for item in list_position: 60 row_content.append(item.values()) # 內容 61 try: 62 csv_writer.writerow(sheets) 63 csv_writer.writerows(row_content) 64 except Exception as e: 65 print(e)