簡單python爬蟲爬取拉鉤網
阿新 • • 發佈:2019-02-14
因為個人需求,爬取了拉鉤網資料探勘相關職位的資料
首先先進入到拉鉤的首頁,搜尋資料探勘,得到相關職位的列表,按F12,檢視網路檢視html,可以看到職位列表並不在html
所以肯定是通過XHR非同步載入的,再切換到XHR,可以找到4個,點開檢視,可以看到在一個請求中有我們需要的資訊:
再切換到一個具體的職位中檢視,可以看到,有一串編號,
跳回到剛剛的xhr中檢視,可以看到有一個positionID與之對應,所以我們需要通過獲取positionID來獲取具體職位資訊
將職位資訊頁面和XHR的請求頭儲存下來,作為頭,進行網路請求,下面是具體的程式碼:
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 25 20:10:19 2018
@author: cy

Simple crawler for data-mining job postings on lagou.com.

Flow: POST the search XHR endpoint one result page at a time, pull each
``positionId`` out of the JSON payload, fetch the matching job-detail HTML
page, parse out the title / requirements / description, and append one line
per job to SAVE_PATH (fields joined by the ad-hoc '0000' separator).

NOTE: the original author describes this crawler as "very crude, to be
improved when there is a chance" -- headers are loaded from hand-saved
files and requests are throttled with fixed sleeps.
"""
import requests
import json
from bs4 import BeautifulSoup as bs
import re
import time

# Hand-saved request headers (alternating "name\nvalue" lines) for the
# XHR search endpoint and for the job-detail pages respectively.
XHR_HEAD_PATH = r'C:\Users\cy\Desktop\lagou\xhr_head.txt'
XHR_URL = r'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
POS_HEAD_PATH = r'C:\Users\cy\Desktop\lagou\pos_head.txt'
PAGE_NUMBER = 29   # how many search-result pages to crawl
SAVE_PATH = r'C:\Users\cy\Desktop\lagou\save.txt'  # output file


def get_xhr(head, num):
    """POST the search XHR for result page *num*.

    Returns the raw response text, or None (and prints a notice) on a
    non-200 status.
    """
    data = {'first': 'true', 'kd': '資料探勘', 'pn': num}
    # Renamed from ``re`` -- the original shadowed the regex module.
    resp = requests.post(XHR_URL, headers=head, data=data)
    if resp.status_code == 200:
        return resp.text
    print('%d 頁面訪問錯誤' % num)
    return None


def get_posID(xhr_text):
    """Extract every positionId from the XHR JSON text.

    Returns a list of ids, or None when *xhr_text* is None.
    """
    if xhr_text is None:
        return None
    xhr_json = json.loads(xhr_text)
    result = xhr_json['content']['positionResult']['result']
    # Iterate over what is actually present instead of assuming a fixed
    # 15 entries per page (the last page is usually shorter).
    return [item['positionId'] for item in result]


def get_posInfo(id, head):
    """Fetch the job-detail HTML page for *id*; return None on failure."""
    if id is None:
        print('id為空')
        return None
    url = r'https://www.lagou.com/jobs/' + str(id) + '.html'
    print(url)
    resp = requests.get(url, headers=head)
    if resp.status_code != 200:
        return None
    return resp.text


def analysis_pos(pos_html):
    """Parse one job-detail page.

    Returns ``(job_name, requirement_list, description_html)``, or None
    when *pos_html* is None.
    """
    if pos_html is None:
        print('崗位資訊頁面為空')
        return None
    # Explicit parser keeps results consistent and silences the bs4 warning.
    soup = bs(pos_html, 'html.parser')
    job_name = soup.find_all('span', class_='name')[0].text
    dd = soup.find_all('dd', class_='job_request')
    # The requirement spans are scraped with a regex over the raw tag text.
    pattern = re.compile(r'>(.*?)</span>')
    result = [m.replace('/', '').strip() for m in pattern.findall(str(dd))]
    job_infos = str(soup.find_all('dd', class_='job_bt'))
    return job_name, result, job_infos


def _load_head(path):
    """Build a header dict from a file of alternating name/value lines."""
    with open(path, 'r') as file:
        lines = file.read().split('\n')
    head = {}
    for i in range(len(lines) // 2):
        head[lines[i * 2].strip()] = lines[i * 2 + 1].strip()
    return head


def run():
    """Crawl every page, parse every job, and write results to SAVE_PATH."""
    xhr_head = _load_head(XHR_HEAD_PATH)
    print('xhr_head載入成功')
    pos_head = _load_head(POS_HEAD_PATH)
    print('pos_head載入成功')
    with open(SAVE_PATH, 'w+', encoding='utf-8') as file:
        for num in range(PAGE_NUMBER):
            xhr_json = get_xhr(xhr_head, num + 1)
            # BUG FIX: the original used ``posIDs`` without ever assigning
            # it, which raised NameError on the first iteration.
            posIDs = get_posID(xhr_json)
            time.sleep(10)  # throttle to avoid the anti-crawler ban
            if posIDs is None:
                continue  # page failed to download -- skip it
            for i in posIDs:
                posInfoHtml = get_posInfo(i, pos_head)
                print('已訪問%d頁面' % i)
                parsed = analysis_pos(posInfoHtml)
                time.sleep(10)
                if parsed is None:
                    # BUG FIX: the original unpacked None here (TypeError).
                    continue
                job_name, result, job_info = parsed
                # '0000' is the ad-hoc field separator chosen by the author.
                file.write(job_name + '0000' + str(result) + '0000' + job_info + '\n')
                print('頁面%d已儲存' % i)


if __name__ == '__main__':
    run()
    print('執行結束')