
A simple Python scraper for Lagou

For a personal project, I scraped data on data-mining-related positions from Lagou.

First, open the Lagou home page and search for 資料探勘 (data mining) to get the list of matching positions. Press F12, switch to the Network panel, and inspect the HTML: the position list is not in the HTML itself.


So it must be loaded asynchronously via XHR. Switch to the XHR tab and you will find four requests; clicking through them, one of them contains the information we need:


Next, open a specific position's detail page and look at its URL: it contains a numeric ID.

Jump back to the XHR response and you can see a matching positionId field, so we need to extract the positionId values in order to fetch each position's detail page.
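Judging from the keys the parsing code below actually reads, the XHR response is shaped roughly as follows; only the content/positionResult/result nesting and the positionId key are confirmed by the code, and the concrete id value here is a made-up placeholder:

# Rough shape of the XHR JSON; only the nesting and the positionId key
# are taken from the parser below, the id value is a placeholder.
xhr_json = {
    'content': {
        'positionResult': {
            'result': [
                {'positionId': 1234567},   # one dict per position on this page
                # ... typically 15 entries per page
            ]
        }
    }
}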

Save the request headers of the position-detail page and of the XHR request into text files, and use them as headers when making the requests. The full code is below:
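run() below parses each header file as alternating lines: header name on one line, header value on the next. A minimal xhr_head.txt could therefore look like the following; the header names are typical ones copied from the DevTools request, and the values are placeholders you must replace with the real ones from your own session:

User-Agent
Mozilla/5.0 (Windows NT 10.0; Win64; x64)
Referer
https://www.lagou.com/
Cookie
<paste the cookies from your own DevTools session here>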

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 25 20:10:19 2018

@author: cy
"""

'''Launch and coordinate the scraper'''
import requests
import json
from bs4 import BeautifulSoup as bs
import re
import time

XHR_HEAD_PATH = r'C:\Users\cy\Desktop\lagou\xhr_head.txt'   # headers for the XHR request
XHR_URL = r'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
POS_HEAD_PATH = r'C:\Users\cy\Desktop\lagou\pos_head.txt'   # headers for the position list and detail pages
PAGE_NUMBER = 29   # number of result-list pages returned by the search
SAVE_PATH = r'C:\Users\cy\Desktop\lagou\save.txt'   # where the scraped data is saved

# fetch the XHR response for one result page
def get_xhr(head,num):
    data={'first':'true','kd':'資料探勘','pn':num}
    resp = requests.post(XHR_URL,headers = head, data = data)   # resp, not re: avoid shadowing the re module
    if resp.status_code == 200:
        re_text = resp.text
    else:
        re_text = None
        print('page %d: request failed'%num)
    return re_text
        

# parse the XHR JSON and collect the positionIds
def get_posID(xhr_text):
    if xhr_text is None:
        return None
    xhr_json = json.loads(xhr_text)
    result = xhr_json['content']['positionResult']['result']
    id_list = []
    for item in result:   # iterate over what is actually there; the last page may hold fewer than 15 entries
        id_list.append(item['positionId'])
    return id_list

# fetch the HTML of one position's detail page
def get_posInfo(id,head):
    if id is None:
        print('id is empty')
        return None

    url = r'https://www.lagou.com/jobs/'+str(id)+'.html'
    print(url)
    resp = requests.get(url,headers = head)   # resp, not re: avoid shadowing the re module
    if resp.status_code != 200:
        return None
    pos_html = resp.text
    return pos_html
            
            

# parse a position's detail page
def analysis_pos(pos_html):
    if pos_html is None:
        print('position page is empty')
        return None
    soup = bs(pos_html, 'html.parser')   # name an explicit parser so bs4 does not warn
    job_name = soup.find_all('span',class_='name')[0].text

    dd = soup.find_all('dd',class_='job_request')   # salary, city, experience etc. live in <span> tags here
    pattern = re.compile(r'>(.*?)</span>')
    result = pattern.findall(str(dd))
    for i in range(len(result)):
        result[i]  = result[i].replace('/','').strip()

    job_info = soup.find_all('dd',class_='job_bt')   # the job-description block
    job_infos = str(job_info)
    return job_name, result, job_infos
    


def run():
    with open(XHR_HEAD_PATH,'r') as file:  # assemble the headers for the XHR request
        xhr_text = file.read()
        xhr_sp = xhr_text.split('\n')
        xhr_head = {}
        n = len(xhr_sp)
        for i in range(n//2):   # alternating lines: header name, then header value
            xhr_head[xhr_sp[i*2].strip()] = xhr_sp[i*2+1].strip()
        print('xhr_head loaded')

    with open(POS_HEAD_PATH,'r') as file:  # assemble the headers for the position pages
        pos_text = file.read()
        pos_sp = pos_text.split('\n')
        pos_head = {}
        n = len(pos_sp)
        for i in range(n//2):
            pos_head[pos_sp[i*2].strip()] = pos_sp[i*2+1].strip()
        print('pos_head loaded')

    with open(SAVE_PATH,'w+',encoding='utf-8') as file:
        for num in range(PAGE_NUMBER):
            xhr_json = get_xhr(xhr_head,num+1)   # fetch the JSON for page num+1
            posIDs = get_posID(xhr_json)         # extract the positionIds on this page
            time.sleep(10)                       # throttle: Lagou blocks rapid-fire requests
            if posIDs is None:
                continue
            for i in posIDs:
                posInfoHtml = get_posInfo(i,pos_head)  # fetch the detail page for this positionId
                print('fetched position %d'%i)
                parsed = analysis_pos(posInfoHtml)     # parse the HTML we just fetched
                time.sleep(10)
                if parsed is None:   # skip positions whose page failed to load or parse
                    continue
                job_name, result, job_info = parsed
                line = job_name+'0000'+str(result)+'0000'+job_info   # '0000' separates the three fields
                file.write(line+'\n')
                print('position %d saved'%i)

if __name__ == '__main__':
    run()
    print('finished')


This scraper is crude, very crude; I will clean it up when I get the chance.
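Each line of save.txt is the job name, the job_request fields, and the raw description joined by the literal string '0000', so reading the results back is just a split on that delimiter. A minimal sketch, assuming the same SAVE_PATH as above:

# Minimal sketch: read the saved records back by splitting on the
# '0000' delimiter that run() writes between the three fields.
with open(r'C:\Users\cy\Desktop\lagou\save.txt', encoding='utf-8') as f:
    for line in f:
        parts = line.rstrip('\n').split('0000')
        if len(parts) == 3:   # skip lines where the delimiter also occurred in the data
            job_name, request_fields, description = parts
            print(job_name)

Note that '0000' can in principle occur inside the data itself (for example in a large number), so a rarer delimiter, or a structured format such as JSON lines, would be a safer choice.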