1. 程式人生 > python爬取拉鉤網資料

python爬取拉鉤網資料

import requests
import re#引用正則匹配
from bs4 import BeautifulSoup
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}  # Spoof a desktop Chrome browser so the site serves normal HTML instead of blocking the script
def local():
    """Fetch the Lagou homepage and extract category links.

    Returns:
        list[tuple[str, str]]: ``(url, title)`` pairs, one per anchor tag
        matching the site's category-link tracking attributes.
    """
    url = "https://www.lagou.com/"
    # timeout prevents the request from hanging forever on a dead connection
    response = requests.get(url, headers=headers, timeout=10)
    # Match anchors carrying Lagou's tracking attributes; groups capture
    # the href and the link text.
    rep = r' <a href="(.*?)" data-lg-tj-id="4A00" data-lg-tj-no=".*?" data-lg-tj-cid="idnull">(.*?)</a>'
    return re.findall(rep, response.text)


def postion(url):
    """Scrape one job-listing page and collect details for every posting.

    Args:
        url: A listing-page URL, as returned by ``local()``.

    Returns:
        list[str]: For each posting on the page, seven fields appended in
        order: workplace, company name, industry, company pitch, salary,
        requirements, and the posting's detail-page URL (flat list).
    """
    response = requests.get(url, headers=headers, timeout=10)
    details = []
    soup = BeautifulSoup(response.text, 'html.parser')
    for news in soup.select('.default_list'):  # one node per job posting
        details.append(news.find_all(class_='add')[0].text)        # workplace
        details.append(news.select('a')[1].text)                   # company name
        # collapse whitespace inside the industry string
        details.append(news.find_all(class_='industry')[0].text.replace(' ', ''))
        details.append(news.find_all(class_='li_b_r')[0].text)     # company pitch
        details.append(news.find_all(class_='money')[0].text)      # salary
        # text after the salary figure holds the experience/education line
        details.append(news.find_all(class_='li_b_l')[0].text.split('k')[-1])
        details.append(news.find_all(class_='position_link')[0]['href'])  # detail URL
    return details


if __name__ == '__main__':
    # Walk every category found on the homepage and print each posting field.
    for url, title in local():
        for item in postion(url):
            print(item)

還有些不足之處,以後會努力改進,僅供大家參考!