Beautiful Soup爬蟲——爬取智聯招聘的資訊並存入資料庫
阿新 • 發佈:2018-12-13
本人目前在校本科萌新…第一次寫有所不足還請見諒
前期準備
智聯招聘網頁 讓我們來搜尋一下python 發現網頁跳轉到這 讓我們看一下原始碼 發現並沒有我們所需要的資料 一開始我不信邪用requests嘗試了一下
# Demo: fetching the NEW Zhaopin search page directly.  The returned HTML
# contains no job data (it is rendered client-side), which is the point
# this snippet demonstrates.
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
    'Host': 'sou.zhaopin.com',
    'Referer': 'https://www.zhaopin.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
url = 'https://sou.zhaopin.com/?pageSize=60&jl=530&kw=python&kt=3'
# Renamed 're' -> 'response': the original shadowed the stdlib `re` module.
# A timeout keeps the demo from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
print(response.text)
# Demo: the ZP_OLD_FLAG cookie switches Zhaopin to the OLD server-rendered
# search page, whose HTML actually contains the job listings.
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
    'Host': 'sou.zhaopin.com',
    'Referer': 'https://www.zhaopin.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'ZP_OLD_FLAG=true'
}
url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&kw=python&sm=0&p=1'
# Renamed 're' -> 'response': the original shadowed the stdlib `re` module.
response = requests.get(url, headers=headers, timeout=10)
print(response.text)
cookie這裡表示是舊版網頁 發現確實有招聘的資料,這裡就不發截圖了。
程式碼
我用了json儲存了一些變數,方便更改 spider.json
{
"host":"localhost",
"user":"root",
"password":"",
"dbname":"vacation",
"port":3306,
"city":"北京",
"keyword":"python",
"page":90,
"Cookie":"ZP_OLD_FLAG=true;"
}
程式碼
from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException
import pymysql
import json

# Load crawler settings (DB credentials, search city/keyword, page count,
# cookie) from spider.json.  `with` closes the file; the original leaked
# the open handle.
with open("spider.json", encoding='utf-8') as config_file:
    setting = json.load(config_file)

host = setting['host']
user = setting['user']
password = setting['password']
dbname = setting['dbname']
port = setting['port']
city = setting['city']
keyword = setting['keyword']
pagenum = setting['page']
Cookie = setting['Cookie']


def get_one_page(city, keyword, page):
    """Fetch one page of the legacy Zhaopin search results.

    Returns the HTML text on HTTP 200, otherwise None (including on any
    request error).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
        'Host': 'sou.zhaopin.com',
        'Referer': 'https://www.zhaopin.com/',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': Cookie  # ZP_OLD_FLAG selects the server-rendered old site
    }
    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl={}&kw={}&sm=0&p={}'.format(city, keyword, page)
    try:
        # timeout added so a stalled connection cannot hang the crawl
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def readonepage(html, db):
    """Parse one search-result page and insert each job posting into the DB.

    A failure on one listing is logged and skipped so the rest of the page
    is still processed.
    """
    cur = db.cursor()
    soup = BeautifulSoup(html, 'lxml')
    for cell in soup.find_all('td'):
        try:
            if cell.get('class') != ['zwmc']:
                continue
            jobname = cell.div.a.get_text()   # 崗位名稱 (job title)
            jobhref = cell.div.a.get('href')
            # Links whose 10th character is 'i' (https://xiaoyuan... campus
            # pages) have a different layout.  The original had a no-op
            # `pass` here; `continue` is the evidently intended skip.
            if jobhref[9] == 'i':
                continue
            record = get_detailed(jobhref)
            record.append(jobname)
            print(jobname)
            # Parameterized query: the original interpolated values with
            # '%' into the SQL string, which breaks on quotes in the data
            # and is an SQL-injection hole.
            sql = ("INSERT INTO companyinfo(company_name,work_experience,"
                   "edu_background,salary,describes,work_city,work_address,"
                   "nature,types,scales,url,benefits,station,station_id) "
                   "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            cur.execute(sql, (*record, counterid()))
            db.commit()
        except Exception as e:
            # Best-effort scraping, but log instead of silently swallowing
            # (the original `pass` hid every DB error).
            print('skip listing:', e)


def counterid(last=[0]):
    """Return a monotonically increasing id, starting at 1.

    NOTE: the mutable default argument is used *intentionally* as
    call-count state shared across invocations.
    """
    last[0] += 1
    return last[0]


def get_detailed(href):
    """Scrape a job-detail page and return the job/company fields as a list.

    Order matters: readonepage appends the job title and inserts the
    values positionally into companyinfo.
    """
    # Defaults so a page missing a section still yields a complete record.
    # (The original raised NameError on missing sections, which the caller
    # silently swallowed, dropping the whole listing.)
    salary = job_city = exp = edu = ''
    comname = scale = nature = company_type = place = website = ''
    descrip = fuli = ''
    res = requests.get(href, timeout=10)
    soup = BeautifulSoup(res.text, 'lxml')
    for ul in soup.find_all('ul'):
        try:
            if ul.get('class') == ['terminal-ul', 'clearfix']:
                parts = ul.get_text().split('\n')
                # 薪水 (salary); join/split strips all embedded whitespace
                salary = "".join(parts[1].split(':')[1].split())
                job_city = parts[2].split(':')[1]   # 城市 (city)
                exp = parts[5].split(':')[1]        # 工作經驗 (experience)
                edu = parts[6].split(':')[1]        # 學歷 (education)
        except Exception as e:
            print(e)
    for div in soup.find_all('div'):
        try:
            div_class = div.get('class')
            if div_class == ['company-box']:
                info = div.get_text().split('\n')
                while '' in info:
                    info.remove('')
                if '檢視公司地圖' in info:
                    info.remove('檢視公司地圖')
                comname = info[0]                       # 公司名稱 (company name)
                scale = info[1].split(':')[1]           # 企業規模 (company size)
                nature = info[2].split(':')[1]          # 民營/國營 (ownership)
                company_type = info[3].split(':')[1]    # 型別 (industry)
                place = info[-1]                        # 具體地址 (address)
                if len(info) == 6:
                    # Some companies list no website.
                    website = ' '
                else:
                    website = info[4].split(':')[1]     # 公司網站 (website)
            if div_class == ['tab-inner-cont'] and div.get('style') is None:
                # 工作需求 (job description), whitespace stripped
                descrip = "".join(div.get_text().split('\n')[1].split())
            if div_class == ['welfare-tab-box']:
                fuli = ''
                for elem in div:
                    fuli = fuli + elem.string + ' '     # 福利 (benefits)
        except Exception as e:
            print(e)
    return [comname, exp, edu, salary, descrip, job_city, place,
            nature, company_type, scale, website, fuli]


def main(city, keyword, pages):
    """Crawl `pages` result pages and store every job posting."""
    db = pymysql.connect(host=host, user=user, password=password,
                         db=dbname, port=port)
    try:
        for page in range(pages):
            html = get_one_page(city, keyword, page)
            # get_one_page returns None on failure; the original passed
            # None straight into BeautifulSoup.
            if html:
                readonepage(html, db)
    finally:
        # Close the connection even if a page blows up (the original
        # leaked it on any uncaught exception).
        db.close()


if __name__ == '__main__':
    main(city, keyword, pagenum)
資料庫結構 資料庫裡儲存的資訊