
Beautiful Soup Crawler: Scraping Zhaopin (智聯招聘) Job Listings into a Database

I'm currently an undergraduate and new to all this; it's my first write-up, so please forgive any shortcomings.

Preliminaries

Open Zhaopin's search page and search for "python". The browser redirects to a results page, but viewing that page's source reveals none of the data we need: the listings don't appear in the raw HTML. Not quite believing it, I tried fetching the page with requests:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
    'Host': 'sou.zhaopin.com',
    'Referer': 'https://www.zhaopin.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
url = 'https://sou.zhaopin.com/?pageSize=60&jl=530&kw=python&kt=3'
response = requests.get(url, headers=headers)  # renamed from `re`, which shadows the stdlib module
print(response.text)  # no listings in the output: the new site renders them client-side
Sure enough, the data wasn't there. The old version of the site, however, still serves the listings in the HTML itself, so I added a cookie and switched to the legacy search URL:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
    'Host': 'sou.zhaopin.com',
    'Referer': 'https://www.zhaopin.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'ZP_OLD_FLAG=true'
}
url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&kw=python&sm=0&p=1'
response = requests.get(url, headers=headers)
print(response.text)

The ZP_OLD_FLAG cookie tells the server to return the old version of the page, and this time the response really does contain the job listings. I'll spare you the screenshot.
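To confirm programmatically that the legacy page carries the data, you can count the job-title cells with Beautiful Soup. The zwmc class on the <td> elements is the same hook the full spider below relies on; this is just a minimal sketch, reusing response from the request above:

from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, 'lxml')
# Each row in the legacy results table marks its job-title cell with class="zwmc".
cells = soup.find_all('td', class_='zwmc')
print('{} job rows found'.format(len(cells)))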

Code

I keep a handful of settings (database credentials, search city and keyword, page count, and the cookie) in a JSON file, spider.json, so they're easy to change:

{
  "host":"localhost",
  "user":"root",
  "password":"",
  "dbname":"vacation",
  "port":3306,
  "city":"北京",
  "keyword":"python",
  "page":90,
  "Cookie":"ZP_OLD_FLAG=true;"
}
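Before running, a small guard can fail fast if a key is missing from spider.json. This validation step is my own addition, not part of the original script:

import json

REQUIRED_KEYS = {"host", "user", "password", "dbname", "port",
                 "city", "keyword", "page", "Cookie"}

with open("spider.json", encoding="utf-8") as f:
    setting = json.load(f)

missing = REQUIRED_KEYS - setting.keys()
if missing:
    raise SystemExit("spider.json is missing keys: {}".format(missing))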

The spider itself:


from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException
import pymysql
import json
with open("spider.json", encoding='utf-8') as f:  # close the file once the settings are loaded
    setting = json.load(f)
host = setting['host']
user = setting['user']
password = setting['password']
dbname = setting['dbname']
port = setting['port']
city = setting['city']
keyword = setting['keyword']
pagenum = setting['page']
Cookie = setting['Cookie']
def get_one_page(city, keyword, page):
    '''
    Fetch one page of search results and return its HTML.
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
        'Host': 'sou.zhaopin.com',
        'Referer': 'https://www.zhaopin.com/',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': Cookie
    }
    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl={}&kw={}&sm=0&p={}'.format(city, keyword, page)
    try:
        # Fetch the page and use the status code to decide whether it succeeded
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def readonepage(html, db):
    cur = db.cursor()
    soup = BeautifulSoup(html, 'lxml')
    for x in soup.find_all('td'):
        try:
            sybo = x.get('class')
            if sybo == ['zwmc']:
                jobname = x.div.a.get_text()   # job title
                jobhref = x.div.a.get('href')
                if jobhref[9] == 'i':          # non-detail links; the original `pass` was a no-op,
                    continue                   # `continue` actually skips them
                info = get_detailed(jobhref)   # renamed from `list`, which shadows the builtin
                info.append(jobname)
                print(jobname)
                # Parameterized query instead of % string formatting, so quotes in the
                # scraped text cannot break the SQL
                sql = ("INSERT INTO companyinfo(company_name,work_experience,edu_background,"
                       "salary,describes,work_city,work_address,nature,types,scales,url,"
                       "benefits,station,station_id) "
                       "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
                cur.execute(sql, info + [counterid()])
                db.commit()
        except Exception:
            pass   # rows that fail to parse are skipped silently
def counterid(last=[0]):  # keeps track of the database row id
    # Stores the counter in a mutable default argument: `last` is created
    # once, when the function is defined, so it persists across calls.
    nxt = last[0] + 1     # take the stored value and add 1
    last[0] = nxt         # write the new value back for the next call
    return nxt            # the number of times this function has been called
def get_detailed(href):
    res = requests.get(href)
    soup = BeautifulSoup(res.text, 'lxml')
    for x in soup.find_all('ul'):
        try:
            sybo = x.get('class')
            if sybo == ['terminal-ul', 'clearfix']:
                jobinfor = x.get_text()
                lines = jobinfor.split('\n')        # renamed from `str`, which shadows the builtin
                salary = lines[1].split(':')[1]     # salary
                salary = "".join(salary.split())    # strip whitespace and odd characters
                city = lines[2].split(':')[1]       # city
                exp = lines[5].split(':')[1]        # required work experience
                edu = lines[6].split(':')[1]        # required education
                # numb = lines[7].split(':')[1]     # headcount wanted
        except Exception as e:
            print(e)
    for x in soup.find_all('div'):
        try:
            sybo = x.get('class')
            if sybo == ['company-box']:
                str2 = x.get_text().split('\n')
                # Drop empty strings and the "檢視公司地圖" (view company map) link text;
                # the original's nested while/if looks like an indentation slip
                str2 = [s for s in str2 if s and s != '檢視公司地圖']
                comname = str2[0]                   # company name
                scale = str2[1].split(':')[1]       # company size
                nature = str2[2].split(':')[1]      # ownership: private, state-owned, ...
                com_type = str2[3].split(':')[1]    # industry (renamed from `type`, a builtin)
                place = str2[-1]                    # street address
                if len(str2) == 6:                  # some companies list no website
                    website = ' '
                else:
                    website = str2[4].split(':')[1] # company website
            if sybo == ['tab-inner-cont']:
                sty = x.get('style')
                if sty is None:
                    descrip = x.get_text().split('\n')[1]  # job description
                    descrip = "".join(descrip.split())     # strip whitespace
            if sybo == ['welfare-tab-box']:
                fuli = ''
                for elem in x:
                    if elem.string:                 # elem.string can be None for nested tags
                        fuli = fuli + elem.string + ' '    # benefits
        except Exception as e:
            print(e)

    # If a block above was missing from the page, one of these names is unbound;
    # the resulting exception is caught (and the row skipped) in readonepage.
    return [comname, exp, edu, salary, descrip, city, place, nature, com_type, scale, website, fuli]

def main(city, keyword, pages):
    db = pymysql.connect(host=host, user=user, password=password, db=dbname, port=port)
    for i in range(1, pages + 1):       # the legacy site numbers result pages from 1
        html = get_one_page(city, keyword, i)
        if html:                        # skip pages that failed to download
            readonepage(html, db)
    db.close()


if __name__ == '__main__':
    main(city, keyword, pagenum)
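One quirk worth noting: counterid keeps its count in a mutable default argument, so station_id increments across every row inserted in a run but resets to 1 whenever the script restarts. A minimal standalone demonstration of the trick:

def counter(last=[0]):
    last[0] += 1   # the default list is created once, so the value survives between calls
    return last[0]

print(counter(), counter(), counter())  # prints: 1 2 3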

Database schema and the scraped rows as stored (screenshots omitted).
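Since the schema screenshot is omitted, here is a hypothetical companyinfo table that lines up column-for-column with the INSERT in readonepage; the column types and lengths are my assumptions, not the original schema:

import pymysql

# Hypothetical DDL matching the 14 columns used by the INSERT above;
# the real table's types and lengths are unknown, so these are guesses.
DDL = """
CREATE TABLE IF NOT EXISTS companyinfo (
    company_name    VARCHAR(255),
    work_experience VARCHAR(64),
    edu_background  VARCHAR(64),
    salary          VARCHAR(64),
    describes       TEXT,
    work_city       VARCHAR(64),
    work_address    VARCHAR(255),
    nature          VARCHAR(64),
    types           VARCHAR(64),
    scales          VARCHAR(64),
    url             VARCHAR(255),
    benefits        VARCHAR(255),
    station         VARCHAR(255),
    station_id      INT
)
"""

db = pymysql.connect(host='localhost', user='root', password='', db='vacation', port=3306)
with db.cursor() as cur:
    cur.execute(DDL)
db.commit()
db.close()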