程式人生 > 爬取51job招聘網

爬取51job招聘網

import urllib.request
from bs4 import BeautifulSoup
import time
import pymongo
import pymysql

#https://search.51job.com/list/170200,000000,0000,00,9,99,python,2,2.html

def handle_request(keyword, page, url):
    """Build an urllib Request for one 51job search-results page.

    Args:
        keyword: search keyword substituted into the first '{}' of *url*.
        page: page number substituted into the second '{}' of *url*.
        url: URL template containing two '{}' placeholders.

    Returns:
        urllib.request.Request carrying a browser User-Agent header
        (so the site does not reject the scraper as a bot).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    }
    url = url.format(keyword, page)
    request = urllib.request.Request(url=url, headers=headers)
    return request
# Parse the results page with bs4.
def parse_content(content, db):
    """Extract every job posting from one 51job results page and store it.

    Args:
        content: decoded HTML text of a search-results page.
        db: open pymysql connection, passed through to save_to_mysql().
    """
    soup = BeautifulSoup(content, 'lxml')
    # The first '.el' row is the table header — skip it.
    row_list = soup.select('#resultList > .el')[1:]

    # Parse each result row in turn.
    # (Loop variable renamed from `os`, which shadowed the stdlib module name.)
    for row in row_list:
        # Job title
        jobname = row.select('.t1 > span > a')[0]['title']
        # Company name
        company = row.select('.t2 > a ')[0]['title']
        # Work location (may be None if the cell has nested markup)
        area = row.select('.t3')[0].string
        # Salary
        salary = row.select('.t4')[0].string
        # Publish date
        publish_time = row.select('.t5')[0].string

        items = {
            '公司職業': jobname,
            '公司名稱': company,
            '工作地點': area,
            '薪資': salary,
            '釋出時間': publish_time,
        }
        save_to_mysql(db, items)

# Option 1: store results in MySQL.
def connect_db():
    """Open a pymysql connection to the local '51job' database.

    The database (and the `job` table) must already exist.
    Returns the open connection; the caller is responsible for closing it.
    """
    # Engine note: MySQL tables use either InnoDB or MyISAM.
    db = pymysql.Connect(host='localhost', port=3306, user='root',
                         password='123456', database='51job', charset='utf8')
    return db

# Option 2: store results in MongoDB.
def connect_mongodb():
    """Connect to a local MongoDB server and return the client.

    The caller picks the database/collection and closes the client.
    """
    client = pymongo.MongoClient(host='localhost', port=27017)
    return client
# When using MySQL you must create the database and the matching `job` table yourself.
def save_to_mysql(db, items):
    """Insert one job posting into the `job` table.

    Args:
        db: open pymysql connection.
        items: dict with the Chinese keys produced by parse_content().

    Commits on success; on failure prints the error and rolls back
    (best-effort storage — one bad row must not abort the crawl).
    """
    cursor = db.cursor()
    # Parameterized query: the driver escapes the values, so quotes in
    # scraped text cannot break the statement (and no SQL injection).
    sql = ('insert into job(jobname, company, area, salary, publish_time) '
           'values(%s, %s, %s, %s, %s)')
    try:
        cursor.execute(sql, (items['公司職業'], items['公司名稱'],
                             items['工作地點'], items['薪資'], items['釋出時間']))
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    finally:
        # Always release the cursor, even after a failed insert.
        cursor.close()

def main():
    """Prompt for a keyword and page range, then scrape and store each page."""
    keyword = input('請輸入要搜尋的關鍵字-')
    start_page = int(input('請輸入起始頁碼-'))
    end_page = int(input('請輸入結束頁碼-'))
    url = 'https://search.51job.com/list/010000,000000,0000,00,9,99,{},2,{}.html'
    # fp = open('job.txt', 'w', encoding='utf8')
    db = connect_db()
    # Alternative: MongoDB storage.
    # client = connect_mongodb()
    # db = client.job51   # pick the MongoDB database
    # fp = db.job         # pick the MongoDB collection
    try:
        # Crawl each page in the requested range, inclusive.
        for page in range(start_page, end_page + 1):
            print('正在爬取--第%s頁--....' % page)
            request = handle_request(keyword, page, url)
            # 51job serves GBK-encoded pages, not UTF-8.
            content = urllib.request.urlopen(request).read().decode('gbk')
            parse_content(content, db)
            print('結束爬取--第%s頁--...' % page)
            time.sleep(2)  # throttle: be polite to the server
    finally:
        # Close the connection even if a page fails mid-crawl.
        db.close()
    # fp.close()
    # client.close()

# Script entry point (the blog paste had stripped the double underscores
# from `__name__` / `__main__`, which made this a NameError at runtime).
if __name__ == '__main__':
    main()