程式人生 > python3 爬蟲 爬取智聯招聘崗位資訊

python3 爬蟲 爬取智聯招聘崗位資訊

這套程式基於python3 ,使用requests和re正則表示式,只需要將程式儲存為.py檔案後,即可將抓取到的資料儲存到指定路徑的Excel檔案中。程式在終端中啟動,啟動命令:

#python3 檔名.py 關鍵字 城市
python3 zhilian.py python 杭州

程式碼如下:

# coding:utf-8
import requests
import re
import xlwt
import sys,os

# Module-level workbook/worksheet shared by ZhiLian.save_content; created once
# at import time so successive per-page saves keep writing to the same sheet.
workbook = xlwt.Workbook(encoding='utf-8')
booksheet = workbook.add_sheet('Sheet 1', cell_overwrite_ok=True)


class ZhiLian(object):
    """Scrape job postings from the Zhaopin (智聯招聘) mobile site into an Excel file.

    Usage (from the command line):  python3 zhilian.py <keyword> <city>
    """

    def __init__(self):
        # Mobile search URL template: {city-code}/?keyword={kw}&pageindex={page}.
        self.start_url = 'https://m.zhaopin.com/{}/?keyword={}&pageindex={}&maprange=3&islocation=0&order=4'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Mobile Safari/537.36"
        }
        # Regex extracting (job name, salary, company, city, publish time)
        # from one result card of the listing page HTML.
        self.test_url = '<section class="job-list.*?".*?>.*?<div class="job-name fl ">(.*?)</div>.*?<div class="fl">(.*?)</div>.*?<div class="comp-name fl">(.*?)</div>.*?<span class="ads">(.*?)</span>.*?<div class="time fr">(.*?)</div>'
        self.select_city_url = 'https://m.zhaopin.com/searchjob/selectcity'
        # Regex extracting (city code, url slug, city display name).
        self.test_city = ' <a data-code="(.*?)" href="/(.*?)/">(.*?)</a>'

    def parse_url(self, url):
        """GET *url* with the mobile User-Agent and return the decoded body."""
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_data(self, test_url, content):
        """Run regex *test_url* over *content*; return the list of match tuples."""
        content_list = re.findall(test_url, content, re.S)
        return content_list

    def get_content(self, content_list, DATA):
        """Append each 5-field match tuple from *content_list* onto *DATA*."""
        for content in content_list:
            DATA.append((content[0], content[1], content[2],
                         content[3], content[4]))

    def save_content(self, DATA, city, key_words):
        """Write every row of *DATA* to the shared worksheet and save the .xls file.

        Saves to the author's desktop path when it exists, otherwise to the
        current working directory.
        """
        for i, row in enumerate(DATA):
            for j, col in enumerate(row):
                booksheet.write(i, j, col)
        if os.path.isdir('/home/itcast/Desktop/'):
            workbook.save('/home/itcast/Desktop/{}_{}.xls'.format(city, key_words))
        else:
            workbook.save('{}_{}.xls'.format(city, key_words))

    def select_city(self, url, search_city):
        """Fetch the city-selection page and return '<slug>-<code>' for *search_city*.

        Returns None when the city display name is not found on the page.
        """
        city_dict = {}
        city_code = None
        r = requests.get(url, headers=self.headers)
        content = r.content.decode()
        city_content = re.findall(self.test_city, content, re.S)
        # Build {code: [slug, display-name]}, e.g. '566': ['tangshan', '唐山'].
        for city in city_content:
            city_dict[city[0]] = [city[1], city[2]]
        for code, value in city_dict.items():
            if search_city == value[1]:
                city_code = value[0] + '-' + code
        return city_code

    def deal_city(self, city):
        """Resolve *city* to its URL code; exit with a message when unknown."""
        city_code = self.select_city(self.select_city_url, city)
        if city_code is None:  # fixed: identity comparison instead of '== None'
            print("查詢城市不存在,請重試")
            sys.exit()
        return city_code

    def run(self, city, key_words):
        """Crawl result pages for *key_words* in *city* until empty or 100 pages."""
        i = 1
        DATA = [('崗位', '月薪', '公司', '城市', '釋出時間')]
        city_code = self.deal_city(city)
        while True:
            url = self.start_url.format(city_code, key_words, i)
            content = self.parse_url(url)
            content_list = self.get_data(self.test_url, content)
            self.get_content(content_list, DATA)
            # Re-save the cumulative data after every page so a crash still
            # leaves a usable partial file.
            self.save_content(DATA, city, key_words)
            # Stop when a page yields no matches, or cap at 100 pages.
            if len(content_list) == 0 or i > 100:
                print("儲存完成,共{}頁資料".format(i - 1))
                break
            print("正在儲存第{}頁資料".format(i))
            i += 1


if __name__ == '__main__':
    key_words = sys.argv[1]
    city = sys.argv[2]
    zhilian = ZhiLian()
    zhilian.run(city, key_words)

爬取結果如下:
爬取結果