1. 程式人生 > >簡易爬蟲爬取51job招聘資訊

簡易爬蟲爬取51job招聘資訊

import os
import re
import urllib.request

class Grab(object):
    """Minimal crawler that saves 51job job-posting text to local files.

    ``openurl`` downloads the search-result page at ``self.url``, extracts
    every job-detail link from it and passes each link to ``deal``.
    ``deal`` downloads one detail page and writes the list of its
    ``<p>...</p>`` fragments to ``<output_dir>/<num>.txt``.
    """

    # Class-level counter used to number the output files; it is shared by
    # every instance because ``deal`` increments ``Grab.num`` directly.
    num = 0

    # Directory (relative to the current working directory) that receives
    # the output files.
    output_dir = "zhaopin"

    # Job-detail links. The match is non-greedy and may not cross quotes,
    # whitespace or angle brackets, so two links that share one line of HTML
    # are returned as two URLs instead of one span from the first "http://"
    # to the last ".html".
    _job_link_re = re.compile(r"http://jobs\.51job\.com[^\"'\s<>]+?\.html")

    # Paragraph fragments; "." does not match newlines, so every match lies
    # on a single line of the page.
    _paragraph_re = re.compile(r"<p>.*</p>")

    def __init__(self):
        # URL of the search-result page to crawl.
        self.url = "http://search.51job.com/list/010000,000000,0000,00,9,99,python,2,1.html?"
        # Request headers sent with the search-result request.
        self.headers = {
            "Host": "search.51job.com",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
        }

    def openurl(self):
        """Fetch the search-result page and process every job link on it."""
        request = urllib.request.Request(url=self.url, headers=self.headers)
        # The context manager closes the HTTP response even if reading fails.
        with urllib.request.urlopen(request) as response:
            # Pages are decoded as GBK; undecodable bytes are replaced so a
            # single bad byte does not abort the whole crawl.
            html = response.read().decode("gbk", errors="replace")
        for link in self._job_link_re.findall(html):
            self.deal(link)

    def deal(self, url):
        """Download one job page and save its ``<p>`` fragments to a file.

        Args:
            url: Address of the job-detail page to download.

        The fragments are written as the ``str()`` of a list to
        ``<output_dir>/<Grab.num>.txt``; ``Grab.num`` is then incremented.
        """
        with urllib.request.urlopen(url) as response:
            html = response.read().decode("gbk", errors="replace")
        paragraphs = self._paragraph_re.findall(html)

        # Create the output directory on first use instead of failing with
        # FileNotFoundError when it does not exist yet.
        os.makedirs(self.output_dir, exist_ok=True)
        path = os.path.join(self.output_dir, str(Grab.num) + ".txt")
        # An explicit encoding keeps the write independent of the platform
        # locale, which may be unable to encode Chinese text.
        with open(path, "w", encoding="utf-8") as out_file:
            out_file.write(str(paragraphs))
        Grab.num += 1


def main():
    """Create a crawler and run it once."""
    g = Grab()
    g.openurl()


if __name__ == "__main__":
    main()