
前程無憂 (51job) scraper in practice: crawl any job title by keyword and save the results to a .csv file automatically

![0e644a1fa9dc00c3e7c752bdf4382aa2.jpg](https://upload-images.jianshu.io/upload_images/9136378-72ab92577ff68f7d.jpg?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

Here I only walk through the main idea and the points that need attention. If anything is unclear, ask in the comments and work through the code given below.

1. Searching 51job with a keyword shows that the results are not loaded via Ajax: it is an ordinary GET request, so all you need to do is construct the URL, send the request, and write the parsing rules. XPath is the recommended way to write them; for this kind of loosely structured page data, try XPath first and fall back to regular expressions only where XPath cannot express the rule. To know how many pages to crawl, locate the element holding the maximum page number with XPath, extract the digits from it, and end the loop with an `if` check once that page is reached, as the sketch below shows.
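A minimal sketch of that page-count step, assuming the count still lives in a `<span class="td">` as it did when this was written, and using `python` as an example keyword:

```python
import re

import requests
from lxml import etree

# First result page for the example keyword 'python'.
url = "https://search.51job.com/list/030800%252C040000%252C030200,000000,0000,00,9,99,python,2,1.html"
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
html = etree.HTML(response.content.decode('gbk'))  # 51job serves GBK-encoded pages
# The span text is something like '共52頁,到第頁'; keep only the digits.
td_text = "".join(html.xpath("//span[@class='td']/text()"))
max_page = int("".join(re.findall(r"\d+", td_text)))
print(max_page)
```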

2. The full code is as follows:

```python
# _author_: 'DJS'
# date: 2018-11-19

import csv
import re

import requests
from lxml import etree

headers = {
    "cache-control": "no-cache",
    "postman-token": "72a56deb-825e-3ac3-dd61-4f77c4cbb4d8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
}

def get_url(key1):
    """Yield the listing-page URL for every result page of the keyword search."""
    try:
        i = 0
        # Note: non-ASCII keywords may need percent-encoding before being
        # substituted into the URL.
        url = "https://search.51job.com/list/030800%252C040000%252C030200,000000,0000,00,9,99,{},2,1.html"
        response = requests.get(url.format(key1), headers=headers)
        html = etree.HTML(response.content.decode('gbk'))  # 51job pages are GBK-encoded
        # The total page count sits in a <span class="td">; extract its digits.
        max_page = int("".join(re.findall(r"\d+", "".join(html.xpath("//span[@class='td']/text()")))))
        while True:
            i += 1
            url = "https://search.51job.com/list/030800%252C040000%252C030200,000000,0000,00,9,99,{},2,{}.html"
            url = url.format(key1, i)
            print("*" * 100)
            print("Crawling page %d" % i)
            print("*" * 100)
            yield url
            if max_page == i:  # stop once the last page has been yielded
                break
    except Exception as e:
        print("Could not build the page URLs:", e)

def parse_page(key1):
    """Request every result page and yield one dict per job posting."""
    try:
        for url in get_url(key1):
            response = requests.get(url, headers=headers)
            # The response body is GBK while Python strings are Unicode, so
            # decode the raw bytes as GBK before handing them to lxml; the
            # request encoding and the decoding must match.
            html = etree.HTML(response.content.decode('gbk'))
            rows = html.xpath("//div[@id='resultList']//div[@class='el']")
            for row in rows:
                item = {}
                item["職位"] = "".join(row.xpath("./p/span/a/text()")).replace('\r\n', '').replace(' ', '')
                item["公司名稱"] = "".join(row.xpath("./span[@class='t2']/a/text()")).replace('\r\n', '').replace(' ', '')
                item["工作地點"] = "".join(row.xpath("./span[@class='t3']/text()")).replace('\r\n', '').replace(' ', '')
                item["薪資"] = "".join(row.xpath("./span[@class='t4']/text()")).replace('\r\n', '').replace(' ', '')
                item["釋出時間"] = "".join(row.xpath("./span[@class='t5']/text()")).replace('\r\n', '').replace(' ', '')
                yield item
    except Exception as e:
        print("Could not parse the response:", e)

def save_excel(key1):
    """Stream the parsed items into a CSV file named after the keyword."""
    try:
        header = ['職位', '公司名稱', '工作地點', '薪資', '釋出時間']
        # Open the file once: 'w' creates/overwrites it, the header row goes in
        # first, and every parsed item is appended after it. utf-8-sig lets
        # Excel detect the encoding of the Chinese text.
        with open(key1 + '前程無憂職位資訊.csv', 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, header)
            writer.writeheader()
            for item in parse_page(key1):
                writer.writerow(item)
    except Exception as e:
        print("Could not save the data:", e)

if __name__ == '__main__':
    key1 = input('Enter the job keyword to crawl: ')
    save_excel(key1)
```
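
To sanity-check the output, you can read the file back with `csv.DictReader`; a quick sketch, assuming the script was run with the keyword `python` (so the filename below matches what `save_excel` produced):

```python
import csv

# Print two columns of every saved row as a quick sanity check.
with open('python前程無憂職位資訊.csv', newline='', encoding='utf-8-sig') as f:
    for row in csv.DictReader(f):
        print(row['職位'], row['薪資'])
```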