
Web Crawler Notes (Day 5): Tencent Recruitment & Lagou

The analysis process is the same as for Lianjia.

The complete code for the Tencent recruitment site is as follows:

import requests
from lxml import etree
from mysql_class import Mysql  # our own MySQL wrapper class


def txshezhao(keywords, page):
    '''
    :param keywords: search keyword used to filter the job listings
    :param page: controls how many result pages are crawled
    :return: stores the scraped fields in the tengxun table of the text database
    '''
    count = 0
    while count <= page:   # pages 0..page, 10 jobs per page (start = count * 10)
        url = 'https://hr.tencent.com/position.php?keywords={}&lid=2156&tid=87&start={}#a'.format(keywords, count*10)
        count += 1
        
        headers = {
            'Cookie': '_ga=GA1.2.552710032.1529846866; pgv_pvi=5319122944; PHPSESSID=a7let8q1aup7j9p40mubjq8h64; pgv_si=s6819970048',
            'Host': 'hr.tencent.com',
            'Referer': 'https://hr.tencent.com/position.php?keywords=%E8%AF%B7%E8%BE%93%E5%85%A5%E5%85%B3%E9%94%AE%E8%AF%8D&lid=2156&tid=87',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        }
        
        res = requests.get(url, headers=headers)
        html = etree.HTML(res.text)
        
        for every in range(2, 12):  # table rows 2-11 hold the 10 job listings
            res_href = html.xpath('//table[@class="tablelist"]/tr[{}]/td[1]/a/@href'.format(every))
            href = 'https://hr.tencent.com/' + res_href[0]
            # print(href)   # detail-page URL of each of the 10 jobs on this page
            res = requests.get(href, headers=headers)
            # print(res.text)
            html1 = etree.HTML(res.text)
            info1 = html1.xpath('//td[@id="sharetitle"]//text()')
            
            job_name = str(info1[0])
            # print(job_name)
            res_msg = html1.xpath('//tr[@class ="c bottomline"]/td//text()')
            # print(res_msg)   # e.g. ['工作地點:', '北京', '職位類別:', '技術類', '招聘人數:', '1人'] (location / category / headcount)
            
            address = str(res_msg[1])
            # print(address)  # 北京
            
            category = str(res_msg[3])
            # print(category)
            
            number = str(res_msg[5])
            # print(number)
    
            information_list = html1.xpath('//table[@class="tablelist textl"]/tr[4]/td/ul//text()')
            # concatenate every text node of the requirements list
            information = ''.join(str(piece) for piece in information_list)
            # print(information)
            
            data = (job_name, address, category, number, information)
            Insert.mysql_op(sql, data)
    
        
if __name__ == '__main__':
    # MySQL helper instance
    Insert = Mysql()
    # SQL statement to execute
    sql = '''INSERT INTO tengxun (job_name, address, category, number, information) VALUES(%s, %s, %s, %s, %s)'''
    
    print('Enter a keyword to crawl:')
    keywords = input()
    txshezhao(keywords, 5)
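
As an aside, the `range(2, 12)` row indexing above can be collapsed into a single XPath query that collects every detail link on a list page at once; rows whose first cell contains no link (such as the header) simply do not match. A minimal sketch under that assumption (the function name is illustrative, not part of the original script):

import requests
from lxml import etree

def list_page_links(url, headers):
    '''Collect all job detail links on one list page with a single XPath query.'''
    html = etree.HTML(requests.get(url, headers=headers).text)
    # only rows whose first cell contains an <a> tag are matched
    return ['https://hr.tencent.com/' + rel
            for rel in html.xpath('//table[@class="tablelist"]/tr/td[1]/a/@href')]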

The complete code for Lagou is as follows:

import requests
from lxml import etree
import pymysql


class Mysql(object):
    '''Wrapper class for database operations'''
    
    def __init__(self):
        '''Connect to the database and create a cursor'''
        self.db = pymysql.connect(host="localhost", user="root", password="8888", database="test")
        self.cursor = self.db.cursor()
    
    def mysql_op(self, sql, data):
        '''Execute a parameterized SQL statement and commit'''
        self.cursor.execute(sql, data)
        self.db.commit()
    
    def __del__(self):
        '''Close the cursor and the connection'''
        self.cursor.close()
        self.db.close()


# MySQL helper instance
Insert = Mysql()
# SQL statement to execute
sql = '''INSERT INTO lagou (company, job_name, salary, adress, jingyan, school,job_des) VALUES(%s, %s, %s, %s, %s, %s, %s)'''

url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false'
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'JSESSIONID=ABAAABAAAGFABEF780FE198208BF21A58749B6B7C26C915; _ga=GA1.2.1321423683.1534510673; _gid=GA1.2.581729554.1534510673; user_trace_token=20180817205757-29e3715f-a21d-11e8-a9f0-5254005c3644; LGUID=20180817205757-29e375b6-a21d-11e8-a9f0-5254005c3644; index_location_city=%E5%8C%97%E4%BA%AC; TG-TRACK-CODE=search_code; X_HTTP_TOKEN=87d99de12746e518d50f2fe7fede59a0; PRE_UTM=; LGSID=20180818000633-829355a0-a237-11e8-a9f0-5254005c3644; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D33WZv6WWqh6LDiUr0dWxB6F4E9letiquzVMR10EQdIG%26wd%3D%26eqid%3Dd381bb6900049a8a000000035b76c64a; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1534510675,1534521990,1534522180; LGRID=20180818001050-1bed8a51-a238-11e8-91ae-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1534522248; SEARCH_ID=444ab1d908b04a32b195b1ac433ef583',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest',
}
for page in range(1, 30):
    form = {
        'first': 'false',
        'pn': page,
        'kd': '資料分析'  # search keyword: "data analysis"
    }
    
    response = requests.post(url, headers=headers, data=form)
    html = response.json()
    for url0 in range(15):  # 15 job postings per result page
       
        info = html["content"]["positionResult"]["result"][url0]["positionId"]  # e.g. 4605300 (int)
       
        url1 = 'https://www.lagou.com/jobs/' + str(info) + '.html'
        # print(url1)
        
        res = requests.get(url1, headers=headers)
        res_html = res.text
        res_element = etree.HTML(res_html)
        # stop when the detail page lacks the expected markup (e.g. the request was blocked)
        if not res_element.xpath('//div[@class="job-name"]/div[1]'):
            break
        company = res_element.xpath('//div[@class="job-name"]/div[1]')[0].text

        job_name = res_element.xpath('//div[@class="job-name"]/span')[0].text

        salary = res_element.xpath('//dd[@class="job_request"]/p/span[1]')[0].text

        adress = res_element.xpath('//dd[@class="job_request"]/p/span[2]')[0].text

        jingyan = res_element.xpath('//dd[@class="job_request"]/p/span[3]')[0].text

        school = res_element.xpath('//dd[@class="job_request"]/p/span[4]')[0].text
        # description = res_element.xpath('//dd[@class="job_bt"]/h3')[0].text
        # print(description)
        
        des_msg = res_element.xpath('//dd[@class="job_bt"]/div//text()')
        # print(des_msg)
        # concatenate every text node of the job description block
        job_des = ''.join(str(piece).strip('\n') for piece in des_msg)
        print(job_des)

        data = (str(company), str(job_name), str(salary), str(adress).strip('/'), str(jingyan).strip('/'), str(school).strip('/'), str(job_des))
        Insert.mysql_op(sql, data)
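
One design note on the `Mysql` wrapper above: relying on `__del__` to close the cursor and connection is fragile, because Python gives no guarantee about when (or whether) the destructor runs. A sketch of a context-manager variant with the same connection parameters, which closes deterministically (the class name `MysqlCtx` is illustrative):

import pymysql

class MysqlCtx(object):
    '''Context-manager variant of the Mysql wrapper: closes deterministically.'''

    def __enter__(self):
        self.db = pymysql.connect(host="localhost", user="root", password="8888", database="test")
        self.cursor = self.db.cursor()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.cursor.close()
        self.db.close()

    def mysql_op(self, sql, data):
        '''Execute a parameterized SQL statement and commit.'''
        self.cursor.execute(sql, data)
        self.db.commit()

Usage would then be `with MysqlCtx() as insert: insert.mysql_op(sql, data)`, so the connection is released even if an exception is raised mid-crawl.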
        

Sites like Lianjia, Lagou, Boss, and so on can be used for learning and practice; please do not crawl too much data.
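
Following that advice, the simplest safeguard is a small randomized pause before every request, so the crawl never hammers the server. A minimal sketch (the function name and delay bounds are arbitrary choices, not part of the original scripts):

import random
import time

import requests

def polite_get(url, headers, min_delay=1.0, max_delay=3.0):
    '''Sleep a random interval before each GET to keep the request rate low.'''
    time.sleep(random.uniform(min_delay, max_delay))
    return requests.get(url, headers=headers, timeout=10)

Swapping the `requests.get(...)` calls in the scripts above for `polite_get(...)` keeps the request rate well below one per second.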