1. 程式人生 > boss直聘的反爬取和隨機代理

boss直聘的反爬取和隨機代理

from bs4 import BeautifulSoup
import requests
import ip_proxy
from urllib import parse
# Browser-like User-Agent header sent with every request — presumably to get past
# the site's anti-scraping check (the post's topic is 反爬取 / anti-crawling).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}
def get_boss_info(my_ip, detailed_url):
    """Fetch one job-detail page through the current proxy and print its fields.

    NOTE(review): the original scrape collapsed this function's body onto
    unindented lines (a syntax error); the statements below restore it.

    :param my_ip: ip_proxy.ip_getter instance; its ``ip_proxy_str`` attribute
                  holds the current "ip:port" proxy address.
    :param detailed_url: absolute URL of a Boss Zhipin job-detail page.
    :raises: whatever ``requests`` / BeautifulSoup raise on failure — the
             caller retries with a fresh proxy.
    """
    # url = 'https://www.zhipin.com/job_detail/7e883f0c3a336cb51n142968FFM~.html?ka=search_list_1'
    # Both schemes are routed through the same HTTP proxy address.
    proxy = {
        'http': 'http://' + my_ip.ip_proxy_str,
        'https': 'http://' + my_ip.ip_proxy_str
    }
    response = requests.get(detailed_url, headers=headers, proxies=proxy, timeout=5)
    soup = BeautifulSoup(response.text, 'lxml')
    title = soup.find('h1').text
    # div_ele = soup.find('div', class_="name")
    # print(div_ele)
    salary = soup.find('span', class_="badge").text.replace('\n', '').strip()
    print(title)
    print(salary)
    # Job meta line (location / experience / education), whitespace-normalised.
    gezhong_info = soup.select('div.info-primary > p')[0].text.replace('\n', '').strip()
    print(gezhong_info)
    # Full job-description text block.
    gangwei_info = soup.select('div.text')[0].text
    print(gangwei_info)
def get_detail_url(my_ip, url):
    """Fetch one search-result page and crawl every job-detail link on it.

    :param my_ip: ip_proxy.ip_getter instance providing ``ip_proxy_str`` and
                  ``update_ip_proxy_str()`` for proxy rotation.
    :param url: absolute URL of a Boss Zhipin search-result (listing) page.
    """
    # url = 'https://www.zhipin.com/c101010100/h_101010100/?query=python&page=2&ka=page-2'
    # Same proxy address is used for both http and https traffic.
    proxies = {
        'http': 'http://' + my_ip.ip_proxy_str,
        'https': 'http://' + my_ip.ip_proxy_str,
    }
    page = requests.get(url, headers=headers, proxies=proxies, timeout=5)
    dom = BeautifulSoup(page.text, 'lxml')

    for anchor in dom.select('div.job-list > ul > li div.info-primary > h3 > a'):
        # Tag attributes support dict-style access; hrefs here are relative,
        # so resolve them against the listing-page URL.
        detail_href = parse.urljoin(url, anchor['href'])
        print('詳情頁的href: ' + detail_href)
        # Try the detail page up to three times, rotating the proxy after each
        # failure; if all three attempts fail, skip this link.
        for _attempt in range(3):
            try:
                get_boss_info(my_ip, detail_href)
                break
            except Exception as e:
                print(e)
                my_ip.update_ip_proxy_str()
def get_all_info(my_ip):
    """Crawl the first three result pages of the hard-coded python query.

    Fix: the original reused ``i`` for both the page index and the retry
    counter, shadowing the outer loop variable — harmless here only because
    ``url`` is computed before the inner loop, but fragile; the two loops now
    use distinct names.

    :param my_ip: ip_proxy.ip_getter instance, rotated on request failure.
    """
    base_url = 'https://www.zhipin.com/c101010100/h_101010100/?query=python&page=%s&ka=page-%s'
    for page in range(1, 4):
        # URL of one paginated search-result page.
        url = base_url % (page, page)
        # Retry up to four times, swapping in a new proxy after each failure;
        # if all four attempts fail, skip this page.
        for attempt in range(4):
            try:
                get_detail_url(my_ip, url)
                break
            except Exception as e:
                print(e)
                my_ip.update_ip_proxy_str()

if __name__ == '__main__':
    # Acquire a proxy-IP holder from the helper module; it fetches its first
    # proxy address on construction.
    my_ip = ip_proxy.ip_getter()
    # Get one ip
    # proxy_str = '36.27.143.72:21450'
    # print(proxy_str)
    # Crawl all the Boss Zhipin job information.
    get_all_info(my_ip)

# with open('boss.html', 'wb') as f:
#     f.write(response.content)
# 代理的獲取檔案 — the proxy-acquisition file (the ip_proxy module, appended below)
import requests

class ip_getter(object):
    """Holds the current proxy address ("ip:port") and refreshes it on demand."""

    def __init__(self):
        # Fetch an initial proxy address from the paid-proxy API.
        self.ip_proxy_str = get_ip_string()

    def update_ip_proxy_str(self):
        """Replace the stored proxy address with a freshly fetched one."""
        fresh = get_ip_string()
        self.ip_proxy_str = fresh
        print('get one ip : ' + fresh)


def get_ip_string():
    """Fetch one fresh proxy address ("ip:port") from the KDL proxy API.

    Fixes: adds a request timeout so a dead API endpoint cannot hang the
    crawler forever, and strips surrounding whitespace from the body — the
    API is line-oriented (``sep=1``), and a trailing newline would corrupt
    the ``'http://' + ip`` proxy URLs built by the callers.

    :returns: the proxy address as a bare string.
    """
    url = 'http://dps.kdlapi.com/api/getdps/?orderid=963491899590153&num=1&pt=1&ut=1&dedup=1&sep=1'
    response = requests.get(url, timeout=5)

    return response.text.strip()