1. 程式人生 > >ip代理池學習

ip代理池學習

aid req tex xpath ip代理 權重 pre tps 登錄

代理的作用

網上有許多售賣代理的網站,也有免費的,不過免費代理的功效會受影響。通過代理,我們可以向所訪問的目標伺服器隱藏自己的真實ip,避免ip地址因訪問頻率過高等原因被封。

步驟
1.搜集一個免費的代理
2.通過urllib.request.ProxyHandler構造一個代理,以字典形式,鍵名是協議

# Address (host:port) of the proxy to route traffic through.
proxy = '95.45.235.178:40056'
# ProxyHandler takes a mapping of URL scheme -> proxy URL.
proxy_handler = ProxyHandler(dict(
    http='http://' + proxy,
    https='https://' + proxy,
))

3.通過urllib.request.build_opener構造一個請求方法

# Build an opener that includes proxy_handler in its handler chain.
opener = build_opener(proxy_handler)

4.發起請求

from urllib.error import URLError
from urllib.request import ProxyHandler,  build_opener


# Address (host:port) of the proxy to route traffic through.
proxy = '95.45.235.178:40056'
# If the proxy requires a login, put the credentials in front of the host:
# proxy = 'username:password@95.45.235.178:40056'
proxy_handler = ProxyHandler({
    'http': 'http://'+proxy,
    'https': 'https://'+proxy
})
opener = build_opener(proxy_handler)
try:
    # The context manager closes the response (and its socket) even when
    # reading or decoding the body fails.
    with opener.open('http://httpbin.org/get') as response:
        print(response.read().decode("utf-8"))
except URLError as e:
    print(e.reason)

完整代碼

from lxml import etree
from time import sleep
import requests
from requests.exceptions import ProxyError, ConnectTimeout, ReadTimeout


# Request headers carrying a desktop Chrome User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}

# Proxy pool keyed by weight. The original note describes a 0, 1, 2, ...
# scale where 9 means usable and 0 means unusable; only buckets 0 and 1 are
# created here.
ip_poor = {weight: [] for weight in (0, 1)}


# Crawl free proxy addresses.
def crawl_ip():
    """Scrape pages 1-10 of the kuaidaili free-proxy listing.

    Returns:
        A list with one entry per page; each entry is a list of
        ``'ip:port'`` strings built from the first two table columns.

    Raises:
        requests.RequestException: if a page cannot be fetched, including
            when the request exceeds the timeout.
    """
    ip_list = []
    for page in range(1, 11):
        # Send the module-level browser headers, and bound the wait so that
        # one stalled connection cannot hang the whole crawl.
        response = requests.get(
            "https://www.kuaidaili.com/free/inha/{}/".format(page),
            headers=headers,
            timeout=10,
        )
        response.encoding = 'utf-8'
        html = etree.HTML(response.text)
        ip = html.xpath('//*[@id="list"]/table/tbody/tr/td[1]/text()')
        port = html.xpath('//*[@id="list"]/table/tbody/tr/td[2]/text()')
        # Strip stray whitespace from the cell text so the joined address is
        # directly usable as host:port.
        ip_list.append(
            [host.strip() + ':' + number.strip() for host, number in zip(ip, port)]
        )
        # Pause between pages to keep the request rate low.
        sleep(2)
    return ip_list


# Check whether a proxy address is usable.
def test_ip(ip_addr):
    """Return True if a request routed through *ip_addr* succeeds.

    Args:
        ip_addr: Proxy address as a ``'host:port'`` string.

    Returns:
        True when http://httpbin.org/get answers with status 200 through the
        proxy within 5 seconds; False on any request failure or other status.
    """
    proxies = {
        'http': 'http://'+ip_addr,
        'https': 'https://'+ip_addr,
    }
    try:
        resp = requests.get('http://httpbin.org/get', proxies=proxies, headers=headers, timeout=5)
    except requests.RequestException:
        # RequestException is the base class of ProxyError, ConnectTimeout and
        # ReadTimeout, and also covers failures such as dropped connections,
        # so one bad proxy cannot abort the whole pool check.
        return False
    # An error status means the proxy did not relay the request successfully.
    return resp.status_code == 200


# Test entry point.
def test():
    """Re-check every pooled proxy and move it between the buckets.

    Pass 1: proxies in bucket 1 that fail ``test_ip`` are demoted to bucket 0.
    Pass 2: proxies in bucket 0 (including those just demoted) are tried once
    more; a success promotes them to bucket 1, another failure drops them
    from the pool entirely.

    Returns:
        The module-level ``ip_poor`` dict, mutated in place.
    """
    # Iterate over snapshots so the live lists can be modified in the loop.
    for ip in list(ip_poor[1]):
        if not test_ip(ip):
            ip_poor[1].remove(ip)
            ip_poor[0].append(ip)
    for ip in list(ip_poor[0]):
        usable = test_ip(ip)
        # The address leaves bucket 0 whether it passed or failed. Remove it
        # from the bucket list, not from the dict: popping the dict by list
        # index deleted a whole bucket or raised KeyError.
        ip_poor[0].remove(ip)
        if usable:
            ip_poor[1].append(ip)
    return ip_poor


def main():
    """Load every crawled address into bucket 1 of ``ip_poor``, then run test()."""
    pages = crawl_ip()
    # crawl_ip returns one list of addresses per page; flatten them into the pool.
    for page_addresses in pages:
        ip_poor[1].extend(page_addresses)
    test()

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

ip代理池學習