程式人生 > 建立爬蟲代理IP池

建立爬蟲代理IP池

web odin pro __main__ headers XML Coding txt文件 端口號

#!/usr/bin/python3.5
# -*- coding:utf-8 -*-

import time
import tempfile
from lxml import etree
from urllib import request


# Desktop Chrome User-Agent sent with every request so the target site
# serves us the normal browser version of the page.
# (Original used curly quotes U+2018, which is a SyntaxError in Python.)
user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 '
              'SE 2.X MetaSr 1.0')


def get_content(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Sends the module-level ``user_agent`` header so the proxy-list site
    does not reject the request as a bot.

    :param url: page URL to download.
    :return: decoded page text (str).
    :raises urllib.error.URLError: on connection failure or timeout.
    """
    # ``global`` was unnecessary here: the name is only read, not rebound.
    headers = {'User-Agent': user_agent}
    req = request.Request(url=url, headers=headers)
    # Context manager closes the socket; timeout avoids hanging forever.
    with request.urlopen(req, timeout=10) as res:
        return res.read().decode('utf-8')

def get_info(tmp, content):
    """Extract ip/port pairs from a listing page and write them to *tmp*.

    :param tmp: binary, writable file-like object; one ``ip:port`` line
                (UTF-8 encoded) is appended per proxy found.
    :param content: HTML text of a page containing a table whose id
                    contains "ip_list" (xicidaili.com layout: ip is the
                    2nd cell of each row, port the 3rd).
    :return: None (side effect only: writes to *tmp*).
    """
    # Parse the document once; the original re-parsed it for each XPath.
    tree = etree.HTML(content)
    ip_list = tree.xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()')
    port_list = tree.xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()')
    # zip pairs ip with its port; truncates safely if the lists differ.
    for ip, port in zip(ip_list, port_list):
        tmp.write("{0}:{1}\n".format(ip, port).encode('utf-8'))

def verify_ip(ip, port, test_url):
    """Probe ``ip:port`` as an HTTP proxy against *test_url*.

    Working proxies are appended to ``proxy_info.txt`` as ``ip:port``
    lines; failures are printed, never raised to the caller.

    :param ip: proxy host as a string.
    :param port: proxy port as a string.
    :param test_url: URL fetched through the proxy to test it.
    """
    headers = {'User-Agent': user_agent,
               'Host': 'www.12306.cn',
               'Referer': 'http://www.12306.cn/'}
    proxy = {'http': 'http://%s:%s' % (ip, port)}
    print(proxy)

    proxy_handler = request.ProxyHandler(proxy)
    opener = request.build_opener(proxy_handler)
    # NOTE(review): install_opener replaces the process-wide default
    # opener; every later urlopen() in this process goes through this
    # proxy. Kept for compatibility with the original behaviour.
    request.install_opener(opener)

    req = request.Request(url=test_url, headers=headers)
    time.sleep(1)  # throttle so the test target is not hammered
    try:
        # timeout added: a dead proxy must not hang the whole run;
        # the context manager closes the connection in every case.
        with request.urlopen(req, timeout=5) as res:
            time.sleep(2)
            content = res.read()
        if content:
            print('{0}:{1} is ok'.format(ip, port))
            with open("proxy_info.txt", "a") as fd:  # persist working proxy
                fd.write(ip + ":" + port + "\n")
        else:
            print('{0}:{1} is unavailable'.format(ip, port))
    except request.URLError as e:
        # Covers refused connections, DNS failures and timeouts.
        print(e.reason)


def verify_ip2(ip, port, test_url):
    """Probe ``ip:port`` as an HTTP proxy using the ``requests`` library.

    Working proxies are appended to ``proxy_info.txt``; failures are
    printed together with the exception, never raised to the caller.

    :param ip: proxy host as a string.
    :param port: proxy port as a string.
    :param test_url: URL fetched through the proxy to test it.
    """
    # Local import keeps the third-party dependency optional: verify_ip()
    # above offers a stdlib-only alternative.
    import requests
    try:
        # Return value was unused in the original; reaching here without
        # an exception is what counts as success.
        requests.get(test_url,
                     proxies={'http': 'http://{0}:{1}'.format(ip, port)},
                     timeout=2)
    except Exception as e:
        print("{0}:{1} failed".format(ip, port), e)
    else:
        print("{0}:{1} is ok".format(ip, port))
        with open("proxy_info.txt", "a") as fd:  # persist working proxy
            fd.write(ip + ":" + port + "\n")


if __name__ == '__main__':
    # Scrape page(s) of the free-proxy listing, stash raw ip:port lines in
    # a temp file, then re-read the file and test each proxy.
    url = 'http://www.xicidaili.com/nn/'
    test_url = "http://httpbin.org/"
    url_list = [url + str(i) for i in range(1, 2)]  # page 1 only
    # TemporaryFile is binary-mode and auto-deleted when closed.
    with tempfile.TemporaryFile() as tmp:
        for url in url_list:
            content = get_content(url)
            time.sleep(2)  # be polite to the listing site between pages
            get_info(tmp, content)

        tmp.seek(0)  # rewind before reading what was just written
        for item in tmp.readlines():
            item = item.decode('utf-8')
            # verify_ip(item.split(":")[0], item.split(":")[1].strip(), test_url)
            verify_ip2(item.split(":")[0], item.split(":")[1].strip(), test_url)

  

建立爬蟲代理IP池