
Building Your Own IP Proxy Pool

Build your own IP proxy pool on top of the free proxies published by Xici (西刺, xicidaili.com).

Python libraries required (all installable from PyPI with pip):
  • requests
  • scrapy (for its Selector)
  • pymysql

Database table

create table proxy_ip (
  no BIGINT AUTO_INCREMENT,
  ip VARCHAR(20) UNIQUE NOT NULL,
  port VARCHAR(255) NOT NULL,
  address VARCHAR(20) DEFAULT '',
  proxy_type VARCHAR(5),
  speed DECIMAL(10, 3) DEFAULT 0,  -- bare DECIMAL means DECIMAL(10,0) and would truncate fractional speeds
  PRIMARY KEY (no)
) DEFAULT CHARSET = utf8;
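
If you would rather create the table from Python than from the MySQL shell, here is a minimal sketch using pymysql; the connection values mirror the DB_* constants defined in the script below and should be adjusted to your own setup:

import pymysql

# Connection values mirror the constants used in the main script below.
conn = pymysql.connect(host='localhost', user='username', password='password',
                       database='spider_data', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS proxy_ip (
          no BIGINT AUTO_INCREMENT,
          ip VARCHAR(20) UNIQUE NOT NULL,
          port VARCHAR(255) NOT NULL,
          address VARCHAR(20) DEFAULT '',
          proxy_type VARCHAR(5),
          speed DECIMAL(10, 3) DEFAULT 0,
          PRIMARY KEY (no)
        ) DEFAULT CHARSET = utf8;
    """)
conn.commit()
conn.close()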

The code:

import threading
import requests
import time
from scrapy import Selector
import pymysql
import sys

DB_URL = 'localhost'
DB_USER = 'username'
DB_PASSWORD = 'password'
DB_NAME = 'spider_data'
DB_CHARSET = 'utf8'


class MyProxy:

    def __init__(self):
        # One shared connection, guarded by a lock, since the cleanup thread
        # also calls judge_ip()/delete_ip() on this instance.
        self.conn = pymysql.connect(host=DB_URL, user=DB_USER, password=DB_PASSWORD,
                                    database=DB_NAME, charset=DB_CHARSET)
        self.cursor = self.conn.cursor()
        self.lock = threading.Lock()
        # Pass self to the cleanup thread; the original built a new MyProxy()
        # inside the thread, which spawned another thread, recursing forever.
        DeleteIPThread(self).start()

    def get_ip(self):
        '''
        Pick a random proxy from the database and verify it.
        Returns None when no address is usable.
        :return: (ip, port, speed, proxy_type) or None
        '''
        sql = 'SELECT ip, port, speed, proxy_type FROM proxy_ip ORDER BY RAND() LIMIT 1;'
        with self.lock:
            # execute() returns the number of matched rows; the original
            # checked cursor.arraysize, which is a fetch hint, not a row count.
            found = self.cursor.execute(sql)
            res = self.cursor.fetchone() if found else None
        if res is not None:
            if self.judge_ip(res[0], res[1]):
                return res
            return self.get_ip()  # that proxy was dead (and deleted), try another
        # Pool is empty: crawl a fresh batch, then give up if still empty.
        self.crawl_ips()
        with self.lock:
            self.cursor.execute('SELECT COUNT(*) FROM proxy_ip;')
            count = self.cursor.fetchone()[0]
        return self.get_ip() if count > 0 else None

    def crawl_ips(self):
        '''
        Crawl the Xici free proxy list into the database.
        :return: nothing
        '''
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh,en-US;q=0.9,en;q=0.8,zh-TW;q=0.7,zh-CN;q=0.6",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJTZjNDNmNjgzZWY5OWQ4ZWRmNTA5MzU3YWJiOGJlYWMwBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMVBsU3h6aU0xa25KWlZXZE5qZ0tGd21xYkJtc3J0K2w0YlEwdUhlNjFBN009BjsARg%3D%3D--abe7f4154a205b8515bfb204e3fe924006ae1d68",
            "Host": "www.xicidaili.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36"
        }
        url = "http://www.xicidaili.com/nn/1"
        response = None
        for i in range(10):
            try:
                response = requests.get(url, headers=headers, timeout=10)
            except requests.exceptions.Timeout:
                # The original retried inside the except block, where a second
                # timeout escaped uncaught; just loop and try again instead.
                print("Request timed out, retry %d..." % (i + 1))
                continue
            if response.status_code == 200:
                break
        if response is None or response.status_code != 200:
            print("Network too slow, or our address is banned: all 10 attempts failed")
            return
        # Scrapy's Selector expects a scrapy Response; for a requests
        # response, pass the HTML in via text=.
        s = Selector(text=response.text)
        # The first <tr> is the header row; skip it exactly once
        # (the original sliced [1:] twice and silently dropped a data row).
        all_list = s.xpath('//table[@id="ip_list"]/tr')[1:]
        for item in all_list:
            try:
                line = item.xpath('./td')
                ip = line[1].xpath('string(.)').extract_first()
                port = line[2].xpath('string(.)').extract_first()
                address = ''
                if len(line[3].xpath('./a')) > 0:
                    address = str(line[3].xpath('./a/text()').extract_first())
                proxy_type = line[5].xpath('string(.)').extract_first()
                speed = 0.0
                if len(line[6].xpath('./div/@title')) > 0:
                    speed_str = line[6].xpath('./div/@title').extract_first()
                    speed = float(speed_str[:-1])  # strip the trailing "秒" (seconds)
                print(ip, port, address, proxy_type, speed)
                # Parameterized query instead of str.format(): safe against
                # quoting problems and SQL injection.
                sql = ('INSERT INTO proxy_ip(ip, port, address, proxy_type, speed) '
                       'VALUES (%s, %s, %s, %s, %s);')
                with self.lock:
                    self.cursor.execute(sql, (ip, port, address, proxy_type, speed))
                    self.conn.commit()
            except Exception:
                # Typically a duplicate-key error from the UNIQUE ip column.
                print(sys.exc_info())

    def judge_ip(self, ip, port):
        '''
        Check whether the given proxy is usable.
        :param ip:
        :param port:
        :return: True if usable
        '''
        http_url = 'https://www.baidu.com/'
        proxy_url = 'http://{0}:{1}'.format(ip, port)
        try:
            # Map both schemes; the original only mapped 'http', so the
            # HTTPS test request bypassed the proxy and always "passed".
            proxy_dict = {'http': proxy_url, 'https': proxy_url}
            print("Testing proxy =>", proxy_url)
            response = requests.get(http_url, proxies=proxy_dict, timeout=5)
        except Exception:
            print("Proxy", proxy_url, "is dead, deleting it from the database")
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            # The original tested `code >= 200 or code < 300`, which is
            # always true; `and` (i.e. a range check) is what was meant.
            if 200 <= code < 300:
                print("Proxy =>", proxy_url, "works")
                return True
            self.delete_ip(ip)
            return False

    def delete_ip(self, ip):
        '''
        Delete an unusable proxy.
        :param ip:
        :return:
        '''
        sql = 'DELETE FROM proxy_ip WHERE ip = %s;'
        with self.lock:
            self.cursor.execute(sql, (ip,))
            self.conn.commit()


class DeleteIPThread(threading.Thread):
    '''Daemon thread that periodically re-checks every proxy in the pool.'''

    def __init__(self, proxy):
        super().__init__()
        self.daemon = True
        self.proxy = proxy  # reuse the owning MyProxy instead of creating one

    def run(self):
        # A separate connection for this thread's own SELECTs.
        conn = pymysql.connect(host=DB_URL, user=DB_USER, password=DB_PASSWORD,
                               database=DB_NAME, charset=DB_CHARSET)
        cursor = conn.cursor()
        sql = "SELECT ip, port FROM spider_data.proxy_ip;"
        while True:
            cursor.execute(sql)
            for ip, port in cursor.fetchall():
                print(ip, port)
                # judge_ip() already deletes dead proxies as a side effect.
                self.proxy.judge_ip(ip, port)
                time.sleep(1)
            time.sleep(20)


if __name__ == '__main__':
    my_proxy = MyProxy()
    my_proxy.crawl_ips()
    # my_proxy.get_ip()

Usage

After creating the object, call crawl_ips() to start crawling proxies, then call get_ip() to pick a random IP from the database. The chosen IP is verified before being returned; if it is unusable, get_ip() recurses until it finds one that works, and once every address in the database has been exhausted (all unusable), it automatically re-crawls.
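
For example, a minimal sketch (assumes the classes above are in scope; httpbin.org is just a stand-in target URL):

import requests

my_proxy = MyProxy()
my_proxy.crawl_ips()                  # fill the pool first

res = my_proxy.get_ip()               # (ip, port, speed, proxy_type) or None
if res is not None:
    proxies = {'http': 'http://{0}:{1}'.format(res[0], res[1])}
    # Route an ordinary request through the chosen proxy.
    r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
    print(r.text)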

Creating the object also starts a daemon thread that maintains the pool, re-checking every address and removing the invalid ones.