
Multithreaded crawler with a proxy IP pool

The script below pairs a small pool of free HTTP proxies with a ThreadPoolExecutor: each crawl task picks a random proxy, and the outer loop keeps re-running the failed task numbers until every one has been fetched.

# coding=utf-8

import requests
import time
from retrying import retry
from concurrent.futures import ThreadPoolExecutor
import random


def get_pro():
    # Static pool of free HTTP proxies (ip:port). Free proxies die quickly,
    # so replace these with addresses that are currently alive.
    pro_list = ['122.114.31.177:808', '61.135.217.7:80', '113.121.243.109:808',
                '171.39.40.5:8123', '121.31.199.30:8123', '111.155.116.240:8123',
                '125.121.121.171:808', '115.213.178.192:808']
    return pro_list


start = time.perf_counter()  # start timing (time.clock() is gone in Python 3.8+)

listdo = list(range(8))  # task numbers still waiting to be crawled

while True:
    listye = []  # task numbers that succeeded this round
    listno = []  # task numbers that failed this round
    event = []   # responses returned by the worker threads

    @retry(stop_max_attempt_number=8)  # retry up to 8 times if crawl() raises
    def crawl(n):
        pro_list = get_pro()
        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
        # random.choice fixes the off-by-one in randint(0, len(pro_list)),
        # which could index one past the end of the pool
        proxies_l = {'http': random.choice(pro_list)}
        print(proxies_l['http'])
        try:
            # timeout added so a dead proxy fails fast instead of hanging
            req = requests.get('http://httpbin.org/ip',
                               headers=header, proxies=proxies_l, timeout=5)
            print('finish')
            listye.append(n)
            listdo.remove(n)
            print(listdo)
            return req.text
        except requests.RequestException:
            # swallowing the error here means @retry never fires; the outer
            # while-loop is what actually re-runs the failed numbers
            print('no proxies')
            listno.append(n)

    # Multithreading: fan the remaining task numbers out over a thread pool
    def multithreading():
        number = list(listdo)  # copy, since crawl() removes items from listdo
        with ThreadPoolExecutor(max_workers=10) as executor:
            for result in executor.map(crawl, number):
                event.append(result)
        return event

    event = multithreading()
    print('listye')
    print(listye)
    print('listno')
    print(listno)
    print('listdo')
    print(listdo)
    if len(listdo) == 0:
        break

end = time.perf_counter()  # stop timing
print("Crawl finished; elapsed time:")
print(end - start)
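Because the free proxies in a static pool go stale quickly, it can help to filter the pool down to addresses that still respond before starting a crawl. Here is a minimal sketch of such a pre-flight check; it reuses the ip:port pool format and the http://httpbin.org/ip test URL from the script above, while the check_proxy/check_pool names and the 3-second timeout are illustrative assumptions, not part of the original post.

# Pre-flight check for the proxy pool: probe every address concurrently and
# keep only the ones that answer. check_proxy/check_pool and the 3-second
# timeout are illustrative choices, not from the original script.
import requests
from concurrent.futures import ThreadPoolExecutor


def check_proxy(addr):
    # Return addr if http://httpbin.org/ip is reachable through it, else None.
    try:
        requests.get('http://httpbin.org/ip',
                     proxies={'http': 'http://' + addr}, timeout=3)
        return addr
    except requests.RequestException:
        return None


def check_pool(pool):
    # Probe all proxies in parallel and keep only the live ones.
    with ThreadPoolExecutor(max_workers=10) as executor:
        return [a for a in executor.map(check_proxy, pool) if a is not None]


# Usage: filter the pool once, then crawl with the survivors.
# live = check_pool(get_pro())

Calling check_pool(get_pro()) once up front trades a few seconds of probing for far fewer retries inside the main crawl loop.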