1. 程式人生 > >【python】代理反ip限制獲取URL資料

【python】代理反ip限制獲取URL資料

#-*- coding:utf-8 -*-
import datetime
import queue
import threading
import time
from random import choice

import requests
import urllib3

# Silence urllib3 warnings: the scraper issues HTTPS requests with
# verify=False, which would otherwise emit a warning on every request.
urllib3.disable_warnings()

class Scraping:
    """Multi-threaded checker that queries a target URL through rotating proxies.

    Data items (one per line of ``源.txt``) are appended to ``targetUrl`` and
    requested through proxies fetched from ``getProxyUrl``.  Results flow
    through three queues:

    * ``dataQ``  - items waiting to be checked,
    * ``proxyQ`` - proxies that are currently believed to be usable,
    * ``msgQ``   - ``{'data': ..., 'result': ...}`` dicts produced by workers.

    Every loop runs while ``self.status == 'running'``; call :meth:`stop`
    to make all threads wind down.
    """

    # Seconds a consumer waits on an empty queue before re-checking
    # ``self.status``.  Blocking forever would make the threads unstoppable.
    _POLL_TIMEOUT = 0.2

    def __init__(self):
        # Browser User-Agent strings; one is picked at random per request.
        self.uas = [
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
            "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
            "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
        ]
        # Number of items finished since the last tips line was printed.
        self.mainCounter = 0
        # Start time, formatted and raw.
        self.startTimeStr = datetime.datetime.now().strftime('%H:%M:%S')
        self.startTime = datetime.datetime.now()
        # Message queue (worker results).
        self.msgQ = queue.Queue()
        # Data queue (items to check).
        self.dataQ = queue.Queue()
        # Proxy queue.
        self.proxyQ = queue.Queue()
        # Interval, in seconds, between tips lines.
        self.tipsTime = 5
        # Run state: 'running' while the threads should keep going.
        self.status = 'stoped'
        # Handle of the file holding the data appended to the target URL
        # (opened with the platform default encoding, as before).
        self.readHwnd = open(r'源.txt')
        # Most recently accepted proxy.
        self.presentProxy = ''
        # Maximum number of proxies kept in proxyQ.
        self.maxProxyQ = 20
        # Delay, in seconds, between proxy API polls.
        self.getProxyQSpeed = 0.6
        # Proxy provider API.
        self.getProxyUrl = 'http://dynamic.goubanjia.com/dynamic/get/xxxxxxx.html?sep=3'
        # Number of worker threads (also the target length of dataQ).
        self.maxThreadNum = 15
        # Target URL; the data item is appended to it.
        self.targetUrl = 'https://xxx.xxx.com/?regnamesugg&username='

    def getData(self):
        """Return the next non-blank line of the data file, or '' at end of file.

        Blank lines are skipped: returning '' for them would make the caller
        believe the file was exhausted.  The handle is closed once the end of
        the file is reached; later calls keep returning ''.
        """
        if self.readHwnd.closed:
            return ''
        while True:
            line = self.readHwnd.readline()
            if not line:
                self.readHwnd.close()
                return ''
            line = line.strip('\n')
            if line:
                return line

    def addDataThread(self):
        """Keep ``dataQ`` topped up from :meth:`getData` until the data runs out."""
        while self.status == 'running':
            if self.dataQ.qsize() < self.maxThreadNum:
                data = self.getData()
                if data == '':
                    # All data has been read.
                    print('addData:\t資料讀取完畢')
                    return
                self.dataQ.put(data)
            else:
                # Queue is full: yield instead of spinning at 100% CPU.
                time.sleep(0.05)

    def getProxy(self):
        """Fetch one ``ip:port`` from the proxy API.

        Returns the first non-empty line of the response.  On any request
        error, or when the response holds no usable line, returns
        ``self.presentProxy`` so that the caller does not enqueue anything.
        """
        try:
            # A timeout keeps a stalled API from hanging the proxy thread.
            ipReq = requests.get(self.getProxyUrl, timeout=5)
            for ip in ipReq.text.splitlines():
                ip = ip.strip()
                if ip:
                    return ip
        except Exception as e:
            print('getProxy:\t' + str(e))
        return self.presentProxy

    def addProxyThread(self):
        """Keep ``proxyQ`` topped up with proxies returned by :meth:`getProxy`."""
        while self.status == 'running':
            time.sleep(self.getProxyQSpeed)
            if self.proxyQ.qsize() < self.maxProxyQ:
                proxy = self.getProxy()
                # Skip empty answers and repeats of the last accepted proxy.
                if proxy and proxy != self.presentProxy:
                    self.proxyQ.put(proxy)
                    self.presentProxy = proxy
                    print('addProxy:\t新增新proxy ' + proxy)

    def tipsThread(self):
        """Print a status line every ``tipsTime`` seconds and reset the counter."""
        while self.status == 'running':
            time.sleep(self.tipsTime)
            nowTime = datetime.datetime.now()
            # total_seconds() keeps counting past 24h; .seconds wraps around.
            runTime = int((nowTime - self.startTime).total_seconds())
            print('tips:\t執行時間:' + str(runTime) + 's\t速度:' + str(self.mainCounter / self.tipsTime) +
                  '\tmsgQ.qsize:' + str(self.msgQ.qsize()) + '\tdataQ.qsize:' + str(self.dataQ.qsize()) +
                  '\tproxyQ.qsize:' + str(self.proxyQ.qsize()))
            self.mainCounter = 0

    def workThread(self):
        """Worker loop: take a proxy and a data item, request the target URL.

        Puts ``{'data': data, 'result': result}`` on ``msgQ`` where result is:

        *  1 : id already exists
        *  0 : id does not exist
        * -1 : ip restricted
        * -2 : special error (illegal id)
        * -3 : connection refused / timed out
        """
        while self.status == 'running':
            time.sleep(0.1)
            try:
                proxy = self.proxyQ.get(timeout=self._POLL_TIMEOUT)
            except queue.Empty:
                continue
            try:
                data = self.dataQ.get(timeout=self._POLL_TIMEOUT)
            except queue.Empty:
                # Nothing to do: hand the proxy back so another worker can use it.
                self.proxyQ.put(proxy)
                continue
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh;q=0.9",
                "User-Agent": choice(self.uas),
            }
            try:
                req = requests.get(self.targetUrl + data, timeout=3,
                                   proxies={'https': proxy, 'http': proxy},
                                   headers=headers, verify=False)
                result = self.checkRawText(req.text, proxy)
            except Exception:
                # Connection error: the proxy is deliberately not returned.
                result = -3
            self.msgQ.put({'data': data, 'result': result})

    def checkRawText(self, text, proxy):
        """Classify the response body and decide whether ``proxy`` is reusable.

        Returns 1 (exists), 0 (does not exist), -1 (ip restricted) or
        -2 (any other response).  The proxy is put back on ``proxyQ`` in
        every case except -1.
        """
        if '"errno":0' in text:
            # No error: the proxy is still good.
            self.proxyQ.put(proxy)
            return 1 if '"userexsit":1' in text else 0
        if '"errno":500010' in text:
            # ip restricted: drop the proxy.
            return -1
        # Special error: not the proxy's fault, so keep it.
        self.proxyQ.put(proxy)
        return -2

    def _appendLine(self, path, text):
        """Append ``text`` plus a newline to ``path`` and close the file."""
        with open(path, 'a+') as out:
            out.write(text + '\n')

    def _handleMsg(self, msg):
        """Record one worker result, or requeue its data when it must be retried."""
        data = msg['data']
        result = msg['result']
        if result == 1:
            self.mainCounter += 1
            print('handleMsg:\t' + data + '×')
            self._appendLine(r'存在.txt', data)
        elif result == 0:
            self.mainCounter += 1
            print('handleMsg:\t' + data + '√')
            self._appendLine(r'不存在.txt', data)
        elif result == -1:
            print('handleMsg:\t' + data + 'ip限制')
            self.dataQ.put(data)
        elif result == -2:
            self.mainCounter += 1
            print('handleMsg:\t' + data + '特殊error')
        elif result == -3:
            print('handleMsg:\t' + data + '連線錯誤')
            self.dataQ.put(data)

    def handleMsgThread(self):
        """Result-handling loop.

        Reads ``msgQ``; normal results are recorded to file, failed ones
        (-1, -3) are pushed back onto ``dataQ`` to be retried.
        """
        while self.status == 'running':
            try:
                msg = self.msgQ.get(timeout=self._POLL_TIMEOUT)
            except queue.Empty:
                continue
            self._handleMsg(msg)

    def start(self):
        """Set the status to 'running' and start every thread.

        Starts the tips, proxy-maintenance, data-maintenance and
        result-handling threads, then ``maxThreadNum`` worker threads.
        """
        self.status = 'running'
        targets = [
            self.tipsThread,
            self.addProxyThread,
            self.addDataThread,
            self.handleMsgThread,
        ]
        targets.extend(self.workThread for _ in range(self.maxThreadNum))
        for target in targets:
            threading.Thread(target=target).start()

    def stop(self):
        """Ask every thread to finish; each loop exits at its next status check."""
        self.status = 'stoped'


if __name__ == '__main__':
    scr = Scraping()
    scr.start()
    try:
        # Keep the main thread alive so Ctrl+C can be turned into a clean stop.
        while scr.status == 'running':
            time.sleep(1)
    except KeyboardInterrupt:
        scr.stop()

執行結果:

addProxy:       新增新proxy 119.96.195.76:58269
handleMsg:      一爭×
addProxy:       新增新proxy 117.63.204.66:25444
tips:   		執行時間:5s     速度:0.2        msgQ.qsize:0    dataQ.qsize:2   proxyQ.qsize:0
handleMsg:      一從連線錯誤
handleMsg:      一但連線錯誤
addProxy:       新增新proxy 144.123.71.189:53086
tips:   		執行時間:10s    速度:0.0        msgQ.qsize:0    dataQ.qsize:3   proxyQ.qsize:0
handleMsg:      一冼×
addProxy:       新增新proxy 106.112.171.133:33564
handleMsg:      一別×
handleMsg:      一從×
handleMsg:      一但×
tips:   		執行時間:15s    速度:0.8        msgQ.qsize:0    dataQ.qsize:0   proxyQ.qsize:0
addProxy:       新增新proxy 122.4.28.184:22336
addProxy:       新增新proxy 123.180.71.236:63368
tips:   		執行時間:20s    速度:0.0        msgQ.qsize:0    dataQ.qsize:0   proxyQ.qsize:0
addProxy:       新增新proxy 123.163.131.188:43554
addProxy:       新增新proxy 121.228.52.101:62493
tips:   		執行時間:25s    速度:0.0        msgQ.qsize:0    dataQ.qsize:0   proxyQ.qsize:0
addProxy:       新增新proxy 183.147.252.249:19525
addProxy:       新增新proxy 110.88.127.24:56712