1. 程式人生 > python學習筆記——爬蟲2——反反爬

python學習筆記——爬蟲2——反反爬

import requests
import re
import random
import time

#建立一個反反爬的類
#建立一個反反爬的類
class download:
    """Anti-anti-crawling download helper.

    On construction it scrapes a list of free HTTP proxy "ip:port" strings
    from haoip.cc and keeps a pool of desktop browser User-Agent strings.
    ``get()`` fetches a URL with a randomly chosen User-Agent, retrying on
    failure and falling back to random proxies from the scraped list.
    """

    def __init__(self):
        # Pool of proxy "ip:port" strings scraped from haoip.cc.
        self.iplist = []
        # Fetch the free-proxy listing page (note: performs network I/O).
        html = requests.get(' http://haoip.cc/tiqu.htm')
        # Grab every span between "r/>" and "<b"; re.S lets '.' also match
        # newlines. findall returns a list of the captured groups.
        iplistn = re.findall(r'r/>(.*?)<b', html.text, re.S)
        for ip in iplistn:
            # Strip embedded newlines, then surrounding whitespace.
            i = re.sub('\n', '', ip)
            self.iplist.append(i.strip())

        # Desktop-browser User-Agent strings rotated per request so the
        # server sees varied, realistic clients.
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]

    def get(self, url, timeout, proxy=None, num_retries=6):
        """GET *url* with a random User-Agent.

        First tries a direct connection, retrying up to *num_retries*
        times (10 s apart). After the direct retries are exhausted it
        switches to random proxies from ``self.iplist``, again retrying
        up to *num_retries* times before giving up the proxy and starting
        over without one.

        Returns the ``requests.Response`` of the first successful fetch.
        """
        UA = random.choice(self.user_agent_list)
        # BUG FIX: the HTTP header name is 'User-Agent' (hyphen).  The
        # original 'User_Agent' key is not a real header, so the random
        # UA was silently ignored by servers.
        headers = {'User-Agent': UA}

        if proxy is None:  # direct connection (no proxy yet)
            try:
                # BUG FIX: the caller-supplied timeout was previously
                # dropped in this branch; pass it through.
                return requests.get(url, headers=headers, timeout=timeout)
            except Exception:  # narrowed from bare except (kept broad: any fetch failure triggers a retry)
                if num_retries > 0:
                    time.sleep(10)  # back off before retrying
                    print(u'獲取網頁出錯,10s後將獲取倒數第:', num_retries, u'次')
                    # BUG FIX: num_retries must be passed by keyword.  The
                    # original passed it positionally into the *proxy*
                    # slot, so the very first failure jumped straight to
                    # proxy mode instead of retrying the direct connection.
                    return self.get(url, timeout, num_retries=num_retries - 1)
                else:
                    print(u'開始使用代理')
                    time.sleep(10)
                    # Pick a random "ip:port" string from the scraped pool.
                    IP = str(random.choice(self.iplist)).strip()
                    proxy = {'http': IP}
                    return self.get(url, timeout, proxy)  # recurse into the proxy branch

        else:  # proxy mode
            try:
                # Re-pick a proxy on every attempt rather than reusing
                # the (possibly dead) one passed in.
                IP = str(random.choice(self.iplist)).strip()
                proxy = {'http': IP}
                return requests.get(url, headers=headers, proxies=proxy, timeout=timeout)
            except Exception:  # narrowed from bare except
                if num_retries > 0:
                    time.sleep(10)
                    IP = str(random.choice(self.iplist)).strip()
                    proxy = {'http': IP}
                    print(u'正在更換代理,10s後將重新獲取倒數第', num_retries, u'次')
                    print(u'當前代理是:', proxy)
                    return self.get(url, timeout, proxy, num_retries - 1)
                else:
                    print(u'代理也不好使!取消代理')
                    # Give up on proxies; restart direct with a short 3 s timeout.
                    return self.get(url, 3)

# Module-level singleton instance.  NOTE: constructing it here triggers a
# network request (scraping the haoip.cc proxy list) at import time.
request = download()