程式人生 > python爬蟲URL重試機制實現(python2.7以及python3.5)

python爬蟲URL重試機制實現(python2.7以及python3.5)

應用場景:

狀態不是200的URL重試多次

程式碼比較簡單還有部分註釋

python2.7實現:

# -*-coding:utf-8-*-
"""
ayou
"""

import requests

def url_retry(url, num_retries=3):
    """Fetch *url* and return the response body.

    Retries up to *num_retries* extra times when the server answers with a
    non-200 status.  Returns None when the retries are exhausted or the host
    is unreachable.
    """
    print("access!")
    html = None
    try:
        response = requests.get(url, timeout=60)
        # raise_for_status() raises HTTPError for any non-200 response.
        response.raise_for_status()
        html = response.content
    except requests.exceptions.HTTPError:
        # Non-200 status: retry with one fewer attempt remaining.
        if num_retries > 0:
            html = url_retry(url, num_retries - 1)
    except requests.exceptions.ConnectionError:
        # Unreachable host: retrying will not help, give up immediately.
        html = None
    return html

url_retry("http://httpbin.org/status/404")

python3.5實現:

# -*-coding:utf-8-*-
"""
ayou
"""
import asyncio
import aiohttp

async def print_page(url, num_retries=3):
    """Fetch *url* asynchronously and return (and print) the response body.

    Retries up to *num_retries* extra times when the server answers with a
    non-200 status; returns the exception object when the host is
    unreachable.  Returns None when the retries are exhausted.
    """
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, timeout=60) as response:
                print("access!")
                # raise_for_status() raises ClientResponseError for non-200.
                response.raise_for_status()
                body = await response.text()
        # The aiohttp.errors module was removed in aiohttp 2.0; the client
        # exceptions now live directly on the aiohttp package.
        except aiohttp.ClientResponseError:
            body = None
            if num_retries > 0:
                # Non-200 status: retry with one fewer attempt remaining.
                return await print_page(url, num_retries - 1)
        except aiohttp.ClientConnectorError as e:
            # Unreachable host: do not retry, hand the error back.
            return e
    # NOTE: the original also called session.close() here; "async with"
    # already closed the session, and close() is a coroutine in modern
    # aiohttp, so the extra un-awaited call is dropped.
    print(body)
    return body

def main():
    """Run one print_page() call on a known-404 URL and tear the loop down."""
    # A URL that does not exist:
    # url = 'http://httpbin.org/status/404111'
    # A URL that answers with status 404:
    url = 'http://httpbin.org/status/404'
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(print_page(url))
    event_loop.close()

if __name__ == '__main__':
    main()

爬蟲URL重試機制封裝成修飾器(python2.7以及python3.5以上)

python2.7版本:

# -*-coding:utf-8-*-
"""
ayou
"""
import requests

# Retry decorator factory; defaults to a single attempt.
def retry(num_retries=1):
    """Return a decorator that calls the wrapped function up to *num_retries*
    times.

    The first successful call returns immediately.  If every attempt raises,
    the exception is swallowed and None is returned (uncomment the raise
    below to propagate the last failure instead).
    """
    from functools import wraps

    # Receives the function being decorated.
    def decorator(func):
        # Preserve the wrapped function's __name__/__doc__ (the original
        # omitted this, and also shadowed the outer name "wrapper").
        @wraps(func)
        def wrapped(*args, **kwargs):
            # Keep the most recent error around for optional re-raising.
            last_exception = None
            # Run the wrapped function in a loop.
            for _ in range(num_retries):
                try:
                    # A successful call returns here, ending the loop.
                    return func(*args, **kwargs)
                except Exception as e:
                    # Do not return on failure, or the loop stops retrying.
                    last_exception = e
            # Uncomment to propagate the final failure to the caller:
            # raise last_exception
        return wrapped
    return decorator

if __name__ == "__main__":

    @retry(5)
    def url_retry(url):
        """Fetch *url*, print and return its body; raises for non-200."""
        response = requests.get(url, timeout=60)
        print("access!")
        response.raise_for_status()
        content = response.content
        print(content)
        return content

    url_retry("http://httpbin.org/status/404")
    # url_retry("http://httpbin.org/status/404111")
    # url_retry("http://www.baidu.com")
python3.5以上版本:
# -*-coding:utf-8-*-
"""
ayou
"""
import aiohttp,asyncio

# Retry decorator factory; defaults to a single attempt.
def retry(num_retries=1):
    """Return a decorator that calls the wrapped function up to *num_retries*
    times.

    The first successful call returns immediately.  If every attempt raises,
    the exception is swallowed and None is returned (uncomment the raise
    below to propagate the last failure instead).
    """
    from functools import wraps

    # Receives the function being decorated.
    def decorator(func):
        # Preserve the wrapped function's __name__/__doc__ (the original
        # omitted this, and also shadowed the outer name "wrapper").
        @wraps(func)
        def wrapped(*args, **kwargs):
            # Keep the most recent error around for optional re-raising.
            last_exception = None
            # Run the wrapped function in a loop.
            for _ in range(num_retries):
                try:
                    # A successful call returns here, ending the loop.
                    return func(*args, **kwargs)
                except Exception as e:
                    # Do not return on failure, or the loop stops retrying.
                    last_exception = e
            # Uncomment to propagate the final failure to the caller:
            # raise last_exception
        return wrapped
    return decorator

async def print_page(url):
    """Fetch *url* asynchronously, print its body, and return it.

    Raises for any non-200 response via raise_for_status(), which lets the
    @retry decorator on the caller drive the retries.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url, timeout=60) as response:
            print("access!")
            # raise_for_status() raises for any non-200 response.
            response.raise_for_status()
            body = await response.text()
    # The "async with" above already closed the session; the original's
    # extra session.close() (a coroutine in modern aiohttp, left un-awaited)
    # is removed.
    print(body)
    return body

@retry(5)
def loop_get():
    """One retryable attempt: run print_page() on a fixed URL to completion.

    BUG FIX: the original reused asyncio.get_event_loop(), but the loop is
    closed at the end of the first attempt, so every subsequent retry raised
    "Event loop is closed" instead of re-issuing the HTTP request.  A fresh
    loop is now created per attempt.
    """
    # url = "http://www.baidu.com"
    # url = 'http://httpbin.org/status/404111'
    url = 'http://httpbin.org/status/404'
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(print_page(url))
    finally:
        # Always release the loop, even when print_page raises.
        loop.close()

if __name__ == '__main__':
    loop_get()