1. 程式人生 > >[Python] [爬蟲] 2.批量政府網站的招投標、中標資訊爬取和推送的自動化爬蟲——驗證模組

[Python] [爬蟲] 2.批量政府網站的招投標、中標資訊爬取和推送的自動化爬蟲——驗證模組

目錄

1.Intro

2.Source


1.Intro

檔名:authentication.py

模組名:驗證模組

引用庫:

urllib2 requests pymongo socket
gc retry spiderData(自定義庫)  

自定義引用檔案:spiderData,包含了一個網頁返回狀態碼的字典,鍵為網頁狀態碼,值為網頁狀態碼對應的資訊。由於沒有合適的狀態碼返回值資訊,所以自己寫了個字典用於儲存狀態碼和狀態資訊的鍵值對,其中包含了常見的網頁錯誤狀態碼和錯誤資訊:

# Lookup table mapping an HTTP status code (as a string key) to a short
# human-readable reason phrase. Consumers index it with str(code), so the
# keys must stay strings. Only non-2xx codes are listed because the table is
# used to describe failures.
httpStatusCode = {
    # 3xx - redirection
    "300": "Multiple Choices",
    "301": "Moved Permanently",
    "302": "Move temporarily",
    "303": "See Other",
    "304": "Not Modified",
    "305": "Use Proxy",
    "306": "Switch Proxy",
    "307": "Temporary Redirect",
    "308": "Permanent Redirect",
    # 4xx - client errors
    "400": "Bad Request",
    "401": "Unauthorized",
    "402": "Payment Required",
    "403": "Forbidden",
    "404": "Not Found",
    "405": "Method Not Allowed",
    "406": "Not Acceptable",
    "407": "Proxy Authentication Required",
    "408": "Request Timeout",
    "409": "Conflict",
    "410": "Gone",
    "411": "Length Required",
    "412": "Precondition Failed",
    "413": "Request Entity Too Large",
    "414": "Request-URI Too Long",
    "415": "Unsupported Media Type",
    "416": "Requested Range Not Satisfiable",
    "417": "Expectation Failed",
    "421": "Too many connections",
    "422": "Unprocessable Entity",
    "423": "Locked",
    "424": "Failed Dependency",
    "425": "Unordered Collection",
    "426": "Upgrade Required",
    "428": "Precondition Required",
    "429": "Too Many Requests",
    "431": "Request Header Fields Too Large",
    "449": "Retry With",
    "451": "Unavailable For Legal Reasons",
    # 5xx - server errors
    "500": "Internal Server Error",
    "501": "Not Implemented",
    "502": "Bad Gateway",
    "503": "Service Unavailable",
    "504": "Gateway Timeout",
    "505": "HTTP Version Not Supported",
    "506": "Variant Also Negotiates",
    "507": "Insufficient Storage",
    "508": "Loop Detected",
    "509": "Bandwidth Limit Exceeded",
    "510": "Not Extended",
    "511": "Network Authentication Required",
    # Non-standard catch-all kept from the original table
    "600": "Unparseable Response Headers",
}

功能:用於驗證MongoDB資料庫連線狀態、網頁連通性(HTTP狀態碼)、代理IP可用性。


2.Source

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
# Author  : YSW
# Time    : 2018/6/6 14:01
# File    : authentication.py
# Version : 1.1
# Describe: 驗證模組
# Update  :
        1.新增了retry庫,可多次嘗試網站連通性,直到連線超時。
'''

import urllib2
import requests
import socket
import spiderData
import pymongo
import gc
from retry import retry

class Authentication(object):
    '''
    Pre-flight checks used by the crawler: MongoDB reachability, HTTP
    reachability of a target URL, and usability of a proxy.

    Every check prints its progress and reports the outcome as a boolean.
    '''

    def __init__(self, headers):
        '''
        :param headers: HTTP request headers sent with every verification
                        request (passed to urllib2.Request and requests.get)
        '''
        print("[*] 初始化驗證模組")
        self.headers = headers

    def dataBaseVerify(self, dbParams):
        '''
        Verify that a MongoDB server can actually be reached.

        :param dbParams: mapping with the connection parameters; the value
                         under "userName" is passed to MongoClient as the
                         host argument and "port" as the port
        :return: True when the server answers a ping, otherwise False
        '''
        print("[+] 正在驗證 MongoDB 資料庫連線狀態")
        client = None
        try:
            userName = dbParams["userName"]
            port = dbParams["port"]
            client = pymongo.MongoClient(userName, port)
            # MongoClient connects lazily (PyMongo 3+): building the object
            # succeeds even when no server is listening, so force a real
            # round trip before reporting success.
            client.admin.command("ping")
            print("[+] 資料庫驗證通過")
            return True
        except Exception as e:
            print("[+] 資料庫驗證失敗")
            # str(e) instead of the deprecated and frequently empty e.message
            print("ERROR: " + str(e))
            return False
        finally:
            # The client was only needed for the check; release its sockets.
            if client is not None:
                client.close()

    @retry(tries=5, delay=2)
    def httpCodeVerify(self, url):
        '''
        Verify that a URL answers without an HTTP error status.

        Only urllib2.HTTPError is handled here. Any other exception raised
        by the request propagates to the retry decorator, which is
        configured with tries=5 and delay=2.

        :param url: URL to request
        :return: True when the request succeeds, False when the server
                 answers with an HTTP error status
        '''
        print("[+] 正在驗證 HTTP 狀態碼:{0}".format(url))
        try:
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            # Only the status matters; close the connection right away.
            response.close()
            print("[+] HTTP 驗證通過:{0}".format(url))
            return True
        except urllib2.HTTPError as e:
            print("[+] HTTP 驗證失敗:{0}".format(url))
            # The lookup table does not list every status code. Fall back to
            # the reason carried by the exception so that an unlisted code
            # does not raise KeyError from inside this handler.
            fallback = str(e.msg) if e.msg else "Unknown Status Code"
            reason = spiderData.httpStatusCode.get(str(e.code), fallback)
            print("ERROR: " + str(e.code) + " " + reason)
            return False

    def proxyVerify(self, url, protocol, ip, port):
        '''
        Check whether a proxy can be used to fetch a URL.

        :param url: URL requested through the proxy
        :param protocol: proxy scheme; also used as the key of the proxies
                         mapping handed to requests
        :param ip: proxy IP address
        :param port: proxy port
        :return: True when the response status is 2xx, otherwise False
        '''
        check_url = url
        proxy_url = "{0}://{1}:{2}".format(protocol, ip, port)
        print("[+] 正在驗證代理 IP 可用性")
        socket_timeout = 30
        # Kept for backward compatibility: this changes the process-wide
        # default timeout for sockets created afterwards.
        socket.setdefaulttimeout(socket_timeout)
        try:
            proxy_dict = {
                protocol: proxy_url
            }
            # Pass the timeout explicitly: without it requests.get can wait
            # indefinitely on a dead proxy.
            response = requests.get(
                check_url,
                proxies=proxy_dict,
                headers=self.headers,
                timeout=socket_timeout,
            )
            code = response.status_code
            print(str(code))
            if 200 <= code < 300:
                print("[+] 可用的代理IP和埠: {0}:{1}:{2}".format(protocol, ip, port))
                print("[+] 驗證通過")
                return True
            print("[-] 不可用的代理IP和埠: {0}:{1}:{2}".format(protocol, ip, port))
            return False
        except Exception as e:
            print("[-] 不可用的代理IP和埠: {0}:{1}:{2}".format(protocol, ip, port))
            # str(e) instead of the deprecated and frequently empty e.message
            print("ERROR: " + str(e))
            return False
        finally:
            gc.collect()