1. 程式人生 > >【python】獲取http響應

【python】獲取http響應

ont 如果 perror header port turn bsp set obj

一個相對完整的http請求,輸入ip和端口,輸出響應碼,響應頭,響應體,是否超時,以及出錯時的錯誤信息

處理包括:

1.協議處理,如果是443用https,其他用http

2.HTTPError處理,HTTPError一般是401,403,404之類的錯誤,雖然報錯,但是也有響應碼、響應頭和響應體。注意獲取錯誤信息時要用str(e)(形如「HTTP Error 404: Not Found」);repr(e)只會得到「HTTPError()」這樣不含錯誤原因的字符串,而e.read()是響應體,不是錯誤原因

3.URLError處理,一般是Connection refused之類的錯誤。注意獲取錯誤信息時要用str(e.reason)

4.響應體gzip解壓

5.響應體編碼轉換

# coding=utf8
"""Fetch the home page of an ``ip:port`` pair over HTTP(S).

For one request the module reports the status code, the response headers,
the response body (gunzipped and re-encoded to UTF-8 when possible), the
page title, whether the request timed out, and the error text if any.

The code runs on Python 2.6+ (``urllib2``) and on Python 3
(``urllib.request``).  Bodies and titles are byte strings: ``str`` on
Python 2, ``bytes`` on Python 3.
"""
import gzip
import io
import re
import socket
import traceback
import zlib

try:  # Python 2, the interpreter this script was originally written for
    import urllib2
except ImportError:  # Python 3 exposes the same names from urllib.request
    import urllib.request as urllib2

try:
    import chardet
except ImportError:  # only used as the last-resort charset guess
    chardet = None

_USER_AGENT = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"

# Charset names at least this long are treated as garbage and ignored.
# NOTE(review): this also rejects 12-character names such as "windows-1252";
# the limit is kept as in the original script.
_MAX_ENCODING_NAME_LEN = 12

_META_CHARSET_RE = re.compile(
    br"""<meta[^>]*?charset\s*=\s*["']?([\w.:-]+)""", re.I)
_HEADER_CHARSET_RE = re.compile(r"""charset\s*=\s*["']?([\w.:-]+)""", re.I)
_TITLE_RE = re.compile(br"<title>(.*?)</title>", re.I | re.M)


def plugin_homepage(data, timeout):
    """Request ``/`` on ``data["ip"]:data["port"]`` and describe the response.

    Port 443 is fetched over HTTPS, every other port over HTTP.

    Args:
        data: mapping with the keys ``"ip"`` and ``"port"``.
        timeout: socket timeout in seconds, passed on to ``get_html``.

    Returns:
        dict with the keys ``ip``, ``port``, ``rsp_header``, ``rsp_body``,
        ``code``, ``title``, ``is_timeout`` and ``error_reason``.
    """
    ip = data["ip"]
    port = data["port"]
    # Compare as text so that both 443 and "443" select HTTPS.
    scheme = "https" if str(port) == "443" else "http"
    url = "%s://%s:%s/" % (scheme, ip, port)
    is_timeout, error_reason, code, header, body, title = get_html(url, timeout)
    return {
        "ip": ip,
        "port": port,
        "rsp_header": header,
        "rsp_body": body,
        "code": code,
        "title": title,
        "is_timeout": is_timeout,
        "error_reason": error_reason,
    }


def get_html(url, timeout):
    """Fetch *url* and return what could be learned about the response.

    HTTP error statuses (401, 403, 404, ...) still carry headers and a body,
    so they are post-processed like a successful response, with
    ``error_reason`` set.  Connection-level failures return early with only
    ``is_timeout`` and ``error_reason`` filled in.

    Args:
        url: absolute URL to request.
        timeout: socket timeout in seconds.

    Returns:
        Tuple ``(is_timeout, error_reason, code, header, body, title)``.
        ``header`` is the header block as text, ``body`` and ``title`` are
        byte strings; any element is ``None`` when it is unavailable.
    """
    is_timeout = False
    error_reason = None
    code = None
    header = None
    body = None
    title = None
    raw_headers = None

    try:
        request = urllib2.Request(url, headers={"User-Agent": _USER_AGENT})
        response = urllib2.urlopen(request, timeout=timeout)
        try:
            code = response.getcode()
            body = response.read()
            raw_headers = response.headers
        finally:
            response.close()
    except urllib2.HTTPError as e:
        # str(e) is the readable reason ("HTTP Error 404: Not Found");
        # e.read() is the response body, not the reason.
        error_reason = str(e)
        code = e.code
        try:
            body = e.read()
        finally:
            e.close()
        raw_headers = e.headers
    except urllib2.URLError as e:
        # Typically "Connection refused" and similar connection failures.
        traceback.print_exc()
        error_reason = str(e.reason)
        is_timeout = _is_timeout(e)
        return is_timeout, error_reason, code, header, body, title
    except Exception as e:
        # Boundary handler: e.g. a socket timeout while reading the body.
        traceback.print_exc()
        error_reason = str(e)
        is_timeout = isinstance(e, socket.timeout)
        return is_timeout, error_reason, code, header, body, title

    if raw_headers is not None:
        header = str(raw_headers)
    if not raw_headers:
        return is_timeout, error_reason, code, header, body, title

    # Look headers up on the message object; the text form cannot be indexed
    # by header name.
    content_encoding = str(raw_headers.get("Content-Encoding") or "")
    if "gzip" in content_encoding.lower():
        body = _gunzip(body)

    encoding = (get_encode(raw_headers, body) or "").strip()
    if encoding and len(encoding) < _MAX_ENCODING_NAME_LEN:
        try:
            body = body.decode(encoding).encode("utf-8")
        except Exception:
            # Best effort: the charset name comes from the remote page and
            # may be unknown or simply wrong; keep the bytes as received.
            pass

    match = _TITLE_RE.search(body)
    if match:
        title = match.group(1)
    return is_timeout, error_reason, code, header, body, title


def get_encode(header, body):
    """Guess the character encoding of an HTML response.

    Sources are tried in this order: a ``<meta ... charset=...>`` tag in
    *body*, the ``charset`` parameter of the ``Content-Type`` header, and
    finally ``chardet`` when it is installed.

    Args:
        header: mapping-like headers object offering ``get`` (a dict or an
            HTTP message object); anything else is ignored.
        body: response body, normally a byte string.

    Returns:
        The encoding name as a native ``str``, or ``None`` if unknown.
    """
    if body is None:
        body = b""
    elif not isinstance(body, bytes):
        body = body.encode("utf-8", "replace")

    match = _META_CHARSET_RE.search(body)
    if match:
        return _native_str(match.group(1))

    content_type = header.get("Content-Type") if hasattr(header, "get") else None
    if content_type:
        match = _HEADER_CHARSET_RE.search(str(content_type))
        if match:
            return match.group(1)

    if chardet is None:
        return None
    return chardet.detect(body)["encoding"]


def _is_timeout(exc):
    """Tell whether *exc*, or the reason wrapped inside it, is a timeout."""
    reason = getattr(exc, "reason", exc)
    return isinstance(reason, socket.timeout) or "timed out" in str(reason)


def _gunzip(data):
    """Return *data* gunzipped, or unchanged if it is not valid gzip."""
    try:
        return gzip.GzipFile(fileobj=io.BytesIO(data)).read()
    except (IOError, EOFError, zlib.error):
        return data


def _native_str(value):
    """Return an ASCII byte string as the interpreter's native ``str``."""
    if isinstance(value, str):
        return value
    return value.decode("ascii")


if __name__ == "__main__":
    data = {"ip": "127.0.0.1", "port": 80}
    res = plugin_homepage(data, 3)
    print(res)

【python】獲取http響應