
Basic Usage of the urllib Library for Python Web Scraping


# GET request
import urllib.request

response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode("utf-8"))
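A small addition beyond the original post: in Python 3 the response object returned by urlopen can be used as a context manager, which closes the underlying connection automatically. A minimal sketch of the same GET request:

# Sketch: same GET as above, with the response used as a context manager
import urllib.request

with urllib.request.urlopen("http://www.baidu.com") as response:
    html = response.read().decode("utf-8")
print(html[:100])  # first 100 characters of the page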

# POST request
import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({"word": "hello"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read())

# Timeout
import urllib.request

response = urllib.request.urlopen("http://httpbin.org/get", timeout=1)
print(response.read())

import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")

# Response type
import urllib.request

response = urllib.request.urlopen("http://www.python.org")
print(type(response))

# Status code and response headers
import urllib.request

response = urllib.request.urlopen("http://www.python.org")
print(response.status)
print(response.getheaders())
print(response.getheader("Server"))

# Request
import urllib.request

request = urllib.request.Request("http://python.org")
response = urllib.request.urlopen(request)
print(response.read().decode("utf-8"))

from urllib import request, parse

url = "http://httpbin.org/post"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
    "Host": "httpbin.org"
}
data_dict = {"name": "Germey"}
data = bytes(parse.urlencode(data_dict), encoding="utf-8")
req = request.Request(url=url, data=data, headers=headers, method="POST")
response = request.urlopen(req)
print(response.read().decode("utf-8"))

from urllib import request, parse

url = "http://httpbin.org/post"
data_dict = {"name": "Germey"}
data = bytes(parse.urlencode(data_dict), encoding="utf-8")
req = request.Request(url=url, data=data, method="POST")
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36")
response = request.urlopen(req)
print(response.read().decode("utf-8"))

# Proxy
import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    "http": "http://127.0.0.1:9743",
    "https": "https://127.0.0.1:9743"
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open("http://httpbin.org/get")
print(response.read())

# Cookies
import http.cookiejar
import urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name + " = " + item.value)

# Save cookies to 1.txt (Mozilla format)
import http.cookiejar
import urllib.request

filename = "1.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
cookie.save(ignore_discard=True, ignore_expires=True)

# Another way to save cookies (LWP format)
import http.cookiejar
import urllib.request

filename = "1.txt"
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
cookie.save(ignore_discard=True, ignore_expires=True)

# Load cookies from a file
import http.cookiejar
import urllib.request

cookie = http.cookiejar.LWPCookieJar()
cookie.load("1.txt", ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
print(response.read().decode("utf-8"))

# Exception handling
from urllib import request, error

try:
    response = request.urlopen("http://lidonghao.com")
except error.URLError as e:
    print(e.reason)

from urllib import request, error

try:
    response = request.urlopen("http://www.baidu.com/101")
except error.HTTPError as e:
    print(e.reason, e.code, sep="\n")
except error.URLError as e:
    print(e.reason)
else:
    print("Request Successfully")

import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen("https://www.baidu.com", timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")
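These handlers also compose: build_opener accepts several handlers at once, and install_opener makes the resulting opener the default used by urlopen. A minimal sketch, reusing the placeholder proxy address from the example above:

# Sketch: proxy + cookie support in one opener (assumes the same
# placeholder proxy at 127.0.0.1:9743 used in the proxy example)
import http.cookiejar
import urllib.request

cookie = http.cookiejar.CookieJar()
proxy_handler = urllib.request.ProxyHandler({"http": "http://127.0.0.1:9743"})
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(proxy_handler, cookie_handler)
urllib.request.install_opener(opener)  # plain urlopen() now uses this opener
response = urllib.request.urlopen("http://httpbin.org/get")
print(response.read().decode("utf-8"))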
# Parsing URLs
# urlparse
from urllib.parse import urlparse

result = urlparse("http://www.baidu.com/index.html;user?id=5#comment")
print(type(result), result)

from urllib.parse import urlparse
result = urlparse("www.baidu.com/index.html;user?id=5#comment", scheme="https")
print(result)

from urllib.parse import urlparse
result = urlparse("http://www.baidu.com/index.html;user?id=5#comment", scheme="https")
print(result)

from urllib.parse import urlparse
result = urlparse("http://www.baidu.com/index.html;user?id=5#comment", allow_fragments=False)
print(result)

from urllib.parse import urlparse
result = urlparse("http://www.baidu.com/index.html#comment", allow_fragments=False)
print(result)
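Worth noting (an addition, not in the original post): urlparse returns a ParseResult, which is a named tuple, so its six components can be read by attribute or by index:

# ParseResult is a named tuple; components are available by name or index
from urllib.parse import urlparse

result = urlparse("http://www.baidu.com/index.html;user?id=5#comment")
print(result.scheme, result.netloc, result.path)      # http www.baidu.com /index.html
print(result.params, result.query, result.fragment)   # user id=5 comment
print(result[0])  # same as result.scheme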
# urlunparse
from urllib.parse import urlunparse

data = ["http", "www.baidu.com", "index.html", "user", "a=6", "comment"]
print(urlunparse(data))

# urljoin
from urllib.parse import urljoin

print(urljoin("http://www.baidu.com", "FAQ.html"))
print(urljoin("http://www.baidu.com", "https://cuiqingcai.com/FAQ.html"))
print(urljoin("http://www.baidu.com/about.html", "https://cuiqingcai.com/FAQ.html"))
print(urljoin("http://www.baidu.com/about.html", "http://cuiqingcai.com/FAQ.html?question=2"))
print(urljoin("http://www.baidu.com?wd=abc", "https://cuiqingcai.com/index.php"))
print(urljoin("http://www.baidu.com", "?category=2#comment"))
print(urljoin("www.baidu.com", "?category=2#comment"))
print(urljoin("www.baidu.com#comment", "?category=2"))
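The rule behind these results (my summary, not the post's wording): urljoin fills in whatever components the second URL lacks from the base, and ignores the base entirely when the second URL is already absolute. Two of the cases above, with their expected outputs:

# Sketch illustrating the urljoin rule; expected outputs as comments
from urllib.parse import urljoin

print(urljoin("http://www.baidu.com", "FAQ.html"))
# -> http://www.baidu.com/FAQ.html   (base supplies scheme and netloc)
print(urljoin("http://www.baidu.com", "https://cuiqingcai.com/FAQ.html"))
# -> https://cuiqingcai.com/FAQ.html (second URL is absolute, base ignored)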
# urlencode
from urllib.parse import urlencode

params = {
    "name": "germey",
    "age": 22
}
base_url = "http://www.baidu.com?"
url = base_url + urlencode(params)
print(url)
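Going in the other direction (an addition beyond the original post): urllib.parse also offers parse_qs and parse_qsl, which turn a query string back into a dict or a list of pairs:

# Sketch: reversing urlencode with parse_qs / parse_qsl
from urllib.parse import parse_qs, parse_qsl, urlencode

query = urlencode({"name": "germey", "age": 22})
print(parse_qs(query))   # {'name': ['germey'], 'age': ['22']}
print(parse_qsl(query))  # [('name', 'germey'), ('age', '22')]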
