Python爬蟲:urllib內建庫基本使用
阿新 • • 發佈:2019-02-13
urllib庫包含以下模組
urllib.request 請求模組
urllib.error 異常處理模組
urllib.parse url解析模組
urllib.robotparser robots.txt解析模組
py2 vs. py3
python2
urllib.urlopen()
python3
urllib.request.urlopen()
引入需要的模組
from urllib import request
from urllib import parse
from urllib import error
from http import cookiejar
import socket
request請求
請求url,請求引數, 請求資料, 請求頭
urlopen
urlopen(url, data=None, timeout, *, cafile=None,
capath=None, cadefault=False, context=None)
# Send a plain GET request.
def foo1():
    """Fetch http://www.baidu.com and print the decoded body."""
    resp = request.urlopen("http://www.baidu.com")
    body = resp.read()
    # bytes -> UTF-8 decode -> str
    print(body.decode("utf-8"))
# Send a POST request: supplying `data` switches urlopen to POST.
def foo2():
    """POST a url-encoded form to httpbin and print the raw response."""
    payload = parse.urlencode({"word": "hello"}).encode("utf-8")
    resp = request.urlopen("http://httpbin.org/post", data=payload)
    print(resp.read())
# Set a timeout and catch the resulting exception.
def foo3():
    """Request with a very short timeout and report timeout errors.

    A connect-phase timeout is wrapped by urlopen in a URLError whose
    ``reason`` is a ``socket.timeout`` instance.  A timeout raised while
    reading the body after a successful connect is raised as a bare
    ``socket.timeout``, so both paths are handled.
    """
    try:
        response = request.urlopen("http://httpbin.org/post", timeout=0.1)
        print(response.read())
    except error.URLError as e:
        print(type(e.reason))  # <class 'socket.timeout'>
        if isinstance(e.reason, socket.timeout):
            print("超時錯誤:", e)
    except socket.timeout as e:
        # read() timed out after the connection was established.
        print("超時錯誤:", e)
response響應
# Inspect the status code and response headers.
def foo4():
    """Print the response type, status code and headers of a GET request."""
    resp = request.urlopen("http://www.baidu.com")
    # resp is an http.client.HTTPResponse instance
    print(type(resp))
    print(resp.status)
    print(resp.getheaders())
    print(resp.getheader("Server"))
Request請求物件
def foo5():
    """Build a Request object explicitly, then open it with urlopen."""
    req = request.Request("http://www.baidu.com")
    resp = request.urlopen(req)
    print(resp.read().decode("utf-8"))
# Request with browser-style headers, variant 1: the headers= keyword.
def foo6():
    """POST form data with custom User-Agent and Host headers."""
    target = "http://httpbin.org/post"
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)",
        "Host": "httpbin.org",
    }
    form = parse.urlencode({"name": "Tom"}).encode("utf-8")
    req = request.Request(url=target, data=form, headers=browser_headers)
    resp = request.urlopen(req)
    print(resp.read().decode("utf-8"))
# Request with browser-style headers, variant 2: Request.add_header().
def foo7():
    """POST form data, attaching the User-Agent via add_header()."""
    form = parse.urlencode({"name": "Tom"}).encode("utf-8")
    req = request.Request(url="http://httpbin.org/post",
                          data=form, method="POST")
    req.add_header("User-Agent",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)")
    resp = request.urlopen(req)
    print(resp.read().decode("utf-8"))
代理
def foo8():
    """Route the request through HTTP/HTTPS proxies via ProxyHandler."""
    handler = request.ProxyHandler({
        "http": "http://183.159.94.185:18118",
        "https": "https://183.159.94.187:18118",
    })
    opener = request.build_opener(handler)
    resp = opener.open("http://www.baidu.com")
    print(resp.read())
cookie
def foo9():
    """Capture the cookies a site sets into a CookieJar and print them."""
    jar = cookiejar.CookieJar()
    opener = request.build_opener(request.HTTPCookieProcessor(jar))
    resp = opener.open("http://www.baidu.com")
    print(resp.status)
    for c in jar:
        print(c.name, c.value)
# Save cookies to disk, variant 1: Mozilla/Netscape file format.
def foo10():
    """Fetch a page and persist the received cookies to cookie.txt."""
    filename = "cookie.txt"
    cookie = cookiejar.MozillaCookieJar(filename)
    cookie_handler = request.HTTPCookieProcessor(cookie)
    opener = request.build_opener(cookie_handler)
    # Opening the URL is what populates the jar; the response object
    # itself is not needed, so close it promptly instead of leaking it.
    opener.open("http://www.baidu.com").close()
    # Keep session cookies and already-expired cookies in the file too.
    cookie.save(ignore_discard=True, ignore_expires=True)
# Save cookies to disk, variant 2: libwww-perl (LWP) file format.
def foo11():
    """Fetch a page and persist the received cookies to cookie1.txt."""
    filename = "cookie1.txt"
    cookie = cookiejar.LWPCookieJar(filename)
    cookie_handler = request.HTTPCookieProcessor(cookie)
    opener = request.build_opener(cookie_handler)
    # Opening the URL is what populates the jar; close the unused
    # response promptly instead of leaking the connection.
    opener.open("http://www.baidu.com").close()
    # Keep session cookies and already-expired cookies in the file too.
    cookie.save(ignore_discard=True, ignore_expires=True)
# Load previously saved cookies and send them with a new request.
def foo12():
    """Read cookies from cookie1.txt (LWP format) and reuse them."""
    jar = cookiejar.LWPCookieJar()
    jar.load("cookie1.txt", ignore_discard=True, ignore_expires=True)
    opener = request.build_opener(request.HTTPCookieProcessor(jar))
    resp = opener.open("http://www.baidu.com")
    print(resp.read().decode("utf-8"))
異常處理
error主要有:'URLError', 'HTTPError', 'ContentTooShortError'
def foo13():
    """Demonstrate layered urllib error handling.

    HTTPError (the subclass) must be caught before URLError (its base),
    otherwise the base clause would swallow both.
    """
    try:
        response = request.urlopen("http://www.xxooxxooxox.com/xxx")
    except error.HTTPError as e:  # subclass: the server answered with an error
        print(e.name, e.reason, e.code, e.headers, sep="\n")
    except error.URLError as e:  # base class: e.g. DNS resolution failure
        print(e.reason)
    else:
        print(response.status)
        print("successful")
parse 模組解析url
urlparse(url, scheme='', allow_fragments=True)
def foo14():
    """Walk through parse.urlparse: result fields, scheme=, allow_fragments=."""
    parsed = parse.urlparse("http://www.baidu.com/xxx.html;user?id=5#comment")
    print(type(parsed), parsed, sep="\n")
    # <class 'urllib.parse.ParseResult'>
    # ParseResult(scheme='http', netloc='www.baidu.com', path='/xxx.html',
    #             params='user', query='id=5', fragment='comment')

    # scheme= supplies only a default; a scheme inside the URL wins.
    print(parse.urlparse("www.baidu.com", scheme="https"))
    # ParseResult(scheme='https', netloc='', path='www.baidu.com',
    #             params='', query='', fragment='')
    print(parse.urlparse("http://www.baidu.com", scheme="https"))
    # ParseResult(scheme='http', netloc='www.baidu.com', path='',
    #             params='', query='', fragment='')

    # allow_fragments= decides whether '#...' becomes the fragment field
    # or stays glued onto the query/params component.
    print(parse.urlparse("http://www.baidu.com/xxx.html;user?id=5#comment",
                         allow_fragments=True))
    # ...query='id=5', fragment='comment'
    print(parse.urlparse("http://www.baidu.com/xxx.html;user?id=5#comment",
                         allow_fragments=False))
    # ...query='id=5#comment', fragment=''
    print(parse.urlparse("http://www.baidu.com/xxx.html;user#comment",
                         allow_fragments=False))
    # ...params='user#comment', query='', fragment=''
# urlunparse: the inverse of urlparse — component order matters.
def foo15():
    """Rebuild a URL from its six components and print it."""
    parts = ["http", "www.baidu.com", "index.html", "user", "a=6", "comment"]
    print(parse.urlunparse(parts))
    # -> http://www.baidu.com/index.html;user?a=6#comment
# urljoin: resolve the second URL against the first, like os.path.join;
# on conflict the second argument wins.
def foo16():
    """Demonstrate relative-URL resolution with parse.urljoin."""
    cases = [
        ("http://www.baidu.com", "index.html"),
        ("http://www.baidu.com", "http://www.qq.com/index.html"),
        ("http://www.baidu.com/index.html", "http://www.qq.com/?id=6"),
    ]
    for base, ref in cases:
        print(parse.urljoin(base, ref))
    # http://www.baidu.com/index.html
    # http://www.qq.com/index.html
    # http://www.qq.com/?id=6
# urlencode: turn a dict into a query-string ("k1=v1&k2=v2") form.
def foo17():
    """Show why plain concatenation, not urljoin, appends a query string."""
    params = {
        "name": "Tom",
        "age": 18,
    }
    # urljoin treats the query as a relative reference — the '?' is lost.
    url = parse.urljoin("http://www.baidu.com/?", parse.urlencode(params))
    print(url)
    # http://www.baidu.com/name=Tom&age=18

    # Plain string concatenation keeps the '?'.
    url = "http://www.baidu.com/?" + parse.urlencode(params)
    print(url)
    # http://www.baidu.com/?name=Tom&age=18