
Python Web Scraping: Basic Usage of the Built-in urllib Library

The urllib package contains the following modules:

urllib.request      request module
urllib.error        exception handling module
urllib.parse        URL parsing module
urllib.robotparser  robots.txt parsing module (see the sketch below)
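The first three modules are used throughout this article; urllib.robotparser is not demonstrated elsewhere, so here is a minimal sketch (the robots.txt URL is just an example):

from urllib import robotparser

# download and parse a site's robots.txt
rp = robotparser.RobotFileParser()
rp.set_url("http://www.baidu.com/robots.txt")
rp.read()

# can_fetch(user_agent, url) reports whether crawling that URL is allowed
print(rp.can_fetch("*", "http://www.baidu.com/index.html"))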

py2 vs. py3

Python 2:
urllib.urlopen()

Python 3:
urllib.request.urlopen()
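For code that must run on both versions, a common pattern is to try the Python 3 import first and fall back to Python 2; a minimal sketch:

# resolve urlopen on both Python 3 and Python 2
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2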

Import the required modules

from urllib import request
from urllib import parse
from urllib import error
from http import cookiejar
import socket

Sending requests

A request is made up of the URL, parameters, data, and headers.

urlopen

urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, *,
    cafile=None, capath=None, cadefault=False, context=None)
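The cafile, capath, and context parameters control HTTPS certificate handling. A minimal sketch of passing an explicit SSL context (the URL is just an example):

import ssl

# a default context verifies server certificates against the system CAs
ctx = ssl.create_default_context()
response = request.urlopen("https://httpbin.org/get", context=ctx)
print(response.status)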

# send a GET request
def foo1():
    response = request.urlopen("http://www.baidu.com")
    # bytes -> utf-8 decode -> str
    print(response.read().decode("utf-8"))

# send a POST request
def foo2():
    data = bytes(parse.urlencode({"word": "hello"}), encoding="utf-8")
    response = request.urlopen("http://httpbin.org/post", data=data)
    print(response.read())

# set a timeout and catch the resulting exception
def foo3():
    try:
        response = request.urlopen("http://httpbin.org/post", timeout=0.1)
        print(response.read())
    except error.URLError as e:
        print(type(e.reason))  # <class 'socket.timeout'>
        if isinstance(e.reason, socket.timeout):
            print("timeout error:", e)

Responses


# status code and response headers
def foo4():
    response = request.urlopen("http://www.baidu.com")
    print(type(response))
    # from http.client import HTTPResponse
    # <class 'http.client.HTTPResponse'>

    print(response.status)
    print(response.getheaders())
    print(response.getheader("Server"))
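The response object also provides geturl(), getcode(), and info() for compatibility with older code. A short sketch (the name foo4b just continues this article's numbering):

def foo4b():
    response = request.urlopen("http://www.baidu.com")
    print(response.geturl())   # final URL, after any redirects
    print(response.getcode())  # status code, same as response.status
    print(response.info())     # response headers as a message object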

The Request object

def foo5():
    req = request.Request("http://www.baidu.com")
    response = request.urlopen(req)
    print(response.read().decode("utf-8"))

# request with browser headers, method 1
def foo6():
    url = "http://httpbin.org/post"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)",
        "Host": "httpbin.org"
    }
    dct = {"name": "Tom"}

    data = bytes(parse.urlencode(dct), encoding="utf-8")
    req = request.Request(url=url, data=data, headers=headers)
    response = request.urlopen(req)
    print(response.read().decode("utf-8"))


# request with browser headers, method 2
def foo7():
    url = "http://httpbin.org/post"
    dct = {"name": "Tom"}
    data = bytes(parse.urlencode(dct), encoding="utf-8")

    req = request.Request(url=url, data=data, method="POST")
    req.add_header("User-Agent",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)")

    response = request.urlopen(req)

    print(response.read().decode("utf-8"))

Proxies


def foo8():
    proxy_handler = request.ProxyHandler({
        "http": "http://183.159.94.185:18118",
        "https": "https://183.159.94.187:18118",
        })
    opener = request.build_opener(proxy_handler)
    response = opener.open("http://www.baidu.com")
    print(response.read())
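Calling opener.open() works, but the opener can also be installed globally so that plain request.urlopen() goes through the proxy; a minimal sketch reusing the example proxy address above (foo8b is a hypothetical name):

def foo8b():
    proxy_handler = request.ProxyHandler({
        "http": "http://183.159.94.185:18118",
    })
    opener = request.build_opener(proxy_handler)
    request.install_opener(opener)  # make this opener the global default
    response = request.urlopen("http://www.baidu.com")  # now uses the proxy
    print(response.status)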

Cookies


def foo9():
    cookie = cookiejar.CookieJar()
    cookie_handler = request.HTTPCookieProcessor(cookie)
    opener = request.build_opener(cookie_handler)
    response = opener.open("http://www.baidu.com")
    print(response.status)
    for item in cookie:
        print(item.name, item.value)

# save cookies, method 1 (Mozilla format)
def foo10():
    filename = "cookie.txt"
    cookie = cookiejar.MozillaCookieJar(filename)
    cookie_handler = request.HTTPCookieProcessor(cookie)
    opener = request.build_opener(cookie_handler)
    response = opener.open("http://www.baidu.com")
    cookie.save(ignore_discard=True, ignore_expires=True)

# save cookies, method 2 (LWP format)
def foo11():
    filename = "cookie1.txt"
    cookie = cookiejar.LWPCookieJar(filename)
    cookie_handler = request.HTTPCookieProcessor(cookie)
    opener = request.build_opener(cookie_handler)
    response = opener.open("http://www.baidu.com")
    cookie.save(ignore_discard=True, ignore_expires=True)

# load cookies from a file
def foo12():
    filename = "cookie1.txt"
    cookie = cookiejar.LWPCookieJar()
    cookie.load(filename, ignore_discard=True, ignore_expires=True)
    cookie_handler = request.HTTPCookieProcessor(cookie)
    opener = request.build_opener(cookie_handler)
    response = opener.open("http://www.baidu.com")
    print(response.read().decode("utf-8"))

Exception handling

The error module mainly provides URLError, HTTPError, and ContentTooShortError.


def foo13():
    try:
        response = request.urlopen("http://www.xxooxxooxox.com/xxx")
        print(response.status)
    except error.HTTPError as e:  # subclass exception, catch it first
        print(e.name, e.reason, e.code, e.headers, sep="\n")
    except error.URLError as e:  # parent-class exception
        print(e.reason)
    else:
        print("successful")

Parsing URLs with the parse module

urlparse(url, scheme='', allow_fragments=True)

def foo14():
    result = parse.urlparse("http://www.baidu.com/xxx.html;user?id=5#comment")
    print(type(result), result, sep="\n")
    """
    <class 'urllib.parse.ParseResult'>
    ParseResult(scheme='http', netloc='www.baidu.com', path='/xxx.html', 
            params='user', query='id=5', fragment='comment')
    """

    # scheme supplies a default protocol; a scheme already in the URL takes precedence
    result = parse.urlparse("www.baidu.com", scheme="https")
    print(result)
    """
    ParseResult(scheme='https', netloc='', path='www.baidu.com',
          params='', query='', fragment='')
    """

    result = parse.urlparse("http://www.baidu.com", scheme="https")
    print(result)
    """
    ParseResult(scheme='http', netloc='www.baidu.com', path='', 
            params='', query='', fragment='')
    """

    # allow_fragments determines where the fragment (anchor) ends up
    result = parse.urlparse("http://www.baidu.com/xxx.html;user?id=5#comment",
                    allow_fragments=True)
    print(result)
    """
    ParseResult(scheme='http', netloc='www.baidu.com', path='/xxx.html', 
            params='user', query='id=5', fragment='comment')
    """

    result = parse.urlparse("http://www.baidu.com/xxx.html;user?id=5#comment",
                    allow_fragments=False)
    print(result)
    """
    ParseResult(scheme='http', netloc='www.baidu.com', path='/xxx.html', 
            params='user', query='id=5#comment', fragment='')

    """

    result = parse.urlparse("http://www.baidu.com/xxx.html;user#comment",
                    allow_fragments=False)
    print(result)
    """
    ParseResult(scheme='http', netloc='www.baidu.com', path='/xxx.html', 
            params='user#comment', query='', fragment='')

    """

# urlunparse assembles a URL from its parts; note the order of components
def foo15():
    data = ["http", "www.baidu.com", "index.html", "user", "a=6", "comment"]
    print(parse.urlunparse(data))
    # http://www.baidu.com/index.html;user?a=6#comment

# urljoin joins URLs, similar to os.path.join; the second argument takes precedence
def foo16():
    print(parse.urljoin("http://www.baidu.com", "index.html"))
    print(parse.urljoin("http://www.baidu.com", "http://www.qq.com/index.html"))
    print(parse.urljoin("http://www.baidu.com/index.html", "http://www.qq.com/?id=6"))
    """
    http://www.baidu.com/index.html
    http://www.qq.com/index.html
    http://www.qq.com/?id=6
    """

# urlencode converts a dict into URL query-string form
def foo17():
    params ={
        "name": "Tom",
        "age": 18
    }
    # the ? is lost here: urljoin treats the encoded params as a relative path
    url = parse.urljoin("http://www.baidu.com/?", parse.urlencode(params))
    print(url)
    # http://www.baidu.com/name=Tom&age=18

    url = "http://www.baidu.com/?" + parse.urlencode(params)
    print(url)
    # http://www.baidu.com/?name=Tom&age=18
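Two related helpers in parse are worth knowing: quote() percent-encodes characters that are not URL-safe, and parse_qs() is the inverse of urlencode(). A short sketch (foo18 is a hypothetical name continuing the numbering):

def foo18():
    # quote percent-encodes unsafe characters (space, non-ASCII, ...)
    print(parse.quote("hello world/你好"))
    # hello%20world/%E4%BD%A0%E5%A5%BD

    # parse_qs turns a query string back into a dict of lists
    print(parse.parse_qs("name=Tom&age=18"))
    # {'name': ['Tom'], 'age': ['18']}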