
Python3 Web Scraping Study Notes (1. The urllib Library in Detail)

1. What is a web crawler: omitted here; explanations are easy to find elsewhere.

Although this is introductory material, readers without a Python background may find it hard going; learning some basic Python first is recommended.

I studied front-end topics earlier precisely so I could read HTML, which makes learning web scraping easier; some familiarity with front-end technology is recommended.

2. A first look at the requests library:

Print the source code of the Baidu homepage:

import requests
response = requests.get("http://www.baidu.com")
print(response.text)
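A side note: requests guesses the text encoding from the response headers, which can garble Chinese pages. A minimal sketch of one safeguard, using the library's apparent_encoding attribute (detected from the body itself):
import requests
response = requests.get("http://www.baidu.com")
# apparent_encoding is guessed from the body; assigning it to
# response.encoding makes response.text decode correctly
response.encoding = response.apparent_encoding
print(response.text)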
Print the response headers:
import requests
response = requests.get("http://www.baidu.com")
print(response.headers)
Print the status code (200 means success):

import requests
response = requests.get("http://www.baidu.com")
print(response.status_code)
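Rather than comparing status codes by hand, requests can raise on failure; a minimal sketch using the library's raise_for_status():
import requests
response = requests.get("http://www.baidu.com")
# raises requests.HTTPError for 4xx/5xx responses; a successful
# request simply falls through
response.raise_for_status()
print("OK:", response.status_code)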

Spoof the User-Agent request header:

import requests
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}
response = requests.get("http://www.baidu.com", headers=headers)
print(response.status_code)
Get an image as binary data:
import requests
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}
response = requests.get("https://www.baidu.com/img/bd_logo1.png", headers=headers)
print(response.content)
Save the image to a local file (the URL is a PNG, so save it with a .png extension):
import requests
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}
response = requests.get("https://www.baidu.com/img/bd_logo1.png", headers=headers)
with open("pic.png", "wb") as f:
    f.write(response.content)
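For larger files, downloading in chunks avoids holding the whole body in memory; a sketch using requests' stream mode (the chunk size is an arbitrary choice):
import requests
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get("https://www.baidu.com/img/bd_logo1.png",
                        headers=headers, stream=True)
with open("pic.png", "wb") as f:
    # iter_content yields the body piece by piece instead of
    # loading it all at once
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)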

3. The urllib library in detail:

Fetch the page source:

import urllib.request
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode("utf-8"))
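Hard-coding utf-8 can fail on pages served with another charset; a minimal sketch that reads the charset from the Content-Type header instead, falling back to utf-8 when the header omits it:
import urllib.request
response = urllib.request.urlopen("http://www.baidu.com")
# get_content_charset() parses the charset out of the Content-Type header
charset = response.headers.get_content_charset() or "utf-8"
print(response.read().decode(charset))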

Send a dictionary via POST (this URL is an HTTP testing service and is worth remembering):

import urllib.parse
import urllib.request
data = bytes(urllib.parse.urlencode({"word":"hello"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read())
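httpbin.org echoes the request back as JSON, so the response can be parsed to confirm the form data arrived; a sketch assuming httpbin.org is reachable:
import json
import urllib.parse
import urllib.request
data = bytes(urllib.parse.urlencode({"word": "hello"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
# the submitted form fields come back under the "form" key
result = json.loads(response.read().decode("utf-8"))
print(result["form"])   # {'word': 'hello'}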
Catch a timeout exception:
import socket
import urllib.error
import urllib.request
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")
Get the response type:
import urllib.request
response = urllib.request.urlopen("http://www.python.org")
print(type(response))
#<class 'http.client.HTTPResponse'>
Status code and response headers:
import urllib.request
response = urllib.request.urlopen("http://www.python.org")
print(response.status)
print(response.getheaders())
print(response.getheader("Server"))
Construct a Request object:
import urllib.request
request = urllib.request.Request("http://python.org")
response = urllib.request.urlopen(request)
print(response.read().decode("utf-8"))
A complete construction:
from urllib import request, parse
url = "http://www.python.org/post"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
"Host": "httpbin.org"
}
dict = {
    "name": "Germey"
}
data = bytes(parse.urlencode(dict), encoding="utf8")
req = request.Request(url=url, data=data, headers=headers, method="POST")
response = request.urlopen(req)
print(response.read().decode("utf-8"))
An alternative implementation (adding the header after constructing the Request):
from urllib import request, parse
url = "http://httpbin.org/post"
form = {
    "name": "Germey"
}
data = bytes(parse.urlencode(form), encoding="utf8")
req = request.Request(url=url, data=data, method="POST")
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36")
response = request.urlopen(req)
print(response.read().decode("utf-8"))
Proxies (rotate IPs so the crawler does not get blocked):
import urllib.request
proxy_handler = urllib.request.ProxyHandler({
    "http": "http://222.222.169.60:53281",
    "https": "http://222.222.169.60:53281"
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open("http://www.baidu.com")
print(response.read())
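The opener can also be installed globally so that plain urlopen() calls go through the proxy too; a sketch using install_opener (the proxy address is the sample one above and is unlikely to still be alive):
import urllib.request
proxy_handler = urllib.request.ProxyHandler({
    "http": "http://222.222.169.60:53281"
})
opener = urllib.request.build_opener(proxy_handler)
# after install_opener, every urllib.request.urlopen() call
# routes through this opener's proxy
urllib.request.install_opener(opener)
response = urllib.request.urlopen("http://www.baidu.com")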
Get cookies:
import http.cookiejar
import urllib.request
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name+"="+item.value)

Save cookies to a text file:

import http.cookiejar
import urllib.request
filename = "cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
cookie.save(ignore_discard=True, ignore_expires=True)
Another way to save them (LWPCookieJar writes the libwww-perl Set-Cookie3 format rather than the Mozilla cookies.txt format):
import http.cookiejar
import urllib.request
filename = "cookie.txt"
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
cookie.save(ignore_discard=True, ignore_expires=True)
Access a page using the cookies saved above:
import http.cookiejar
import urllib.request
cookie = http.cookiejar.LWPCookieJar()
cookie.load("cookie.txt", ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
print(response.read().decode("utf-8"))
Exception handling:
from urllib import request,error
try:
    response = request.urlopen("http://heiheiyiqing.com/index.htm")
except error.URLError as e:
    print(e.reason)

Detailed exception handling (HTTPError is a subclass of URLError, so it is caught first):

from urllib import request,error
try:
    response = request.urlopen("http://heiheiyiqing.com/index.htm")
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print("Request Successfully")

URL parsing:

from urllib.parse import urlparse
result = urlparse("https://www.baidu.com/index.php?tn=monline_3_dg")
print(type(result),result)
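The result is a named tuple, so each component can be read by name, and parse_qs turns the query string into a dictionary; a small sketch of both:
from urllib.parse import urlparse, parse_qs
result = urlparse("https://www.baidu.com/index.php?tn=monline_3_dg")
# ParseResult fields: scheme, netloc, path, params, query, fragment
print(result.scheme)           # https
print(result.netloc)           # www.baidu.com
print(result.path)             # /index.php
print(parse_qs(result.query))  # {'tn': ['monline_3_dg']}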

URL assembly:

from urllib.parse import urlunparse
data = ["http","www.baidu.com", "index.html", "user", "a=6", "comment"]
print(urlunparse(data))
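The call above assembles the six components (scheme, netloc, path, params, query, fragment) into http://www.baidu.com/index.html;user?a=6#comment. For resolving a relative link against a base URL, urllib.parse also offers urljoin; a small sketch:
from urllib.parse import urljoin
# a relative link is resolved against the base URL
print(urljoin("http://www.baidu.com/index.html", "about.html"))
# http://www.baidu.com/about.html
# an absolute second argument simply wins
print(urljoin("http://www.baidu.com", "https://www.python.org/FAQ.html"))
# https://www.python.org/FAQ.html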

URL encoding:

from urllib.parse import urlencode
params = {
    "name": "yiqing",
    "age": "18"
}
base_url = "http://www.baidu.com?"
url = base_url+urlencode(params)
print(url)
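urlencode handles whole dictionaries; for percent-encoding a single component such as a Chinese keyword, urllib.parse also provides quote and unquote; a small sketch:
from urllib.parse import quote, unquote
keyword = quote("爬蟲")  # percent-encode one component
url = "https://www.baidu.com/s?wd=" + keyword
print(url)            # https://www.baidu.com/s?wd=%E7%88%AC%E8%9F%B2
print(unquote(url))   # https://www.baidu.com/s?wd=爬蟲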