
Crawler 1: urllib, Request, opener, proxy

1. urllib (requesting a URL directly)

from urllib import request
with request.urlopen('http://www.runoob.com') as f:
    if f.status == 200:  # f.status is the status code (200); f.reason is the reason phrase ("OK")
        data = f.read()  # read the response body, which comes back as bytes
        # print(data.decode())
        # print(f.getheaders())  # the response headers, returned as a list of tuples
        # for k, v in f.getheaders():
        #     print(k, v)

        try:  # write the crawled data to a file
            with open('first.html', 'w+') as fp:
                fp.write(data.decode())
        except Exception as ex:
            print(ex)
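As a variation on the block above, the same direct request can be made with a timeout and the response headers inspected; the timeout value and the error handling below are illustrative assumptions, not part of the original:

from urllib import request, error

try:
    # timeout is in seconds; urlopen raises URLError if the connection fails or times out
    with request.urlopen('http://www.runoob.com', timeout=10) as f:
        print(f.status, f.reason)            # e.g. 200 OK
        print(f.getheader('Content-Type'))   # a single response header
        headers = dict(f.getheaders())       # all response headers as a dict
except error.URLError as err:
    print(err)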

2. Request (imitating different browsers via different request headers)

To simulate a browser sending a GET request, we need to use a Request object. By adding HTTP headers to the Request object, we can disguise the request as coming from a browser; different browsers send different User-Agent headers when they make requests.

from urllib import request,parse
import random

url='http://www.runoob.com'
query_obj={"s":"js"}
query_string=parse.urlencode(query_obj)  # GET parameters must be urlencode()d before being appended to the URL
url=url+"/?"+query_string
# print(url)  # http://www.runoob.com/?s=js

req=request.Request(url)
ua_list = [  # User-Agent strings for different browsers
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]

user_agent=random.choice(ua_list)  # pick a User-Agent at random from the list
req.add_header('User-Agent',user_agent)  # impersonate a randomly chosen browser

# print(dir(req))
# print(req.full_url)  # the complete URL of the request
# print(req.headers['User-agent'])  # a request header; req.get_header('User-agent') also works

with request.urlopen(req) as f:
    data=f.read()
    print(data.decode())
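An equivalent sketch: instead of calling add_header, the User-Agent can also be passed through the headers argument of the Request constructor (this reuses the url and ua_list defined above):

from urllib import request
import random

user_agent = random.choice(ua_list)                             # same random pick as above
req = request.Request(url, headers={'User-Agent': user_agent})  # headers supplied at construction time
with request.urlopen(req) as f:
    print(f.read().decode())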

3. opener (sending requests out through different exits)

An opener is an instance of urllib.request.OpenerDirector. The urlopen we have been using all along is a special opener that the module builds for us, but the basic urlopen() does not support proxies, cookies, or other advanced HTTP/HTTPS features.

from urllib import request,parse,error
import random
import json
import ssl
ssl._create_default_https_context = ssl._create_unverified_context  # work around HTTPS sites whose certificates fail verification

url="https://www.meishij.net/chufang/diy/wancan/?&"

qs={
    "page":2
}
url=url+parse.urlencode(qs)
req=request.Request(url)

ua_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]

# pick a User-Agent at random from the list
user_agent=random.choice(ua_list)
req.add_header('User-Agent',user_agent)

# build an HTTPSHandler object so HTTPS requests can be handled (debuglevel=1 prints the HTTP traffic)
http_handler = request.HTTPSHandler(debuglevel=1)

# call request.build_opener() to create an opener object that uses this handler
opener = request.build_opener(http_handler)  # requests now leave through this opener instead of the default one

try:
    with opener.open(req) as f:   # previously this was request.urlopen(req)
        data = f.read()           # data is bytes here

        with open('ttt.json', 'wb') as fp:  # with 'w+' you would pass encoding='utf-8' and write data.decode() instead
            fp.write(data)

except error.HTTPError as err:  # HTTP-level errors (bad status codes)
    pass
except error.URLError as err:   # URL-level errors (unreachable host, malformed URL)
    pass
except Exception as err:
    pass
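The same build_opener pattern covers the cookie support mentioned above that plain urlopen() lacks; a minimal sketch using http.cookiejar (the target URL is just a placeholder):

from urllib import request
from http import cookiejar

cj = cookiejar.CookieJar()                                             # stores cookies set by the server
cookie_opener = request.build_opener(request.HTTPCookieProcessor(cj))

with cookie_opener.open('http://www.runoob.com') as f:
    f.read()

for cookie in cj:                                                      # cookies received during this session
    print(cookie.name, cookie.value)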

4. proxy (proxy IPs)

The ProxyHandler handler (proxy configuration).

Using proxy IPs is the second big weapon in the crawler vs. anti-crawler fight, and usually the most effective one.

Many sites track how often a given IP visits them within a certain window (via traffic statistics, system logs, and so on); if the volume does not look like a normal user, they block that IP.

So we can set up a pool of proxy servers and switch to a different one every so often; even if one IP is banned, we can swap in another and keep crawling.

Going through a proxy adds latency, so a fast network connection helps.

from urllib import request,parse,error
import random
import json
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

url="https://www.meishij.net/chufang/diy/wancan/?&"

qs={
    "page":2
}

url=url+parse.urlencode(qs)
req=request.Request(url)

ua_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]

# pick a User-Agent at random from the list
user_agent=random.choice(ua_list)
req.add_header('User-Agent',user_agent)


proxy_list = [
    {"https" : "116.192.167.32:32267"},
    {"https" : "14.117.176.252:808"},
    {"https" : "121.31.140.130:8123"}

]

# pick a proxy at random
proxy = random.choice(proxy_list)

http_handler = request.ProxyHandler(proxy)
opener = request.build_opener(http_handler)

try:
    with opener.open(req) as f:
        data = f.read()
        with open('a.json', 'wb') as fp:
            fp.write(data)
except error.HTTPError as err:
    print(err)
except error.URLError as err:
    print(err)
except Exception as err:
    print(err)
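If every later call to request.urlopen should also go through the chosen proxy, the opener can be installed globally; a sketch under the assumption of a placeholder proxy address (user, password, host, and port below are not real values):

from urllib import request

# "user:password@host:port" is accepted for proxies that require authentication (placeholder values)
proxy_handler = request.ProxyHandler({'https': 'https://user:password@127.0.0.1:8080'})
request.install_opener(request.build_opener(proxy_handler))  # plain request.urlopen() now uses this proxy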