Python實現爬蟲設定代理IP和偽裝成瀏覽器的方法分享
阿新 • 發佈:2019-01-03
Python實現爬蟲設定代理IP和偽裝成瀏覽器的方法分享
1.python爬蟲瀏覽器偽裝
1 2 3 4 5 6 7 8 9 10 11 12 |
# Disguise the crawler as a desktop browser by sending a Chrome User-Agent.
import urllib.request

# (header-name, header-value) pair used to masquerade as a real browser.
user_agent = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")

# Build an opener and attach the header to every request it makes.
opener = urllib.request.build_opener()
opener.addheaders = [user_agent]

# Install the opener globally so a plain urlopen() call uses it too.
urllib.request.install_opener(opener)

# Fetch the page, ignoring any bytes that cannot be decoded as UTF-8.
data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
|
2.設定代理
1 2 3 4 5 6 7 8 9 10 |
# Route all requests through an HTTP proxy.
import urllib.request

# Proxy server, "ip:port".
proxy_addr = "122.241.72.191:808"

# Fix from the original: the classes are ProxyHandler / HTTPHandler —
# "ProxyHandle" / "HTTPHandle" do not exist and raise AttributeError.
# ProxyHandler takes a dict mapping scheme -> proxy address.
proxy = urllib.request.ProxyHandler({'http': proxy_addr})

# Build an opener that uses the proxy and install it globally.
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)

# Fetch the page through the proxy, ignoring undecodable bytes.
data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
|
3.同時設定用代理和模擬瀏覽器訪問
1 2 3 4 5 6 7 8 9 10 11 12 13 14 |
# Combine a proxy with a browser User-Agent on the same request.
import urllib.request

# Proxy server, "ip:port".
proxy_addr = "122.241.72.191:808"

# Build the request and disguise it as a desktop Chrome browser.
# (Fix: the original User-Agent string literal was truncated and never closed.)
req = urllib.request.Request(url)
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")

# Fixes from the original:
#  - ProxyHandler takes a dict {scheme: proxy}, not a bare "http":proxy_addr
#    (which is a syntax error);
#  - the classes are ProxyHandler / HTTPHandler, not ProxyHandle / HTTPHandle.
proxy = urllib.request.ProxyHandler({'http': proxy_addr})
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)

# Fetch through the proxy with the disguised request.
data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
|
4.在請求頭中新增多個資訊
1 2 3 4 5 6 7 |
import urllib.request

# Send several header fields at once by passing a dict to Request().
request_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Host": "www.baidu.com",
    "Cookie": "xxxxxxxx",
}

# Build the request with the headers attached, then fetch the page.
req = urllib.request.Request(url, headers=request_headers)
data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
|
5.新增post請求引數
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
import urllib.request
import urllib.parse

# POST body: form-encode the page number and the search keywords.
page_data = urllib.parse.urlencode({
    'pn': page_num,
    'kd': keywords,
})

# Full browser-like header set: UA, connection, host/origin, session
# cookie, AJAX markers and the referer the site expects.
page_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
    'Connection': 'keep-alive',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Cookie': 'JSESSIONID=ABAAABAABEEAAJA8F28C00A88DC4D771796BB5C6FFA2DDA; user_trace_token=20170715131136-d58c1f22f6434e9992fc0b35819a572b',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98?labelWords=&fromSearch=true&suginput=',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest',
}

# Build the request; passing data= to urlopen makes it a POST.
req = urllib.request.Request(url, headers=page_headers)
data = urllib.request.urlopen(req, data=page_data.encode('utf-8')).read().decode('utf-8')
|
6.利用phantomjs模擬瀏覽器請求
1 2 3 4 5 6 7 8 9 |
# Fetch a page with the PhantomJS headless browser.
# 1. Download phantomjs, install it locally and add it to the PATH first.
from selenium import webdriver

browser = webdriver.PhantomJS()

# Navigate to the target url.
browser.get(url)

# Grab the rendered HTML source of the page.
url_data = browser.page_source

# Save a screenshot of the rendered page as an image file.
browser.get_screenshot_as_file(filename)
|
7.phantomjs設定user-agent和cookie
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Copy PhantomJS's default capabilities and override its User-Agent.
caps = dict(DesiredCapabilities.PHANTOMJS)
caps["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"
)

browser = webdriver.PhantomJS(desired_capabilities=caps)
browser.get(url)

# Start from a clean cookie jar.
browser.delete_all_cookies()

# Cookie format: copy the values from your browser's cookie viewer;
# each cookie needs the domain, name, value and path fields.
cookie = {
    'domain': '.www.baidu.com',  # note the leading dot
    'name': 'xxxx',
    'value': 'xxxx',
    'path': 'xxxx'
}

# Hand the cookie to PhantomJS.
browser.add_cookie(cookie)
|
8.利用web_driver工具
1 2 3 4 5 6 |
# Drive a real browser through its web driver.
# 1. Download the web driver (e.g. chromdriver.exe) matching your browser.
# 2. Place chromdriver.exe in some directory, e.g. C:\chromdriver.exe
from selenium import webdriver

# Fix: use a raw string for the Windows path — in "C:\chromdriver.exe" the
# sequence "\c" is an invalid escape (DeprecationWarning today, a syntax
# error in future Python versions).
driver = webdriver.Chrome(executable_path=r"C:\chromdriver.exe")

# Open the target url in the driven browser.
driver.get(url)
|