程式人生 > Urllib模塊使用

Urllib模塊使用

標籤:保存 file 構建 live 使用 print lib urlencode user

Urllib2基礎操作

1、打開網頁(urlopen)

打開一個網頁

import urllib2
response = urllib2.urlopen(‘http://www.baidu.com‘)
html= response.read()
print html

urlopen一般常用的有三個參數,它的參數如下:

urllib.request.urlopen(url, data, timeout)

data參數的使用(GET)

import urllib
import urllib2

# GET: urlencode the parameters and append them to the query string.
uri = 'http://www.example.com/login'  # target URL (undefined in the original snippet)
data = {'email': 'myemail', 'password': 'password'}
# Bug fix: the original called urllib.urlencode(params) before `params` existed;
# the dict to encode is `data`.
params = urllib.urlencode(data)
response = urllib.urlopen("%s?%s" % (uri, params))
code = response.getcode()  # HTTP status code of the response

data參數的使用(POST)

import urllib
import urllib2

# POST: passing the urlencoded parameters as the second argument
# makes urlopen issue a POST request instead of a GET.
uri = 'http://www.example.com/login'  # target URL (undefined in the original snippet)
data = {'email': 'myemail', 'password': 'password'}
params = urllib.urlencode(data)
response = urllib.urlopen(uri, params)
code = response.getcode()  # HTTP status code of the response

所以如果我們添加data參數的時候就是以post請求方式請求,如果沒有data參數就是get請求方式

timeout參數的使用

在某些網絡情況不好或者服務器端異常的情況會出現請求慢的情況,請求設置一個超時時間

import urllib2

# Give up if the server does not answer within 1 second
# (avoids hanging on slow networks or an unresponsive server).
response = urllib2.urlopen('http://www.baidu.com', timeout=1)
print(response.read())

2、打開網頁(request)

打開一個網頁

import urllib.request

# Python 3 style: build a Request object first, then open it.
request = urllib.request.Request('https://www.baidu.com')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))

指定請求頭

import urllib2

# 制定請求頭
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)"}

# 封裝請求
request = urllib2.Request(url=url, headers=headers)
response = urllib2.urlopen(request)
content = response.read().decode(‘utf-8‘)
print content

3、進階

增加代理

# Custom request headers.
headers = {
    'Host': 'www.dianping.com',
    # Bug fix: the original was missing the comma after this entry, which
    # concatenated the Cookie value with the 'User-Agent' key and broke the dict.
    'Cookie': 'JSESSIONID=F1C38C2F1A7F7BF3BCB0C4E3CCDBE245 aburl=1; cy=2;',
    'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
    }

url = 'http://www.dianping.com'  # target URL (undefined in the original snippet)

# Route every subsequent urlopen() call through the given HTTP proxy.
proxy_handler = urllib2.ProxyHandler({'http': 'http://host:port'})
opener = urllib2.build_opener(proxy_handler)
urllib2.install_opener(opener)
request = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(request)
content = response.read().decode('utf-8')

操作cookie

import urllib2
import cookielib
import json

cookie = cookielib.CookieJar()
cookie_s = urllib2.HTTPCookieProcessor(cookie)  # 創建cookie處理器
opener = urllib2.build_opener(cookie_s)
# 構建opener
urllib2.install_opener(opener)
response= urllib2.urlopen(‘http://www.dianping.com‘).read()  # 讀取指定網站的內容  cj = urllib2.HTTPCookieProcessor(cookie)
print response    # 網頁HTML

# 查看cookie
print cookie, type(cookie)
for item in cookie:
    print ‘name:‘ + item.name + ‘-value:‘ + item.value

保存cookie

def saveCookie():
    """Fetch a page and persist the cookies it sets to cookie.txt."""
    # File the cookies will be written to.
    filename = 'cookie.txt'
    # MozillaCookieJar can serialize its cookies to a Mozilla/Netscape-format file.
    cookie = cookielib.MozillaCookieJar(filename)
    # Handler that records response cookies into the jar.
    handler = urllib2.HTTPCookieProcessor(cookie)
    # Opener that uses the cookie handler.
    opener = urllib2.build_opener(handler)
    # Make the request; any Set-Cookie headers land in the jar.
    res = opener.open('http://www.baidu.com')
    # ignore_discard: also save session cookies that would normally be discarded.
    # ignore_expires: also save cookies that have already expired.
    cookie.save(ignore_discard=True, ignore_expires=True)

在文件中取出cookie

def getCookie():
    # 創建一個MozillaCookieJar對象
    cookie = cookielib.MozillaCookieJar()
    # 從文件中的讀取cookie內容到變量
    cookie.load(‘cookie.txt‘, ignore_discard=True, ignore_expires=True)
    # 打印cookie內容,證明獲取cookie成功
    for item in cookie:
        print ‘name:‘ + item.name + ‘-value:‘ + item.value
    # 利用獲取到的cookie創建一個opener
    handler = urllib2.HTTPCookieProcessor(cookie)
    opener = urllib2.build_opener(handler)
    res = opener.open(‘http://www.baidu.com‘)
    print res.read()

來個實例

def my_cookie_test():
    headers = {
        ‘User-Agent‘: "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
        ‘Accept‘: ‘text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8‘,
        ‘Accept-Language‘: ‘zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4‘,
        ‘Connection‘: ‘keep-alive‘,
        ‘Cookie‘: ‘cy=2; _lxsdk_cuid=16000a1a16cc8-0629d2ca3b9f7-40544230-100200-16000a1a16dc8; _lxsdk=16000a1a16cc8-0629d2ca3b9f7-40544230-100200-16000a1a16dc8; _lxsdk_s=16000a1a16f-c56-870-2aa%7C%7C23; _hc.v=44792549-7147-7394-ac0a-eefed1fa19a2.1511839081; s_ViewType=10‘,
        ‘Host‘: ‘www.dianping.com‘,
        ‘Referer‘: ‘http://www.dianping.com/shop‘,
        ‘Upgrade-Insecure-Requests‘: 1
    }
    # 請求cookie
    cj_a = cookielib.CookieJar()
    cj_s = urllib2.HTTPCookieProcessor(cj_a)
    proxy_s = urllib2.ProxyHandler({‘http‘: ‘0.0.0.0:8080‘})
    opener = urllib2.build_opener(proxy_s, cj_s)
    urllib2.install_opener(opener)
    try:
        request = urllib2.Request("http://www.dianping.com/shop/000000/", headers=headers)
        response = urllib2.urlopen(request)
        content = response.read().decode(‘utf-8‘)
        # HTML
        print content
        cookie_data = {}
        for item in cj_a:
            # print ‘請求之後:name:‘ + item.name + ‘-value:‘ + item.value
            cookie_data[item.name] = item.value
        cookie_str = json.dumps(cookie_data)
        with open(‘cookie.txt‘, ‘w‘) as f:
            f.write(cookie_str)
        print("cookies信息已保存到本地")
    except Exception as e:
        print e

網頁信息抽取。。。待下期。。。

Urllib模塊使用