
Web Scraping with Python, Part 3: Form Handling

import urllib
import urllib2

LOGIN_URL = 'http://example.webscraping.com/user/login'
LOGIN_EMAIL = '[email protected]'
LOGIN_PASSWORD = 'qq123456'

data = {'email': LOGIN_EMAIL, 'password': LOGIN_PASSWORD}
encoded_data = urllib.urlencode(data)  # percent-encodes each field (unsafe bytes become % plus their hex value) and joins the pairs with &
request = urllib2.Request(LOGIN_URL, encoded_data)  # supplying a body makes this a POST request
response = urllib2.urlopen(request)
response.geturl()  # on a successful login the server redirects to the home page; otherwise we land back on the login page
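The outcome can also be checked programmatically by comparing the final URL against LOGIN_URL; a minimal sketch, assuming the site redirects away from the login page on success:

# minimal success check (assumption: success redirects away from /user/login)
if response.geturl() == LOGIN_URL:
    print 'Login failed: still on the login page'
else:
    print 'Logged in, redirected to:', response.geturl()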
# Besides the email and password, the form submits several other fields; they are hidden on the page, but lxml can read them
import lxml.html
def parse_form(html):
    tree = lxml.html.fromstring(html)
    data = {}
    for e in tree.cssselect('form input'):
        if e.get('name'):
            data[e.get('name')] = e.get('value')
    return data
# parse_form iterates over every input tag inside the form and returns the name/value attribute pairs as a dict
import pprint
html = urllib2.urlopen(LOGIN_URL).read()
form = parse_form(html)
pprint.pprint(form)
# _formkey: the server uses this unique token to prevent a form from being submitted more than once;
# every page load generates a fresh token, and the server checks it to decide whether the form was already submitted
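To see that the token really does change, load the login page twice and compare (a small sketch reusing the parse_form helper defined above):

# each load should print a different _formkey (assumption: the server issues a fresh token per request)
print parse_form(urllib2.urlopen(LOGIN_URL).read())['_formkey']
print parse_form(urllib2.urlopen(LOGIN_URL).read())['_formkey']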
# Revised code that resubmits the hidden fields along with the credentials
html = urllib2.urlopen(LOGIN_URL).read()
data = parse_form(html)
data['email'] = LOGIN_EMAIL
data['password'] = LOGIN_PASSWORD
encoded_data = urllib.urlencode(data)
request = urllib2.Request(LOGIN_URL,encoded_data)
response = urllib2.urlopen(request)
response.geturl()
# This still fails, because one crucial piece is missing: cookies. The server ties the _formkey to the
# session cookie it set when the login page was loaded, so the token must be submitted over the same session.
import cookielib
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
html = opener.open(LOGIN_URL).read()
data = parse_form(html)
data['email'] = LOGIN_EMAIL
data['password'] = LOGIN_PASSWORD
encoded_data = urllib.urlencode(data)
request = urllib2.Request(LOGIN_URL, encoded_data)
response = opener.open(request)
response.geturl()
# With cookie support added via urllib2.HTTPCookieProcessor, the login now succeeds
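For reference, the same cookie-aware login in Python 3, where urllib2 and cookielib became urllib.request and http.cookiejar (a sketch of the equivalent flow, not tested against the example site):

import http.cookiejar
import urllib.parse
import urllib.request

cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
html = opener.open(LOGIN_URL).read()
data = parse_form(html)
data['email'] = LOGIN_EMAIL
data['password'] = LOGIN_PASSWORD
encoded_data = urllib.parse.urlencode(data).encode('utf-8')  # Python 3 requires bytes for the request body
response = opener.open(LOGIN_URL, encoded_data)
response.geturl()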
import mechanize  # automates form parsing and filling, handling the hidden fields and cookies for us
br = mechanize.Browser()
br.open(LOGIN_URL)
br.select_form(nr=0)
br['email'] = LOGIN_EMAIL
br['password'] = LOGIN_PASSWORD
response = br.submit()
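As before, the redirect target tells us whether the login worked (a quick check under the same assumption about the site's redirect behaviour):

print 'Logged in:', response.geturl() != LOGIN_URL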
COUNTRY_URL = 'http://example.webscraping.com/edit/United-Kingdom-239'
br.open(COUNTRY_URL)
br.select_form(nr=0)
print 'Population before:', br['population']
br['population'] = str(int(br['population']) + 1)
br.submit()
br.open(COUNTRY_URL)  # reload the page to confirm the edit was saved
br.select_form(nr=0)
print 'Population after:', br['population']
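The login flow is also straightforward with the third-party requests library, whose Session object carries cookies across requests automatically; a minimal sketch of the login step reusing parse_form (an alternative to the urllib2 code above, assuming requests is installed):

import requests

session = requests.Session()  # cookies set by the server are stored and resent automatically
html = session.get(LOGIN_URL).text
data = parse_form(html)
data['email'] = LOGIN_EMAIL
data['password'] = LOGIN_PASSWORD
response = session.post(LOGIN_URL, data)  # requests urlencodes the dict for us
print 'Logged in:', response.url != LOGIN_URL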