用Python寫網路爬蟲系列(三)表單處理
阿新 • • 發佈:2019-01-05
# Form handling for a web scraper (Python 2): logging in to
# example.webscraping.com with urllib2, then with cookie support, and
# finally automating form submission with mechanize.

import cookielib
import pprint
import urllib
import urllib2

import lxml.html
import mechanize

LOGIN_URL = r'http://example.webscraping.com/user/login'
# NOTE(review): this address is an e-mail-obfuscation placeholder left by the
# page this script was copied from -- replace it with the real account e-mail.
LOGIN_EMAIL = '[email protected]'
LOGIN_PASSWORD = 'qq123456'
COUNTRY_URL = 'http://example.webscraping.com/edit/United-Kingdom-239'


def parse_form(html):
    """Return the named ``<input>`` fields found inside forms in *html*.

    Every ``form input`` element that has a non-empty ``name`` attribute
    becomes one entry of the returned dict, mapping that name to the
    element's ``value`` attribute (``None`` when the attribute is absent).
    Inputs without a name are skipped.
    """
    tree = lxml.html.fromstring(html)
    data = {}
    for e in tree.cssselect('form input'):
        name = e.get('name')
        if name:
            data[name] = e.get('value')
    return data


# --- Attempt 1: post only the e-mail and password -------------------------
data = {'email': LOGIN_EMAIL, 'password': LOGIN_PASSWORD}
# urlencode percent-encodes the values: non-ASCII characters are converted to
# hexadecimal bytes, each prefixed with the '%' marker.
encoded_data = urllib.urlencode(data)
request = urllib2.Request(LOGIN_URL, encoded_data)
response = urllib2.urlopen(request)
# On a successful login the server redirects to the home page; otherwise it
# redirects back to the login page.
response.geturl()

# Besides the e-mail and password, several other fields have to be submitted.
# They are hidden in the page, but can be read with lxml (see parse_form).
html = urllib2.urlopen(LOGIN_URL).read()
form = parse_form(html)
pprint.pprint(form)
# _formkey: the server uses this unique ID to keep a form from being submitted
# more than once. Every page load produces a different ID, and the server uses
# it to decide whether the form has already been submitted.

# --- Attempt 2: also submit the hidden fields -----------------------------
html = urllib2.urlopen(LOGIN_URL).read()
data = parse_form(html)
data['email'] = LOGIN_EMAIL
data['password'] = LOGIN_PASSWORD
encoded_data = urllib.urlencode(data)
request = urllib2.Request(LOGIN_URL, encoded_data)
response = urllib2.urlopen(request)
response.geturl()
# This still does not work, because one important piece is missing: cookies.

# --- Attempt 3: same as above, with cookie support ------------------------
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
html = opener.open(LOGIN_URL).read()
data = parse_form(html)
data['email'] = LOGIN_EMAIL
data['password'] = LOGIN_PASSWORD
encoded_data = urllib.urlencode(data)
request = urllib2.Request(LOGIN_URL, encoded_data)
response = opener.open(request)
response.geturl()
# With urllib2.HTTPCookieProcessor adding cookie support, the login succeeds.

# --- Automated form filling with mechanize --------------------------------
br = mechanize.Browser()
br.open(LOGIN_URL)
br.select_form(nr=0)
br['email'] = LOGIN_EMAIL
br['password'] = LOGIN_PASSWORD
response = br.submit()

# The URL must be passed positionally: Browser.open() has no COUNTRY_URL
# keyword argument, so the original `br.open(COUNTRY_URL = ...)` call failed.
br.open(COUNTRY_URL)
br.select_form(nr=0)
print('Population before: %s' % br['population'])
br['population'] = str(int(br['population']) + 1)
br.submit()

# Reload the edit page to confirm that the incremented value was stored.
br.open(COUNTRY_URL)
br.select_form(nr=0)
print('Population after: %s' % br['population'])