如何爬取需要 cookie 登入與驗證碼的頁面
阿新 • • 發佈:2018-12-12
例項:
需求:獲取人人網使用者登入過後的個人主頁資料
# YunDaMa (yundama.com) captcha-recognition platform client.
# The article presents this listing ("YunDaMa platform code") as a drop-in
# download that is meant to be used as-is.
import contextlib
import http.client
import json
import mimetypes
import time
import urllib


class YDMHttp:
    """Minimal HTTP client for the YunDaMa captcha-recognition API.

    Every API call is a form POST to ``apiurl`` whose ``method`` field selects
    the operation. Replies are JSON objects. Following the API's convention,
    methods return negative integers as error codes instead of raising.
    """

    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    # Returned by this client when the API reply is empty.
    ERR_EMPTY_RESPONSE = -9001
    # Returned by ``decode`` when no text arrived within the polling window.
    ERR_TIMEOUT = -3003

    def __init__(self, username, password, appid, appkey):
        """Store the account and application credentials.

        ``appid`` is converted to ``str`` because it is sent as a form field.
        """
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _credentials(self, method, **extra):
        """Build the form fields shared by every call, plus ``extra`` ones."""
        fields = {
            'method': method,
            'username': self.username,
            'password': self.password,
            'appid': self.appid,
            'appkey': self.appkey,
        }
        fields.update(extra)
        return fields

    def _field_or_error(self, response, key):
        """Return ``response[key]``, or the error code the reply carries.

        An empty reply yields ``ERR_EMPTY_RESPONSE``; a negative ``ret`` is
        passed through unchanged.
        """
        if not response:
            return self.ERR_EMPTY_RESPONSE
        ret = response['ret']
        if ret and ret < 0:
            return ret
        return response[key]

    def request(self, fields, files=None):
        """POST ``fields`` (and optional ``files``) and return the decoded JSON."""
        response = self.post_url(self.apiurl, fields, files)
        return json.loads(response)

    def balance(self):
        """Return the account balance, or a negative error code."""
        response = self.request(self._credentials('balance'))
        return self._field_or_error(response, 'balance')

    def login(self):
        """Log in and return the user id, or a negative error code."""
        response = self.request(self._credentials('login'))
        return self._field_or_error(response, 'uid')

    def upload(self, filename, codetype, timeout):
        """Upload the image at ``filename`` and return its captcha id (cid).

        ``codetype`` and ``timeout`` are sent as strings. A negative return
        value is an error code.
        """
        data = self._credentials(
            'upload', codetype=str(codetype), timeout=str(timeout)
        )
        response = self.request(data, {'file': filename})
        return self._field_or_error(response, 'cid')

    def result(self, cid):
        """Return the recognised text for ``cid``, or ``''`` if there is none."""
        response = self.request(self._credentials('result', cid=str(cid)))
        if not response:
            return ''
        return response['text'] or ''

    def decode(self, filename, codetype, timeout):
        """Upload a captcha and poll for its text once per second.

        Returns ``(cid, text)``. If the upload fails, ``cid`` is the error
        code and ``text`` is ``''``. If nothing arrives after ``timeout``
        polls, returns ``(ERR_TIMEOUT, '')``.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid <= 0:
            return cid, ''
        for _ in range(timeout):
            text = self.result(cid)
            if text != '':
                return cid, text
            time.sleep(1)
        return self.ERR_TIMEOUT, ''

    def report(self, cid):
        """Report the result for ``cid`` with flag ``'0'`` and return ``ret``.

        Returns ``ERR_EMPTY_RESPONSE`` when the reply is empty.
        """
        response = self.request(
            self._credentials('report', cid=str(cid), flag='0')
        )
        if not response:
            return self.ERR_EMPTY_RESPONSE
        return response['ret']

    def post_url(self, url, fields, files=None):
        """POST ``fields`` to ``url`` and return the response body as text.

        ``files`` maps form field names to file paths. Each path is opened in
        binary mode for the upload and closed again before returning. The
        caller's mapping is left untouched.
        """
        # Imported here, the only place that needs it, so the reply-parsing
        # logic above can be imported without the third-party HTTP client.
        import requests

        with contextlib.ExitStack() as stack:
            opened = {
                name: stack.enter_context(open(path, 'rb'))
                for name, path in (files or {}).items()
            }
            res = requests.post(url, files=opened or None, data=fields)
            return res.text
# Listing caption from the article: "YunDaMa platform code 2".
def parse_codeImg(imgPath):
    """Recognise the captcha image at ``imgPath`` via YunDaMa and return its text.

    Logs in, prints the user id and account balance, submits the image for
    recognition, prints the outcome, and returns the recognised string.
    """
    # Account name and password registered on the YunDaMa site.
    username = 'xxxx'
    password = 'xxxx'

    # Software ID and software key, both obtained from "My Software" in the
    # developer console; required for the developer revenue share.
    appid = 6372
    appkey = '9b672eb204d7eede7ddeda5a87d7be08'

    # Captcha type, e.g. 1004 means four alphanumeric characters. Pricing
    # differs per type, and an inaccurate value hurts the recognition rate.
    # All types are listed at http://www.yundama.com/price.html
    codetype = 2004

    # Timeout in seconds.
    timeout = 30

    # Guard against running with unconfigured parameters.
    if username == 'username':
        print('請設定好相關引數再測試')
        return None

    client = YDMHttp(username, password, appid, appkey)

    # Log in to YunDaMa.
    user_id = client.login()
    print('uid: %s' % user_id)

    # Query the remaining balance.
    remaining = client.balance()
    print('balance: %s' % remaining)

    # Start recognition: image path, captcha type id, timeout in seconds.
    captcha_id, text = client.decode(imgPath, codetype, timeout)
    print('cid: %s, result: %s' % (captcha_id, text))
    return text
# Listing caption from the article: "main script".
# Logs in to renren.com with a recognised captcha, then saves the profile page.
import json
import time

import requests
from lxml import etree

# A Session object stores cookies automatically. Every request below goes
# through it so that cookies set while loading the page and the captcha image
# are sent along with the login request.
session = requests.session()

# Renren landing page.
url = 'http://www.renren.com'

# Spoofed browser headers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}
page_text = session.get(url=url, headers=headers).text

# Locate the captcha image and save it locally.
# NOTE(review): presumably the server ties the captcha to the cookies issued
# with it, which is why the same session must fetch it. Confirm against the site.
tree = etree.HTML(page_text)
code_img_url = tree.xpath('//img[@id="verifyPic_login"]/@src')[0]
code_img_data = session.get(url=code_img_url, headers=headers).content
with open('./code.png', 'wb') as fp:
    fp.write(code_img_data)
print("驗證碼儲存成功!!")

code_text = parse_codeImg('./code.png')
print(code_text)

# Log in to obtain the authenticated cookies.
# The URL and form data below were captured with the Fiddler proxy tool.
# NOTE(review): the account id and password hash are hard-coded here. Load
# them from configuration or the environment before reusing this script.
login_url = "http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=20181131725329"
data = {
    "email": "18526303496",
    "icode": code_text,
    "origURL": "http://www.renren.com/home",
    "domain": "renren.com",
    "key_id": "1",
    "captcha_type": "web_login",
    "password": "3f06abf49c06d3f2dfce6554f070677f2459a14159d738eb08f8f7922280f5b7",
    "rkey": "3ca02f6d93a15caf7d0c0b3637abf5a8",
    "f": 'http%3A%2F%2Fwww.renren.com%2F969092014',
}
# Post through the session so the login cookies are kept for later requests.
session.post(url=login_url, headers=headers, data=data)

# Fetch the personal profile page with the logged-in session and save it.
personoal_url = 'http://www.renren.com/969092014/profile'
page_text = session.get(url=personoal_url, headers=headers).text
with open('./renren.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print('over')