1. 程式人生 > 基於cookie登入+驗證碼如何爬取

基於cookie登入+驗證碼如何爬取

例項:

需求:獲取人人網使用者登入過後的個人主頁資料

 1 #雲打碼平臺登入,直接下載引用就好,無需更改
 2 import http.client, mimetypes, urllib, json, time, requests
 3 class YDMHttp:
 4 
 5     apiurl = 'http://api.yundama.com/api.php'
 6     username = ''
 7     password = ''
 8     appid = ''
 9     appkey = ''
10 
11     def __init__(self, username, password, appid, appkey):
12 self.username = username 13 self.password = password 14 self.appid = str(appid) 15 self.appkey = appkey 16 17 def request(self, fields, files=[]): 18 response = self.post_url(self.apiurl, fields, files) 19 response = json.loads(response) 20 return
response 21 22 def balance(self): 23 data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey} 24 response = self.request(data) 25 if (response): 26 if (response['ret'] and response['ret
'] < 0): 27 return response['ret'] 28 else: 29 return response['balance'] 30 else: 31 return -9001 32 33 def login(self): 34 data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey} 35 response = self.request(data) 36 if (response): 37 if (response['ret'] and response['ret'] < 0): 38 return response['ret'] 39 else: 40 return response['uid'] 41 else: 42 return -9001 43 44 def upload(self, filename, codetype, timeout): 45 data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)} 46 file = {'file': filename} 47 response = self.request(data, file) 48 if (response): 49 if (response['ret'] and response['ret'] < 0): 50 return response['ret'] 51 else: 52 return response['cid'] 53 else: 54 return -9001 55 56 def result(self, cid): 57 data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)} 58 response = self.request(data) 59 return response and response['text'] or '' 60 61 def decode(self, filename, codetype, timeout): 62 cid = self.upload(filename, codetype, timeout) 63 if (cid > 0): 64 for i in range(0, timeout): 65 result = self.result(cid) 66 if (result != ''): 67 return cid, result 68 else: 69 time.sleep(1) 70 return -3003, '' 71 else: 72 return cid, '' 73 74 def report(self, cid): 75 data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'} 76 response = self.request(data) 77 if (response): 78 return response['ret'] 79 else: 80 return -9001 81 82 def post_url(self, url, fields, files=[]): 83 for key in files: 84 files[key] = open(files[key], 'rb'); 85 res = requests.post(url, files=files, data=fields) 86 return res.text
雲打碼平臺程式碼
 1 def parse_codeImg(imgPath):
 2     # 使用者名稱
 3     username    = 'xxxx'#自己註冊的使用者名稱
 4 
 5     # 密碼
 6     password    = 'xxxx'    #自己註冊的密碼                     
 7 
 8     # 軟體ID,開發者分成必要引數。登入開發者後臺【我的軟體】獲得!
 9     appid       = 6372                                     
10 
11     # 軟體金鑰,開發者分成必要引數。登入開發者後臺【我的軟體】獲得!
12     appkey      = '9b672eb204d7eede7ddeda5a87d7be08'    
13 
14     # 圖片檔案
15     filename    = imgPath                       
16 
17     # 驗證碼型別,# 例:1004表示4位字母數字,不同型別收費不同。請準確填寫,否則影響識別率。在此查詢所有型別 http://www.yundama.com/price.html
18     codetype    = 2004
19 
20     # 超時時間,秒
21     timeout     = 30                                    
22 
23     # 檢查
24     if (username == 'username'):
25         print('請設定好相關引數再測試')
26     else:
27         # 初始化
28         yundama = YDMHttp(username, password, appid, appkey)
29 
30         # 登陸雲打碼
31         uid = yundama.login();
32         print('uid: %s' % uid)
33 
34         # 查詢餘額
35         balance = yundama.balance();
36         print('balance: %s' % balance)
37 
38         # 開始識別,圖片路徑,驗證碼型別ID,超時時間(秒),識別結果
39         cid, result = yundama.decode(filename, codetype, timeout);
40         print('cid: %s, result: %s' % (cid, result))
41         return result
雲打碼平臺程式碼2
 1 import requests
 2 from lxml import etree
 3 import json
 4 import time
 5 #建立一個session物件,會自動儲存cookie
 6 session=requests.session()
 7 #獲取人人網URL
 8 url='http://www.renren.com'
 9 #仿造headers
10 headers = {
11     'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
12 }
13 page_text=requests.get(url=url,headers=headers).text
14 #解析驗證碼圖片,儲存到本地
15 tree=etree.HTML(page_text)
16 code_img_url=tree.xpath('//img[@id="verifyPic_login"]/@src')[0]
17 code_img_data=requests.get(url=code_img_url,headers=headers).content
18 with open('./code.png','wb') as fp:
19     fp.write(code_img_data)
20     print("驗證碼儲存成功!!")
21 code_text=parse_codeImg('./code.png')
22 print(code_text)
23 #登入操作,獲取cookie
24 #此url以及data資料需要用Fidder工具抓包獲取
25 login_url="http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=20181131725329"
26 data={
27     "email":"18526303496",
28     "icode":code_text,
29     "origURL":"http://www.renren.com/home",
30     "domain":"renren.com",
31     "key_id":"1",
32     "captcha_type":"web_login",
33     "password":"3f06abf49c06d3f2dfce6554f070677f2459a14159d738eb08f8f7922280f5b7",
34     "rkey":"3ca02f6d93a15caf7d0c0b3637abf5a8",
35     "f":'http%3A%2F%2Fwww.renren.com%2F969092014'   
36 }
37 #使用session發起請求,將cookie儲存到session,保證請求成功,
38 session.post(url=login_url,headers=headers,data=data)
39 
40 #進行個人主頁頁面的資料爬取
41 personoal_url='http://www.renren.com/969092014/profile'
42 page_text=session.get(url=personoal_url,headers=headers).text
43 with open('./renren.html','w',encoding='utf-8')as fp:
44     fp.write(page_text)
45     print('over')
主程式碼