# coding:utf-8
import unittest
import time
from selenium import webdriver
from bs4 import BeautifulSoup
import _elementtree

class test(object):
    """Fill in and submit the example.webscraping.com registration form.

    This version types a fixed captcha answer; it only exercises the
    form-filling flow and prints the page that comes back after submitting.
    """

    def __init__(self):
        """Set the form values and start a Firefox WebDriver session."""
        self.first_name = 'hongyuan'  # given name
        self.last_name = 'guo'  # family name
        self.email = '[email protected]'  # e-mail address
        # The literal used to be broken across two lines (a syntax error);
        # it is kept identical to password_two below.
        self.password = '[email protected]'  # password
        self.password_two = '[email protected]'  # password, typed a second time
        self.response_field = 'other'  # captcha answer

        self.driver = webdriver.Firefox()

    def testEle(self):
        """Open the registration page, fill every field, submit, and print the result."""
        # Imported locally so this method brings its own locator constants.
        from selenium.webdriver.common.by import By

        driver = self.driver
        driver.maximize_window()
        driver.get("http://example.webscraping.com/places/default/user/register#")

        # (locator strategy, locator, text to type), in the order the fields are filled.
        form_fields = (
            (By.ID, "auth_user_first_name", self.first_name),
            (By.ID, "auth_user_last_name", self.last_name),
            (By.ID, "auth_user_email", self.email),
            (By.ID, "auth_user_password", self.password),
            (By.ID, "auth_user_password_two", self.password_two),
            (By.NAME, "recaptcha_response_field", self.response_field),
        )
        # find_element(by, value) replaces the find_element_by_* helpers,
        # which newer Selenium releases no longer provide.
        for by, locator, text in form_fields:
            driver.find_element(by, locator).send_keys(text)

        driver.find_element(By.CSS_SELECTOR, "input.btn").click()
        driver.switch_to.default_content()
        # Original author's note: the 'xml' or 'html' parser features also work here.
        soup = BeautifulSoup(driver.page_source, 'lxml')

        # Parenthesised so the line is valid on both Python 2 and Python 3.
        print(soup)



if __name__ == "__main__":
    q = test()
    try:
        q.testEle()
    finally:
        # Always end the WebDriver session so the browser process is not left running.
        q.driver.quit()

載入驗證碼影象

以上的結果說明基本框架已經搭好，我們只要從驗證碼中提取文字，提交表單即可。

通過以下程式碼便可獲得並檢視驗證碼圖片。

開始幾行使用 lxml 從表單中獲取影象資料。影象資料的字首定義了資料型別。在本例中，這是一張進行了 Base64 編碼的 PNG 影象，這種格式會使用 ASCII 編碼表示二進位制資料。我們可以通過在第一個逗號處分割的方法移除該字首。然後，使用 Base64 解碼影象資料，回到最初的二進位制格式。要想載入影象， PIL 需要一個類似檔案的介面，所以在傳給Image類之前，我們又使用了 BytesIO 對這個二進位制資料進行了封裝。

......
    def get_captcha(self, img_data):
        """Decode a base64 ``data:`` URI image and display it with matplotlib.

        img_data: the image source string; everything after its first comma
            is treated as the base64-encoded image bytes.
        """
        # Local import so the method works without touching the file's import block.
        import base64

        # Parenthesised so the line is valid on both Python 2 and Python 3.
        print(img_data)
        # Drop the prefix: keep only what follows the first comma.
        img = str(img_data).partition(',')[-1]
        # base64.b64decode runs on Python 2 and 3; str.decode('base64') exists only on Python 2.
        binary_img_data = base64.b64decode(img)
        # Image.open is given a file-like wrapper around the decoded bytes.
        file_like = BytesIO(binary_img_data)
        img_ = Image.open(file_like)
        plt.imshow(img_)
        plt.show()

    def testEle(self):
        """Open the registration page and pass the first <img> tag's src to get_captcha."""
        driver = self.driver
        driver.maximize_window()
        driver.get("http://example.webscraping.com/places/default/user/register#")
        # Parse the page source with the lxml parser.
        soup_1 = BeautifulSoup(driver.page_source, 'lxml')
        # find('img') yields the first <img> in the document; presumably that is
        # the captcha image — confirm against the page.
        img = soup_1. find('img').get('src')
        self.get_captcha(img)
......

光學字元識別

從圖片可以看出，驗證碼文字一般都是黑色的，背景則會更加明亮，所以我們可以通過檢查畫素是否為黑色將文字分離出來。該處理過程又被稱為閾值化。通過Pillow可以很容易地實現該處理過程。

儲存圖片

# Write the decoded captcha image to disk as a PNG file.
img_.save('capcha_original.png')

將原圖片轉化為灰度圖儲存，最後只將灰度圖中純黑的部分儲存。

...... 
    # Load the captcha image from disk.
    img = Image.open('capcha_original.png')
    # Convert to grayscale ('L' mode) and save a copy of that intermediate image.
    gray = img.convert('L')
    gray.save('capcha_gray.png')
    # Threshold: a pixel value below 1 (i.e. 0) maps to 0, every other value maps to 255;
    # the '1' argument asks for a 1-bit output image.
    bw = gray.point(lambda x: 0 if x < 1 else 255, '1')
    bw.save('capcha_thresholded.png')
......

使用tesseract分析驗證碼

tesseract以及pytesseract的配置見

https://blog.csdn.net/sinat_36053757/article/details/78136005

...... 
# Load the thresholded captcha image and run Tesseract OCR on it.
bw = Image.open('capcha_thresholded.png')
# Parenthesised so the line is valid on both Python 2 and Python 3.
print(pytesseract.image_to_string(bw))
......

完整註冊提交程式碼

執行，成功註冊

# coding:utf-8
import unittest
import time
from selenium import webdriver
from bs4 import BeautifulSoup
import _elementtree
from io import BytesIO
import lxml.html
from PIL import Image
import matplotlib.pyplot as plt
import pytesseract

class test(object):
    """Register on example.webscraping.com, reading the image captcha with Tesseract OCR."""

    def __init__(self):
        """Set the form values and start a Firefox WebDriver session."""
        self.first_name = 'hongyuan1'  # given name
        self.last_name = 'guo1'  # family name
        # NOTE(review): '*' looks like a masked value — supply real ones before running.
        self.email = '*'  # e-mail address
        self.password = '*'  # password
        self.password_two = '*'  # password, typed a second time
        self.response_field = 'other'  # captcha answer; testEle overwrites it with the OCR text

        self.driver = webdriver.Firefox()

    @staticmethod
    def _decode_data_uri(img_data):
        """Return the bytes encoded after the first comma of a base64 ``data:`` URI."""
        # Local import so the class works without touching the file's import block.
        import base64

        payload = str(img_data).partition(',')[-1]
        # base64.b64decode runs on Python 2 and 3; str.decode('base64') exists only on Python 2.
        return base64.b64decode(payload)

    def get_captcha(self, img_data):
        """Decode the captcha image, threshold it, and return the OCR text.

        img_data: the image source string (prefix, comma, base64 payload).

        Writes 'capcha_original.png', 'capcha_gray.png' and
        'capcha_thresholded.png' to the working directory as a side effect.
        Returns the recognised text with surrounding whitespace removed.
        """
        # Parenthesised so the line is valid on both Python 2 and Python 3.
        print(img_data)
        # Image.open is given a file-like wrapper around the decoded bytes.
        img_ = Image.open(BytesIO(self._decode_data_uri(img_data)))
        img_.save('capcha_original.png')
        gray = img_.convert('L')
        gray.save('capcha_gray.png')
        # Keep only pixels whose grayscale value is 0; everything else becomes white.
        bw = gray.point(lambda x: 0 if x < 1 else 255, '1')
        bw.save('capcha_thresholded.png')
        # Run OCR once and reuse the result (it used to run twice: print, then return).
        # Strip it because the text is later typed into a form field, where a
        # trailing newline (which some pytesseract versions may emit) could act as Enter.
        text = pytesseract.image_to_string(bw).strip()
        print(text)
        return text

    def testEle(self):
        """Open the registration page, solve the captcha, fill the form, submit, and print the result."""
        # Imported locally so this method brings its own locator constants.
        from selenium.webdriver.common.by import By

        driver = self.driver
        driver.maximize_window()
        driver.get("http://example.webscraping.com/places/default/user/register#")
        soup_1 = BeautifulSoup(driver.page_source, 'lxml')
        # The first <img> on the page is taken to be the captcha.
        img = soup_1.find('img').get('src')
        self.response_field = self.get_captcha(img)

        # (locator strategy, locator, text to type), in the order the fields are filled.
        form_fields = (
            (By.ID, "auth_user_first_name", self.first_name),
            (By.ID, "auth_user_last_name", self.last_name),
            (By.ID, "auth_user_email", self.email),
            (By.ID, "auth_user_password", self.password),
            (By.ID, "auth_user_password_two", self.password_two),
            (By.NAME, "recaptcha_response_field", self.response_field),
        )
        # find_element(by, value) replaces the find_element_by_* helpers,
        # which newer Selenium releases no longer provide.
        for by, locator, text in form_fields:
            driver.find_element(by, locator).send_keys(text)

        driver.find_element(By.CSS_SELECTOR, "input.btn").click()
        driver.switch_to.default_content()
        # Original author's note: the 'xml' or 'html' parser features also work here.
        soup_2 = BeautifulSoup(driver.page_source, 'lxml')

        print(soup_2)
if __name__ == "__main__":
    q = test()
    try:
        q.testEle()
    finally:
        # Always end the WebDriver session so the browser process is not left running.
        q.driver.quit()

結束語

1.這是用最low的方法解決最low的驗證碼，其他方法，博主會繼續探索

2.這個提交表單頁面選的失敗，幸好博主有兩個郵箱，一個測試，一個最後實戰，建議大家將帶驗證碼的網頁收集起來，多多分享

3.這裡使用driver驅動瀏覽器，一個缺點是慢，第二個是必須同樣用這種方法的爬蟲程式才能解決驗證碼問題，還得潛心研習js，盡情破解加密欄位。

爬蟲簡單驗證碼處理，Tesseract簡單使用

思路

測試

解析網頁

程式碼

載入驗證碼影象

光學字元識別

使用tesseract分析驗證碼

完整註冊提交程式碼

結束語

爬蟲簡單驗證碼處理，Tesseract簡單使用

爬蟲之簡單驗證碼處理

scrapy+python當你的爬蟲遇到驗證碼處理方式之一

python爬蟲實現登陸簡單圖片驗證碼識別（Tesseract識別）

JFinal框架——簡單驗證碼

java生成簡單驗證碼圖片

驗證碼識別（最簡單之印刷體數字）

網站登錄簡單驗證碼

python簡單驗證碼

Python之簡單驗證碼實現

隨機數random、簡單驗證碼隨機

QT實現簡單驗證碼

python模塊——random模塊（簡單驗證碼實現）

基於TensorFlow的簡單驗證碼識別

Windows虛擬地址轉物理地址（原理+源碼實現，附簡單小工具）

macOS python3 簡單驗證碼識別

用python擷取螢幕特定位置（具體class）的圖片（多用於爬蟲時遇到的驗證碼擷取，再進行反反爬）

那些年，我爬過的北科(八)——反反爬蟲之驗證碼識別

爬蟲-驗證碼處理

Python實現簡單驗證碼的轉文字

爬蟲簡單驗證碼處理，Tesseract簡單使用

思路

測試

解析網頁

程式碼

載入驗證碼影象

光學字元識別

使用tesseract分析驗證碼

完整註冊提交程式碼

結束語

相關推薦