python之驗證碼識別特征向量提取和余弦相似性比較

阿新 • • 發佈：2017-08-30

wow gif .get extra time ade upd orm log

0.目錄

1.參考
2.沒事畫個流程圖
3.完整代碼
4.改進方向

1.參考

https://en.wikipedia.org/wiki/Cosine_similarity

https://zh.wikipedia.org/wiki/%E4%BD%99%E5%BC%A6%E7%9B%B8%E4%BC%BC%E6%80%A7

Cosine similarity
Given two vectors of attributes, A and B, the cosine similarity, cos(θ),
is represented using a dot product and magnitude as...
余弦相似性通過測量兩個向量的夾角的余弦值來度量它們之間的相似性。0度角的余弦值是1，
余弦相似度通常用於正空間，因此給出的值為0到1之間。
範數(norm)，是具有“長度”概念的函數。二維度的向量的歐氏範數就是箭號的長度。

Python 破解驗證碼

python3驗證碼機器學習

技術分享

這兩篇文章在計算矢量大小的時候函數參數都寫成 concordance調和，而不用 coordinate坐標，為何？？？

歐氏距離和余弦相似度

numpy中提供了範數的計算工具：linalg.norm()

所以計算cosθ起來非常方便（假定A、B均為列向量）：

num = float(A.T * B) #若為行向量則 A * B.T
denom = linalg.norm(A) * linalg.norm(B)
cos = num / denom #余弦值

2.沒事畫個流程圖

流程圖 Graphviz - Graph Visualization Software

技術分享

3.完整代碼

#!/usr/bin/env python
# -*- coding: UTF-8 -*
import os
import time
import re
from urlparse import urljoin

import requests
ss = requests.Session()
ss.headers.update({‘user-agent‘:‘Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0‘})

from PIL import Image
# https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000/001431918785710e86a1a120ce04925bae155012c7fc71e000 

# 和StringIO類似，可以用一個bytes初始化BytesIO，然後，像讀文件一樣讀取：
from io import BytesIO
from string import ascii_letters, digits

import numpy as np 

# ip_port_type_tuple_list = []
  
class Mimvp():
    def __init__(self, num_width=None, feature_vectors=None, white_before_black=2, threshhold=100, max_nums=None, filepath=None, page=None):
        self.ip_port_type_tuple_list = []
        
        #fluent p189
        if feature_vectors is None:
            self.feature_vectors = []
        else:
            self.feature_vectors = list(feature_vectors)
             
        self.num_width = num_width
        self.white_before_black = white_before_black
        self.threshhold = threshhold
        self.max_nums = max_nums

        self.filepath = filepath
        
        if page is None:
            self.url = ‘http://proxy.%s.com/free.php?proxy=in_hp‘%‘mimvp‘
        else:
            self.url = ‘http://proxy.%s.com/free.php?proxy=in_hp&sort=&page=%s‘ %(‘mimvp‘, page)   

    def get_mimvp(self):

        # 預處理提取特征組需要取得 self.port_src_list
        if self.feature_vectors == []:
            self.extract_features()

        self.load_mimvp()
        self.get_port_list()
        self.merge_result()
        return self.ip_port_type_tuple_list
        
    def load_mimvp(self):
        resp = ss.get(self.url)
        self.ip_list = re.findall(r"class=‘tbl-proxy-ip‘.*?>(.*?)<", resp.text)
        self.port_src_list = re.findall(r"class=‘tbl-proxy-port‘.*?src=(.*?)\s*/>", resp.text)      #圖片鏈接
        self.type_list = re.findall(r"class=‘tbl-proxy-type‘.*?>(.*?)<", resp.text)
                 
    def get_port_list(self):
        self.port_list = []
        for src in self.port_src_list:
            port = self.get_port(src)
            self.port_list.append(port)
        
    def get_port(self, src):    
        img = self.load_image_from_src(src)
        split_imgs = self.split_image(img)
        
        port = ‘‘
        for split_img in split_imgs:
            vector = self.build_vector(split_img)    
            compare_results = []
            for t in self.feature_vectors:
                cos = self.cos_similarity(vector, t.values()[0])
                compare_results.append((cos, t.keys()[0]))
            # print sorted(compare_results, reverse=True)
            port += sorted(compare_results, reverse=True)[0][1]
        print port
        return port

    def load_image_from_src(self, src):
        src = urljoin(self.url, src)
        print src,
        resp = ss.get(src)

        fp = BytesIO(resp.content)
        img = Image.open(fp)
        return img
        
    def split_image(self, img):
        gray = img.convert(‘L‘)
        
        if self.num_width is None:
            img.show()
            print gray.getcolors()
            self.num_width = int(raw_input(‘num_width:‘))
            self.white_before_black = int(raw_input(‘white_before_black:‘))
            self.threshhold = int(raw_input(‘BLACK < (threshhold) < WHITE:‘))
            
        gray_array = np.array(gray)
        bilevel_array = np.where(gray_array<self.threshhold,1,0)  #標記黑點為1，方便後續掃描
        
        left_list = []
        # 從左到右按列求和
        vertical = bilevel_array.sum(0)
        # print vertical
        # 從左到右按列掃描，2白1黑確定為數字左邊緣
        for i,c in enumerate(vertical[:-self.white_before_black]):
            if self.white_before_black == 1:
                if vertical[i] == 0 and vertical[i+1] != 0:
                    left_list.append(i+1)
            else:                    
                if vertical[i] == 0 and vertical[i+1] == 0 and vertical[i+2] != 0:
                    left_list.append(i+2)
            if len(left_list) == self.max_nums:
                break
                
        # 分割可見圖片
        # bilevel = Image.fromarray(bilevel_array)    #0/1 手工提取特征 show顯示黑塊 還沒保存gif     
        bilevel = Image.fromarray(np.where(gray_array<self.threshhold,0,255))   
        # the left, upper, right, and lower pixel
        split_imgs = [bilevel.crop((each_left, 0, each_left+self.num_width, img.height)) for each_left in left_list]

        return split_imgs
        
    def build_vector(self, img):
        # img = Image.open(img)
        img_array = np.array(img) 
        # 先遍歷w，再遍歷h，總共w+h維度，不需要/255，標記黑點個數等多余處理
        return list(img_array.sum(0)) + list(img_array.sum(1))

    def cos_similarity(self, a, b):
        A = np.array(a)
        B = np.array(b)    
        dot_product = float(np.dot(A, B))   # A*(B.T) 達不到目的
        magnitude_product = np.linalg.norm(A) * np.linalg.norm(B)
        cos = dot_product / magnitude_product
        return cos
        
    def merge_result(self):
        for ip, port, _type in zip(self.ip_list, self.port_list, self.type_list):
            if ‘/‘ in _type:
                self.ip_port_type_tuple_list.append((ip, port, ‘both‘))        
            elif _type == ‘HTTPS‘:
                self.ip_port_type_tuple_list.append((ip, port, ‘HTTPS‘))
            else:
                self.ip_port_type_tuple_list.append((ip, port, ‘HTTP‘))
  
    def extract_features(self):
        if self.filepath is not None:
            img_list = self.load_images_from_filepath()
        else:
            self.load_mimvp()
            img_list = self.load_images_from_src_list()
        for img in img_list:
            split_imgs = self.split_image(img)
            for split_img in split_imgs:
                split_img.show()
                print split_img.getcolors()
                input = raw_input(‘input:‘)
                vector = self.build_vector(split_img)
                item = {input: vector}
                if item not in self.feature_vectors:
                    print item
                    self.feature_vectors.append(item)
            
        for i in sorted(self.feature_vectors):        
            print i,‘,‘   
    
    def load_images_from_filepath(self):
        img_list = []
        postfix = [‘jpg‘, ‘png‘, ‘gif‘, ‘bmp‘]
        for filename in [i for i in os.listdir(self.filepath) if i[-3:] in postfix]:
            file = os.path.join(self.filepath, filename)
            img_list.append(Image.open(file))
        return img_list
        
    def load_images_from_src_list(self):
        img_list = []
        for src in self.port_src_list:
            img = self.load_image_from_src(src)
            img_list.append(img)
        return img_list
    
   
    
if __name__ == ‘__main__‘:
   
    feature_vectors = [    
    {‘0‘: [4845, 5865, 5865, 5865, 5865, 4845, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
    {‘1‘: [5865, 5865, 3825, 6120, 6120, 6375, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1275, 1020, 1020, 1275, 1275, 1275, 1275, 1275, 1275, 255, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
    {‘2‘: [5100, 5610, 5610, 5610, 5610, 5355, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1275, 1020, 1275, 1275, 1275, 1275, 0, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
    {‘3‘: [5355, 5865, 5610, 5610, 5610, 4590, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1275, 765, 1275, 1275, 1020, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
    {‘4‘: [5610, 5865, 5865, 5865, 3825, 6120, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1275, 1020, 1020, 1020, 1020, 1020, 0, 1275, 1275, 1275, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
    {‘5‘: [4845, 5610, 5610, 5610, 5610, 5100, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 0, 1275, 1275, 1275, 255, 1275, 1275, 1275, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
    {‘6‘: [4590, 5610, 5610, 5610, 5610, 5355, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 765, 1275, 1275, 1275, 255, 1020, 1020, 1020, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
    {‘7‘: [6120, 6120, 6120, 5100, 5355, 5610, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 0, 1275, 1275, 1275, 1275, 1275, 1275, 1275, 1275, 1275, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
    {‘8‘: [4590, 5610, 5610, 5610, 5610, 4590, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1020, 510, 1020, 1020, 1020, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} ,
    {‘9‘: [5610, 5610, 5610, 5610, 5610, 4590, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1020, 255, 1275, 1275, 1275, 1275, 765, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} , 
    ]   
    
    # def __init__(self, feature_vectors=None, filepath=None, page=None):
    
    obj = Mimvp(num_width=6, feature_vectors=feature_vectors)
    # obj = Mimvp()    
    # obj = Mimvp(filepath=‘temp/‘)   
    
    ip_port_type_tuple_list = obj.get_mimvp()
    
    from pprint import pprint
    pprint(ip_port_type_tuple_list)

View Code

4.改進方向

記錄每個分割數字的x軸實際長度，這樣的話考慮到不對圖片的上下留白做處理，每個實際數字的h固定，而w不定，因此建立特征向量的時候改為先遍歷h，再遍歷w。

考慮到在比較余弦相似性的時候由於叉乘需要兩個向量具有相同的維度，這裏需要每次取最小維度再比較。

如此，在建立特征向量集的時候不需要提前指定每張分割數字為固定寬度。

python之驗證碼識別特征向量提取和余弦相似性比較

wow gif .get extra time ade upd orm log 0.目錄 1.參考2.沒事畫個流程圖3.完整代碼4.改進方向 1.參考 https://en.wikipedia.org/wiki/Cosine_similarity https://zh.wi

python之驗證碼識別特征向量提取和余弦相似性比較

0.目錄

1.參考

2.沒事畫個流程圖

3.完整代碼

4.改進方向

python之驗證碼識別特征向量提取和余弦相似性比較

python 豆瓣驗證碼識別總結

那些年，我爬過的北科(八)——反反爬蟲之驗證碼識別

(一)python爬蟲驗證碼識別（去除干擾線）

Python進行驗證碼識別

(二) python爬蟲驗證碼識別（去除干擾線）

驗證碼識別 Tesseract的簡單使用和總結

Python網路爬蟲之極驗滑動驗證碼識別

Python爬蟲之自動登入與驗證碼識別

Python 新手實戰之機器學習實現簡單驗證碼識別(一)：用PIL簡單繪製驗證碼

Python影象處理之圖片驗證碼識別

Python爬蟲入門教程 58-100 python爬蟲高級技術之驗證碼篇4-極驗證識別技術之一

linux環境下pytesseract的安裝和央行征信中心的登錄驗證碼識別實戰

驗證碼識別（最簡單之印刷體數字）

Python 振動分析叠代法計算高階特征值及特征向量

python tesseract-ocr 基礎驗證碼識別功能（Windows）

Python與矩陣論——特征值與特征向量

基於SVM的python簡單實現驗證碼識別

python 驗證碼識別示例（二）復雜驗證碼識別

python使用tesseract-ocr完成驗證碼識別

python之驗證碼識別 特征向量提取和余弦相似性比較

0.目錄

1.參考

2.沒事畫個流程圖

3.完整代碼

4.改進方向

相關推薦

python之驗證碼識別特征向量提取和余弦相似性比較