1. 程式人生 > >python之驗證碼識別 特征向量提取和余弦相似性比較

python之驗證碼識別 特征向量提取和余弦相似性比較

wow gif .get extra time ade upd orm log

0.目錄

1.參考
2.沒事畫個流程圖
3.完整代碼
4.改進方向

1.參考

https://en.wikipedia.org/wiki/Cosine_similarity

https://zh.wikipedia.org/wiki/%E4%BD%99%E5%BC%A6%E7%9B%B8%E4%BC%BC%E6%80%A7

Cosine similarity
Given two vectors of attributes, A and B, the cosine similarity, cos(θ),
is represented using a dot product and magnitude as...
余弦相似性通過測量兩個向量的夾角的余弦值來度量它們之間的相似性。0度角的余弦值是1,
余弦相似度通常用於正空間,因此給出的值為0到1之間。
範數(norm),是具有“長度”概念的函數。二維度的向量的歐氏範數就是箭號的長度。

Python 破解驗證碼

python3驗證碼機器學習

技術分享

這兩篇文章在計算矢量大小的時候函數參數都寫成 concordance調和, 而不用 coordinate坐標, 為何???

歐氏距離和余弦相似度

numpy中提供了範數的計算工具:linalg.norm()

所以計算cosθ起來非常方便(假定A、B均為列向量):

num = float(A.T * B) #若為行向量則 A * B.T
denom = linalg.norm(A) * linalg.norm(B)
cos = num / denom #余弦值

2.沒事畫個流程圖

流程圖 Graphviz - Graph Visualization Software

技術分享

3.完整代碼

技術分享
#!/usr/bin/env python
# -*- coding: UTF-8 -*
import os
import time
import re
from urlparse import urljoin

import requests
ss = requests.Session()
ss.headers.update({user-agent:Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0})

from PIL import Image
# https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000/001431918785710e86a1a120ce04925bae155012c7fc71e000
# 和StringIO類似,可以用一個bytes初始化BytesIO,然後,像讀文件一樣讀取: from io import BytesIO from string import ascii_letters, digits import numpy as np # ip_port_type_tuple_list = [] class Mimvp(): def __init__(self, num_width=None, feature_vectors=None, white_before_black=2, threshhold=100, max_nums=None, filepath=None, page=None): self.ip_port_type_tuple_list = [] #fluent p189 if feature_vectors is None: self.feature_vectors = [] else: self.feature_vectors = list(feature_vectors) self.num_width = num_width self.white_before_black = white_before_black self.threshhold = threshhold self.max_nums = max_nums self.filepath = filepath if page is None: self.url = http://proxy.%s.com/free.php?proxy=in_hp%mimvp else: self.url = http://proxy.%s.com/free.php?proxy=in_hp&sort=&page=%s %(mimvp, page) def get_mimvp(self): # 預處理提取特征組需要取得 self.port_src_list if self.feature_vectors == []: self.extract_features() self.load_mimvp() self.get_port_list() self.merge_result() return self.ip_port_type_tuple_list def load_mimvp(self): resp = ss.get(self.url) self.ip_list = re.findall(r"class=‘tbl-proxy-ip‘.*?>(.*?)<", resp.text) self.port_src_list = re.findall(r"class=‘tbl-proxy-port‘.*?src=(.*?)\s*/>", resp.text) #圖片鏈接 self.type_list = re.findall(r"class=‘tbl-proxy-type‘.*?>(.*?)<", resp.text) def get_port_list(self): self.port_list = [] for src in self.port_src_list: port = self.get_port(src) self.port_list.append(port) def get_port(self, src): img = self.load_image_from_src(src) split_imgs = self.split_image(img) port = ‘‘ for split_img in split_imgs: vector = self.build_vector(split_img) compare_results = [] for t in self.feature_vectors: cos = self.cos_similarity(vector, t.values()[0]) compare_results.append((cos, t.keys()[0])) # print sorted(compare_results, reverse=True) port += sorted(compare_results, reverse=True)[0][1] print port return port def load_image_from_src(self, src): src = urljoin(self.url, src) print src, resp = ss.get(src) fp = BytesIO(resp.content) img = Image.open(fp) return img def split_image(self, img): gray = img.convert(L) if self.num_width is None: img.show() print gray.getcolors() self.num_width = int(raw_input(num_width:)) self.white_before_black = int(raw_input(white_before_black:)) self.threshhold = int(raw_input(BLACK < (threshhold) < WHITE:)) gray_array = np.array(gray) bilevel_array = np.where(gray_array<self.threshhold,1,0) #標記黑點為1,方便後續掃描 left_list = [] # 從左到右按列求和 vertical = bilevel_array.sum(0) # print vertical # 從左到右按列掃描,2白1黑確定為數字左邊緣 for i,c in enumerate(vertical[:-self.white_before_black]): if self.white_before_black == 1: if vertical[i] == 0 and vertical[i+1] != 0: left_list.append(i+1) else: if vertical[i] == 0 and vertical[i+1] == 0 and vertical[i+2] != 0: left_list.append(i+2) if len(left_list) == self.max_nums: break # 分割可見圖片 # bilevel = Image.fromarray(bilevel_array) #0/1 手工提取特征 show顯示黑塊 還沒保存gif bilevel = Image.fromarray(np.where(gray_array<self.threshhold,0,255)) # the left, upper, right, and lower pixel split_imgs = [bilevel.crop((each_left, 0, each_left+self.num_width, img.height)) for each_left in left_list] return split_imgs def build_vector(self, img): # img = Image.open(img) img_array = np.array(img) # 先遍歷w,再遍歷h,總共w+h維度,不需要/255,標記黑點個數等多余處理 return list(img_array.sum(0)) + list(img_array.sum(1)) def cos_similarity(self, a, b): A = np.array(a) B = np.array(b) dot_product = float(np.dot(A, B)) # A*(B.T) 達不到目的 magnitude_product = np.linalg.norm(A) * np.linalg.norm(B) cos = dot_product / magnitude_product return cos def merge_result(self): for ip, port, _type in zip(self.ip_list, self.port_list, self.type_list): if / in _type: self.ip_port_type_tuple_list.append((ip, port, both)) elif _type == HTTPS: self.ip_port_type_tuple_list.append((ip, port, HTTPS)) else: self.ip_port_type_tuple_list.append((ip, port, HTTP)) def extract_features(self): if self.filepath is not None: img_list = self.load_images_from_filepath() else: self.load_mimvp() img_list = self.load_images_from_src_list() for img in img_list: split_imgs = self.split_image(img) for split_img in split_imgs: split_img.show() print split_img.getcolors() input = raw_input(input:) vector = self.build_vector(split_img) item = {input: vector} if item not in self.feature_vectors: print item self.feature_vectors.append(item) for i in sorted(self.feature_vectors): print i,, def load_images_from_filepath(self): img_list = [] postfix = [jpg, png, gif, bmp] for filename in [i for i in os.listdir(self.filepath) if i[-3:] in postfix]: file = os.path.join(self.filepath, filename) img_list.append(Image.open(file)) return img_list def load_images_from_src_list(self): img_list = [] for src in self.port_src_list: img = self.load_image_from_src(src) img_list.append(img) return img_list if __name__ == __main__: feature_vectors = [ {0: [4845, 5865, 5865, 5865, 5865, 4845, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} , {1: [5865, 5865, 3825, 6120, 6120, 6375, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1275, 1020, 1020, 1275, 1275, 1275, 1275, 1275, 1275, 255, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} , {2: [5100, 5610, 5610, 5610, 5610, 5355, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1275, 1020, 1275, 1275, 1275, 1275, 0, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} , {3: [5355, 5865, 5610, 5610, 5610, 4590, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1275, 765, 1275, 1275, 1020, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} , {4: [5610, 5865, 5865, 5865, 3825, 6120, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1275, 1020, 1020, 1020, 1020, 1020, 0, 1275, 1275, 1275, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} , {5: [4845, 5610, 5610, 5610, 5610, 5100, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 0, 1275, 1275, 1275, 255, 1275, 1275, 1275, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} , {6: [4590, 5610, 5610, 5610, 5610, 5355, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 765, 1275, 1275, 1275, 255, 1020, 1020, 1020, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} , {7: [6120, 6120, 6120, 5100, 5355, 5610, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 0, 1275, 1275, 1275, 1275, 1275, 1275, 1275, 1275, 1275, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} , {8: [4590, 5610, 5610, 5610, 5610, 4590, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1020, 510, 1020, 1020, 1020, 1020, 510, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} , {9: [5610, 5610, 5610, 5610, 5610, 4590, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 1530, 510, 1020, 1020, 1020, 255, 1275, 1275, 1275, 1275, 765, 1530, 1530, 1530, 1530, 1530, 1530, 1530]} , ] # def __init__(self, feature_vectors=None, filepath=None, page=None): obj = Mimvp(num_width=6, feature_vectors=feature_vectors) # obj = Mimvp() # obj = Mimvp(filepath=‘temp/‘) ip_port_type_tuple_list = obj.get_mimvp() from pprint import pprint pprint(ip_port_type_tuple_list)
View Code

4.改進方向

記錄每個分割數字的x軸實際長度,這樣的話考慮到不對圖片的上下留白做處理,每個實際數字的h固定,而w不定,因此建立特征向量的時候改為先遍歷h,再遍歷w。

考慮到在比較余弦相似性的時候由於叉乘需要兩個向量具有相同的維度,這裏需要每次取最小維度再比較。

如此,在建立特征向量集的時候不需要提前指定每張分割數字為固定寬度。

python之驗證碼識別 特征向量提取和余弦相似性比較