計算機視覺(四):使用K-NN分類器對CIFAR-10進行分類
阿新 • • 發佈:2019-01-13
1 - 引言
之前我們學習了KNN分類器的原理,現在讓我們將KNN分類器應用在計算機視覺中,學習如何使用這個演算法來進行圖片分類。
2 - 準備工作
- 建立專案結構如圖所示
- 在datasets資料夾中下載資料集Cifar-10
- k_nearest_neighbor.py程式碼如下:
import numpy as np
from numpy import *  # noqa: F401,F403 - wildcard import kept from the original module


class KNearestNeighbor(object):
    """A k-nearest-neighbour classifier that uses the L2 (Euclidean) distance."""

    def __init__(self):
        pass

    def train(self, X, y):
        """Store the training data; k-NN has no other training step.

        Inputs:
        - X: numpy array of shape (num_train, D) holding the training samples.
        - y: numpy array of shape (num_train,) holding the training labels.
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1, num_loops=0):
        """Predict labels for X with the chosen distance implementation.

        Inputs:
        - X: numpy array of shape (num_test, D).
        - k: number of nearest neighbours that vote on each label.
        - num_loops: which distance routine to use (0, 1 or 2 Python loops).

        Returns:
        - y: numpy array of shape (num_test,) with the predicted labels.

        Raises:
        - ValueError: if num_loops is not 0, 1 or 2.
        """
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distances_two_loops(X)
        else:
            raise ValueError('Invalid value %d for num_loops' % num_loops)
        return self.predict_labels(dists, k=k)

    def compute_distances_two_loops(self, X):
        """Compute all test/train distances with two nested Python loops.

        Inputs:
        - X: numpy array of shape (num_test, D).

        Returns:
        - dists: numpy array of shape (num_test, num_train) where dists[i, j]
          is the L2 distance between test sample i and training sample j.
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                dists[i, j] = np.sqrt(np.sum(np.square(self.X_train[j] - X[i])))
        return dists

    def compute_distances_one_loop(self, X):
        """Compute all test/train distances with a single loop over test rows.

        Broadcasting subtracts one test sample from every training sample at
        once. Inputs and outputs are the same as compute_distances_two_loops.
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            dists[i, :] = np.sqrt(np.sum(np.square(self.X_train - X[i]), axis=1))
        return dists

    def compute_distances_no_loops(self, X):
        """Compute all test/train distances without any Python loop.

        Uses the expansion ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b, so the
        only large product is the (num_test, num_train) cross term. Inputs and
        outputs are the same as compute_distances_two_loops.
        """
        cross = np.dot(X, self.X_train.T)
        # Row-wise squared norms; no need to build full Gram matrices just to
        # read their diagonals.
        test_sq = np.sum(np.square(X), axis=1)[:, np.newaxis]
        train_sq = np.sum(np.square(self.X_train), axis=1)[np.newaxis, :]
        sq = test_sq + train_sq - 2 * cross
        # Rounding can leave tiny negative values where the true distance is
        # zero; clamp them so sqrt never yields NaN.
        sq = np.maximum(sq, 0)
        return np.sqrt(sq)

    def predict_labels(self, dists, k=1):
        """Predict a label for every test sample from a distance matrix.

        Inputs:
        - dists: numpy array of shape (num_test, num_train).
        - k: number of smallest distances that vote on each label.

        Returns:
        - y: numpy array of shape (num_test,) with the predicted labels.
          Ties are resolved in favour of the smallest label, because
          np.argmax returns the first maximum of the vote counts.
        """
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            nearest = np.argsort(dists[i, :])[:k]
            closest_y = self.y_train[nearest]
            # np.bincount needs non-negative integer labels.
            y_pred[i] = np.argmax(np.bincount(closest_y))
        return y_pred
- data_utils.py 程式碼如下:
from __future__ import print_function

import os
import platform

import numpy as np

try:
    from six.moves import cPickle as pickle
except ImportError:
    # six is only a Python 2/3 shim; the standard pickle module is enough
    # when it is not installed.
    import pickle

try:
    from scipy.ndimage import imread
except ImportError:
    # scipy.ndimage.imread was removed from recent SciPy releases. Keep the
    # module importable so the CIFAR-10 loaders still work; only
    # load_tiny_imagenet needs an image reader.
    imread = None


def load_pickle(f):
    """Unpickle the open binary file object f on Python 2 or Python 3.

    On Python 3 the data is decoded with encoding='latin1'.

    Raises:
    - ValueError: if the interpreter's major version is neither 2 nor 3.
    """
    # NOTE(review): pickle can execute arbitrary code; only use this on
    # trusted files.
    version = platform.python_version_tuple()
    if version[0] == '2':
        return pickle.load(f)
    elif version[0] == '3':
        return pickle.load(f, encoding='latin1')
    raise ValueError("invalid python version: {}".format(version))


def load_CIFAR_batch(filename):
    """Load a single CIFAR batch file.

    Returns:
    - X: float array of shape (N, 32, 32, 3) (channels last).
    - Y: array of shape (N,) built from the batch's 'labels' entry.
    """
    with open(filename, 'rb') as f:
        datadict = load_pickle(f)
    X = datadict['data']
    Y = datadict['labels']
    # -1 lets the batch size follow the data instead of assuming 10000 rows.
    X = X.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
    Y = np.array(Y)
    return X, Y


def load_CIFAR10(ROOT):
    """Load all of CIFAR-10 from the directory ROOT.

    Reads data_batch_1 .. data_batch_5 as the training set and test_batch as
    the test set.

    Returns:
    - Xtr, Ytr, Xte, Yte: training data/labels and test data/labels.
    """
    xs = []
    ys = []
    for b in range(1, 6):
        X, Y = load_CIFAR_batch(os.path.join(ROOT, 'data_batch_%d' % (b,)))
        xs.append(X)
        ys.append(Y)
    Xtr = np.concatenate(xs)
    Ytr = np.concatenate(ys)
    Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
    return Xtr, Ytr, Xte, Yte


def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000,
                     subtract_mean=True,
                     cifar10_dir='cs231n/datasets/cifar-10-batches-py'):
    """Load CIFAR-10 from disk and preprocess it for classifiers.

    Inputs:
    - num_training: number of leading training images kept for training.
    - num_validation: number of images right after those kept for validation.
    - num_test: number of leading test images kept.
    - subtract_mean: whether to subtract the mean training image from all
      three splits.
    - cifar10_dir: directory holding the CIFAR-10 batch files.

    Returns:
    A dictionary with keys X_train, y_train, X_val, y_val, X_test, y_test;
    the image arrays are transposed to channels-first (N, 3, 32, 32).
    """
    # Load the raw CIFAR-10 data
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # Subsample the data (validation is taken before X_train is truncated)
    mask = list(range(num_training, num_training + num_validation))
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = list(range(num_training))
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = list(range(num_test))
    X_test = X_test[mask]
    y_test = y_test[mask]

    # Normalize the data: subtract the mean image
    if subtract_mean:
        mean_image = np.mean(X_train, axis=0)
        X_train -= mean_image
        X_val -= mean_image
        X_test -= mean_image

    # Transpose so that channels come first
    X_train = X_train.transpose(0, 3, 1, 2).copy()
    X_val = X_val.transpose(0, 3, 1, 2).copy()
    X_test = X_test.transpose(0, 3, 1, 2).copy()

    # Package data into a dictionary
    return {
        'X_train': X_train, 'y_train': y_train,
        'X_val': X_val, 'y_val': y_val,
        'X_test': X_test, 'y_test': y_test,
    }


def _require_imread():
    """Return the image reader, or raise if none could be imported."""
    if imread is None:
        raise ImportError(
            'scipy.ndimage.imread is not available in this SciPy version; '
            'load_tiny_imagenet needs an image reader')
    return imread


def _load_image_chw(img_file):
    """Read one image file and return it channels-first.

    A 2-D (grayscale) image is reshaped to (64, 64, 1) first, so it comes
    back as (1, 64, 64) and broadcasts over the colour channels on assignment.
    """
    img = _require_imread()(img_file)
    if img.ndim == 2:
        img.shape = (64, 64, 1)
    return img.transpose(2, 0, 1)


def load_tiny_imagenet(path, dtype=np.float32, subtract_mean=True):
    """Load TinyImageNet.

    Each of TinyImageNet-100-A, TinyImageNet-100-B, and TinyImageNet-200 have
    the same directory structure, so this can be used to load any of them.

    Inputs:
    - path: String giving path to the directory to load.
    - dtype: numpy datatype used to load the data.
    - subtract_mean: Whether to subtract the mean training image.

    Returns: A dictionary with the following entries:
    - class_names: A list where class_names[i] is a list of strings giving
      the WordNet names for class i in the loaded dataset.
    - X_train: (N_tr, 3, 64, 64) array of training images
    - y_train: (N_tr,) array of training labels
    - X_val: (N_val, 3, 64, 64) array of validation images
    - y_val: (N_val,) array of validation labels
    - X_test: (N_test, 3, 64, 64) array of testing images.
    - y_test: (N_test,) array of test labels; if test labels are not
      available (such as in student code) then y_test will be None.
    - mean_image: (3, 64, 64) array giving mean training image

    Raises:
    - ImportError: if no image reader could be imported.
    """
    # Fail before touching the disk if images cannot be decoded.
    _require_imread()

    # First load wnids
    with open(os.path.join(path, 'wnids.txt'), 'r') as f:
        wnids = [x.strip() for x in f]

    # Map wnids to integer labels
    wnid_to_label = {wnid: i for i, wnid in enumerate(wnids)}

    # Use words.txt to get names for each class
    with open(os.path.join(path, 'words.txt'), 'r') as f:
        wnid_to_words = dict(line.split('\t', 1) for line in f)
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    wnid_to_words = {
        wnid: [w.strip() for w in words.split(',')]
        for wnid, words in wnid_to_words.items()
    }
    class_names = [wnid_to_words[wnid] for wnid in wnids]

    # Next load training data.
    X_train = []
    y_train = []
    for i, wnid in enumerate(wnids):
        if (i + 1) % 20 == 0:
            print('loading training data for synset %d / %d'
                  % (i + 1, len(wnids)))
        # To figure out the filenames we need to open the boxes file
        boxes_file = os.path.join(path, 'train', wnid, '%s_boxes.txt' % wnid)
        with open(boxes_file, 'r') as f:
            filenames = [x.split('\t')[0] for x in f]
        num_images = len(filenames)

        X_train_block = np.zeros((num_images, 3, 64, 64), dtype=dtype)
        y_train_block = wnid_to_label[wnid] * np.ones(num_images,
                                                      dtype=np.int64)
        for j, img_file in enumerate(filenames):
            img_file = os.path.join(path, 'train', wnid, 'images', img_file)
            X_train_block[j] = _load_image_chw(img_file)
        X_train.append(X_train_block)
        y_train.append(y_train_block)

    # We need to concatenate all training data
    X_train = np.concatenate(X_train, axis=0)
    y_train = np.concatenate(y_train, axis=0)

    # Next load validation data
    img_files = []
    val_wnids = []
    with open(os.path.join(path, 'val', 'val_annotations.txt'), 'r') as f:
        for line in f:
            img_file, wnid = line.split('\t')[:2]
            img_files.append(img_file)
            val_wnids.append(wnid.strip())
    num_val = len(img_files)
    y_val = np.array([wnid_to_label[wnid] for wnid in val_wnids])
    X_val = np.zeros((num_val, 3, 64, 64), dtype=dtype)
    for i, img_file in enumerate(img_files):
        X_val[i] = _load_image_chw(
            os.path.join(path, 'val', 'images', img_file))

    # Next load test images
    # Students won't have test labels, so we need to iterate over files in the
    # images directory.
    img_files = os.listdir(os.path.join(path, 'test', 'images'))
    X_test = np.zeros((len(img_files), 3, 64, 64), dtype=dtype)
    for i, img_file in enumerate(img_files):
        X_test[i] = _load_image_chw(
            os.path.join(path, 'test', 'images', img_file))

    y_test = None
    y_test_file = os.path.join(path, 'test', 'test_annotations.txt')
    if os.path.isfile(y_test_file):
        img_file_to_wnid = {}
        with open(y_test_file, 'r') as f:
            for line in f:
                fields = line.split('\t')
                img_file_to_wnid[fields[0]] = fields[1].strip()
        y_test = np.array([wnid_to_label[img_file_to_wnid[img_file]]
                           for img_file in img_files])

    mean_image = X_train.mean(axis=0)
    if subtract_mean:
        X_train -= mean_image[None]
        X_val -= mean_image[None]
        X_test -= mean_image[None]

    return {
        'class_names': class_names,
        'X_train': X_train,
        'y_train': y_train,
        'X_val': X_val,
        'y_val': y_val,
        'X_test': X_test,
        'y_test': y_test,
        'mean_image': mean_image,
    }


def load_models(models_dir):
    """Load saved models from disk.

    This will attempt to unpickle all files in a directory; files that raise
    pickle.UnpicklingError (such as README.txt) will be skipped.

    Inputs:
    - models_dir: String giving the path to a directory containing model
      files. Each model file is a pickled dictionary with a 'model' field.

    Returns:
    A dictionary mapping model file names to models.
    """
    # NOTE(review): every file in models_dir is unpickled; only point this at
    # directories whose contents are trusted.
    models = {}
    for model_file in os.listdir(models_dir):
        with open(os.path.join(models_dir, model_file), 'rb') as f:
            try:
                models[model_file] = load_pickle(f)['model']
            except pickle.UnpicklingError:
                continue
    return models
- 建立K-NN.py進行實驗
3 - 具體步驟
首先為了瞭解資料集的構造,我們將資料集的維數輸出。
建立K-NN.py檔案並輸入程式碼:
import random

import matplotlib.pyplot as plt
import numpy as np

from cs231n.data_utils import load_CIFAR10

# Load the CIFAR-10 dataset and print the shape of every split.
cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

for caption, array in (
    ('Training data shape', X_train),
    ('Training labels shape', y_train),
    ('Test data shape', X_test),
    ('Test labels shape', y_test),
):
    print(caption, array.shape)
顯示如下,Cifar-10資料集的訓練集含有5萬張32x32畫素的彩色圖片,測試集含有1萬張圖片,每張影象都帶有標籤,將它們分成了10類
Training data shape (50000, 32, 32, 3)
Training labels shape (50000,)
Test data shape (10000, 32, 32, 3)
Test labels shape (10000,)
然後,我們取出一些資料來看看資料集裡面的圖片到底是什麼樣子
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
num_classes = len(classes)
samples_per_class = 7  # number of images drawn for each class

# Draw a grid with one column per class and one row per sampled image.
for label, class_name in enumerate(classes):
    # Positions of all training images that carry this label.
    candidates = np.flatnonzero(y_train == label)
    # Pick samples_per_class of them at random, without repetition.
    chosen = np.random.choice(candidates, samples_per_class, replace=False)
    for row, image_index in enumerate(chosen):
        # Subplot cells are numbered row by row, starting at 1.
        cell = row * num_classes + label + 1
        plt.subplot(samples_per_class, num_classes, cell)
        plt.imshow(X_train[image_index].astype('uint8'))
        plt.axis('off')
        if row == 0:
            # Only the top image of each column gets the class name.
            plt.title(class_name)
plt.show()
因為全部的資料集圖片數量太多了,為了高效的執行我們的程式碼,我們從中選出一個子集來進行後面的實驗。
# Work on a subset so the experiments run quickly: the first num_training
# training images and the first num_test test images.
num_training = 5000
num_test = 500

mask = list(range(num_training))
X_train, y_train = X_train[mask], y_train[mask]

mask = list(range(num_test))
X_test, y_test = X_test[mask], y_test[mask]

# Flatten every image into a single row, giving 2-D arrays.
X_train = X_train.reshape((X_train.shape[0], -1))
X_test = X_test.reshape((X_test.shape[0], -1))
print(X_train.shape, X_test.shape)
使用訓練集5000張圖片,測試集500張圖片,3072是影象的維度乘積32x32x3
(5000, 3072) (500, 3072)
建立KNN分類器物件,並測試一下使用雙層迴圈計算歐氏距離
from cs231n.classifiers.k_nearest_neighbor import KNearestNeighbor

# Build the classifier; "training" only stores the data.
classifier = KNearestNeighbor()
classifier.train(X=X_train, y=y_train)

# Distance of every test image to every training image, computed with the
# two-loop implementation.
dists = classifier.compute_distances_two_loops(X=X_test)

# Show the distance matrix as an image.
plt.imshow(dists, interpolation='none')
plt.show()
可以看到有一些行或者列明顯顏色較淺(其中深色表示距離小,而淺色表示距離大)
我們將K設定為1(也就是最鄰近法)測試一下準確率
# Classify with k = 1 (plain nearest neighbour) and report the accuracy.
y_test_pred = classifier.predict_labels(dists, k=1)
num_correct = np.sum(y_test == y_test_pred)
accuracy = float(num_correct) / num_test
report = 'Got %d / %d correct => accuracy: %f'
print(report % (num_correct, num_test, accuracy))
獲得準確率只有27%
Got 137 / 500 correct => accuracy: 0.274000
我們使用交叉驗證來尋找最佳的超引數K,這裡我們使用了K折交叉驗證(K-fold cross validation)
k 折交叉驗證通過對 k 個不同分組訓練的結果進行平均來減少方差,因此模型的效能對資料的劃分就不那麼敏感。
- 第一步,不重複抽樣將原始資料隨機分為 k 份。
- 第二步,每一次挑選其中 1 份作為測試集,剩餘 k-1 份作為訓練集用於模型訓練。
- 第三步,重複第二步 k 次,這樣每個子集都有一次機會作為測試集,其餘 k-1 次則作為訓練集的一部分。在每個訓練集上訓練後得到一個模型,用這個模型在相應的測試集上測試,計算並儲存模型的評估指標。
- 第四步,計算 k 組測試結果的平均值作為模型精度的估計,並作為當前 k 折交叉驗證下模型的效能指標。
"""
使用交叉驗證選出最優的超引數K
將訓練資料切分,儲存在X_train_folds和y_train_folds中
"""
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

# Split the training data into num_folds parts.
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)

# k_to_accuracies[k] collects one validation accuracy per fold, in fold order.
k_to_accuracies = {k: [] for k in k_choices}

# The distance matrix depends only on the fold, not on k, so the fold loop is
# the outer one and each matrix is computed once and reused for every k.
for fold in range(num_folds):
    # Train on every fold except the current one; validate on the current one.
    X_traini = np.vstack(X_train_folds[:fold] + X_train_folds[fold + 1:])
    y_traini = np.hstack(y_train_folds[:fold] + y_train_folds[fold + 1:])
    X_vali = np.array(X_train_folds[fold])
    y_vali = np.array(y_train_folds[fold])
    num_val = len(y_vali)

    classifier = KNearestNeighbor()
    classifier.train(X_traini, y_traini)
    dists = classifier.compute_distances_no_loops(X_vali)

    for k in k_choices:
        y_val_pred = classifier.predict_labels(dists, k=k)
        num_correct = np.sum(y_val_pred == y_vali)
        accuracy = float(num_correct) / num_val
        k_to_accuracies[k].append(accuracy)

# Print out the computed accuracies
sorted_ks = sorted(k_to_accuracies)
for k in sorted_ks:
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))

# plot the raw observations
for k in k_choices:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)

# plot the trend line with error bars that correspond to standard deviation;
# x values and statistics both follow sorted_ks so they always line up.
accuracies_mean = np.array([np.mean(k_to_accuracies[k]) for k in sorted_ks])
accuracies_std = np.array([np.std(k_to_accuracies[k]) for k in sorted_ks])
plt.errorbar(sorted_ks, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()
可以看到,在K=10時,準確率最高,但是也只有百分之20幾,所以KNN演算法不太適合用於影象識別,但是這是一個很好的例子來幫助我們理解如何使用演算法來對影象進行分類。