利用python對mnist資料集中的0,1手寫字型進行二分類
1. 下載資料集:點選連結(http://yann.lecun.com/exdb/mnist/)開啟頁面(如下圖所示),下載對應的MNIST手寫數字識別資料集,包括訓練集影象、訓練集標籤、測試集影象與測試集標籤四個部分,並保存於指定位置。
2. 分析資料集並進行預處理。由於所提供的資料集為 idx3-ubyte / idx1-ubyte 二進位格式,不方便直接進行訓練,因此需要先將其解析為圖片的畫素矩陣,再直接讀取圖片畫素作為訓練特徵。資料集預處理的程式碼如下(程式檔名:analysisdataset.py):
import numpy as np
import struct
import matplotlib.pyplot as plt
# Training-set image file
train_images_idx3_ubyte_file = 'user/mnist/train-images.idx3-ubyte'
# Training-set label file
train_labels_idx1_ubyte_file = 'user/mnist/train-labels.idx1-ubyte'
# Test-set image file (the directory name previously contained a stray
# space, 'user /mnist/...', unlike the other three paths)
test_images_idx3_ubyte_file = 'user/mnist/t10k-images.idx3-ubyte'
# Test-set label file
test_labels_idx1_ubyte_file = 'user/mnist/t10k-labels.idx1-ubyte'
def decode_idx3_ubyte(idx3_ubyte_file):
    """
    Parse an idx3-ubyte image file into a stack of pixel matrices.

    The file starts with a big-endian header of four 32-bit integers
    (magic number, image count, rows per image, columns per image) followed
    by one unsigned byte per pixel.

    :param idx3_ubyte_file: path of the idx3 file
    :return: float64 np.array of shape (num_images, num_rows, num_cols)
    """
    # Read the whole file; the context manager guarantees the handle is closed.
    with open(idx3_ubyte_file, 'rb') as idx_file:
        bin_data = idx_file.read()

    # Header: magic number, number of images, image height, image width.
    fmt_header = '>iiii'
    magic_number, num_images, num_rows, num_cols = struct.unpack_from(fmt_header, bin_data, 0)
    print('魔數:%d, 圖片數量: %d張, 圖片大小: %d*%d' % (magic_number, num_images, num_rows, num_cols))

    # Decode every pixel in one pass instead of unpacking image by image.
    pixels = np.frombuffer(
        bin_data,
        dtype=np.uint8,
        count=num_images * num_rows * num_cols,
        offset=struct.calcsize(fmt_header),
    )

    # Keep the progress messages the per-image loop used to print.
    for parsed in range(10000, num_images + 1, 10000):
        print('已解析 %d' % parsed + '張')

    # frombuffer yields a read-only uint8 view of the file bytes; astype makes
    # the writable float64 copy that callers received before.
    return pixels.reshape((num_images, num_rows, num_cols)).astype(np.float64)
def decode_idx1_ubyte(idx1_ubyte_file):
    """
    Parse an idx1-ubyte label file into a vector of labels.

    The file starts with a big-endian header of two 32-bit integers
    (magic number, label count) followed by one unsigned byte per label.

    :param idx1_ubyte_file: path of the idx1 file
    :return: float64 np.array of shape (num_labels,)
    """
    # Read the whole file; the context manager guarantees the handle is closed.
    with open(idx1_ubyte_file, 'rb') as idx_file:
        bin_data = idx_file.read()

    # Header: magic number and number of labels.
    fmt_header = '>ii'
    magic_number, num_images = struct.unpack_from(fmt_header, bin_data, 0)
    print('魔數:%d, 圖片數量: %d張' % (magic_number, num_images))

    # Decode all labels in one pass instead of unpacking byte by byte.
    raw_labels = np.frombuffer(
        bin_data,
        dtype=np.uint8,
        count=num_images,
        offset=struct.calcsize(fmt_header),
    )

    # Keep the progress messages the per-label loop used to print.
    for parsed in range(10000, num_images + 1, 10000):
        print('已解析 %d' % parsed + '張')

    # astype makes a writable float64 copy, matching the previous return type.
    return raw_labels.astype(np.float64)
def load_train_images(idx_ubyte_file=train_images_idx3_ubyte_file):
    """
    Load the training-set images (train-images-idx3-ubyte).

    :param idx_ubyte_file: path of the idx3 image file; defaults to the
        module-level training image path
    :return: the array produced by decode_idx3_ubyte (n*row*col, where n is
        the number of images)
    """
    images = decode_idx3_ubyte(idx_ubyte_file)
    return images
def load_train_labels(idx_ubyte_file=train_labels_idx1_ubyte_file):
    """
    Load the training-set labels (train-labels-idx1-ubyte).

    :param idx_ubyte_file: path of the idx1 label file; defaults to the
        module-level training label path
    :return: the label array produced by decode_idx1_ubyte (one entry per
        image)
    """
    labels = decode_idx1_ubyte(idx_ubyte_file)
    return labels
def load_test_images(idx_ubyte_file=test_images_idx3_ubyte_file):
    """
    Load the test-set images (t10k-images-idx3-ubyte).

    :param idx_ubyte_file: path of the idx3 image file; defaults to the
        module-level test image path
    :return: the array produced by decode_idx3_ubyte (n*row*col, where n is
        the number of images)
    """
    images = decode_idx3_ubyte(idx_ubyte_file)
    return images
def load_test_labels(idx_ubyte_file=test_labels_idx1_ubyte_file):
    """
    Load the test-set labels (t10k-labels-idx1-ubyte).

    :param idx_ubyte_file: path of the idx1 label file; defaults to the
        module-level test label path
    :return: the label array produced by decode_idx1_ubyte (one entry per
        image)
    """
    labels = decode_idx1_ubyte(idx_ubyte_file)
    return labels
3. 第二步僅僅將資料集解析為圖片並返回每張圖片的畫素矩陣;實驗需要將每個畫素矩陣展平為一個一維特徵向量,因此需要單獨再進行特徵提取,其程式碼如下(程式檔案為:extraction_feature.py):
import analysisdataset
import numpy as np
def load_features():
    """
    Load MNIST through ``analysisdataset`` and flatten every image.

    Each pixel matrix is reshaped into one row of a feature matrix. The row
    length is taken from the training images and reused for the testing
    images, so both sets must share the same image size.

    :return: tuple ``(training_features, training_labels, testing_features,
        testing_labels)``; the feature arrays are float64 with one row per
        image, and the label arrays are returned as loaded
    """
    # extract the training and testing datasets and labels
    training_images = analysisdataset.load_train_images()
    training_labels = analysisdataset.load_train_labels()
    testing_images = analysisdataset.load_test_images()
    testing_labels = analysisdataset.load_test_labels()
    print(training_labels.shape)

    num_train = training_images.shape[0]
    num_test = testing_images.shape[0]
    # Pixels per image = product of all per-image axes. The former
    # ``shape[0] ** 2`` was only correct for square images.
    dimension = int(np.prod(training_images.shape[1:]))

    # Flatten all images at once; astype returns an independent float64 copy,
    # so the feature matrices never alias the image arrays.
    training_features = training_images.reshape(num_train, dimension).astype(np.float64)
    testing_features = testing_images.reshape(num_test, dimension).astype(np.float64)
    return training_features, training_labels, testing_features, testing_labels
4. 由於任務為0,1二分類任務,因此還需要從提取好的特徵中篩選出標籤為0和1的樣本,並將畫素值除以255歸一化,用於進一步的邏輯迴歸的優化問題,處理的程式碼如下(程式檔案logistregression.py):
import extraction_feature as ef
import numpy as np
# pre-processing the dataset
def extract_binary_features():
    """
    Build the two-class (digits 0 and 1) subset of MNIST.

    Samples whose label is ``<= 1`` are kept, their pixel values are divided
    by 255, and everything is transposed so that one column is one sample.

    :return: tuple ``(training_features, training_labels, testing_features,
        testing_labels)`` where the features have shape (dimension, n) and
        the labels have shape (n, 1)
    """
    train_x, train_y, test_x, test_y = ef.load_features()

    def _keep_zero_one(features, labels):
        # keep only the rows labelled 0 or 1, then scale and shape them
        mask = labels <= 1
        picked_features = features[mask, :]
        picked_labels = labels[mask]
        count = picked_features.shape[0]
        return picked_features / 255, picked_labels.reshape([1, count])

    bi_train_x, bi_train_y = _keep_zero_one(train_x, train_y)
    bi_test_x, bi_test_y = _keep_zero_one(test_x, test_y)
    return bi_train_x.T, bi_train_y.T, bi_test_x.T, bi_test_y.T
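5. 提取完特徵,需要進行訓練邏輯迴歸模型,這裡對邏輯迴歸模型的優化利用梯度下降演算法。其中,模型的優化目標為:
$$y = \frac{1}{1 + e^{-(w^{T}x + b)}}$$
其中,$x$ 為樣本的特徵向量,$y \in \{0, 1\}$ 為樣本標籤,而 $z = w^{T}x + b$,w和b分別為特徵向量的權重與偏置。而 $p(y=1 \mid x) = \dfrac{e^{w^{T}x + b}}{1 + e^{w^{T}x + b}}$,$p(y=0 \mid x) = \dfrac{1}{1 + e^{w^{T}x + b}}$。對以上模型利用“極大似然法”來估計w和b,對率迴歸模型最大化“對數似然”:
$$\ell(w, b) = \sum_{i=1}^{m} \ln p(y_i \mid x_i; w, b)$$
令 $\beta = (w; b)$、$\hat{x} = (x; 1)$,整理得到最終的目標函式(對其最小化)為:
$$\ell(\beta) = \sum_{i=1}^{m} \left( -y_i \beta^{T} \hat{x}_i + \ln\left(1 + e^{\beta^{T} \hat{x}_i}\right) \right)$$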
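利用梯度下降的方式求解:記 $\beta = (w; b)$、$\hat{x} = (x; 1)$、$p_1(\hat{x}; \beta) = \dfrac{e^{\beta^{T}\hat{x}}}{1 + e^{\beta^{T}\hat{x}}}$,則目標函式的梯度為 $\nabla \ell(\beta) = -\sum_{i=1}^{m} \hat{x}_i \left( y_i - p_1(\hat{x}_i; \beta) \right)$,更新規則為 $\beta \leftarrow \beta - \eta \nabla \ell(\beta)$(程式碼中步長 $\eta = 0.001$)。訓練並對模型進行測試的程式碼如下(程式檔案為:logistregression.py):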
def cond_pro(w_b, x_hat):
    """
    Calculate the conditional probability that y equals 1 for one sample.

    The value is ``exp(z) / (1 + exp(z))`` with ``z = <w_b, x_hat>``. It is
    evaluated in a form that never exponentiates a positive number, so large
    scores saturate to 0.0 or 1.0 instead of raising OverflowError.

    :param w_b: the combined weight and bias, dim entries
    :param x_hat: one expanded sample (features plus a trailing 1), dim entries
    :return: the probability of y equals 1, as a Python float
    """
    dim = w_b.shape[0]
    z = float(np.inner(w_b.reshape([dim, ]), x_hat.reshape([dim, ])))
    if z >= 0:
        # exp(-z) <= 1 here, so it cannot overflow
        return float(1.0 / (1.0 + np.exp(-z)))
    # exp(z) < 1 here, so it cannot overflow
    e_z = np.exp(z)
    return float(e_z / (1.0 + e_z))
def obj_fun(w_b, x_hat, y):
    """
    Calculate the objective (negative log-likelihood) of logistic regression.

    The result is ``sum_i(-y_i * a_i + log(1 + exp(a_i)))`` with
    ``a_i = <w_b, x_hat[:, i]>``. The log term is computed with
    ``np.logaddexp`` so that large scores do not overflow.

    :param w_b: the combined weight and bias, dim entries
    :param x_hat: expanded training features of shape (dim, num), one column
        per sample
    :param y: training labels, of shape (num,) or (num, 1)
    :return: the objective value; a scalar for 1-D labels, a shape-(1,)
        array for (num, 1) labels
    :raises IndexError: if y holds fewer than num labels
    """
    dim, num = x_hat.shape
    beta = np.asarray(w_b, dtype=float).reshape([dim, ])
    labels = np.asarray(y)[:num]
    if labels.shape[0] != num:
        raise IndexError('y holds %d labels but x_hat has %d samples' % (labels.shape[0], num))

    # one score per sample, shaped to broadcast against (num,) or (num, 1) labels
    scores = np.dot(beta, x_hat).reshape((num,) + (1,) * (labels.ndim - 1))
    # logaddexp(0, a) == log(1 + exp(a)) without overflowing for large a
    return np.sum(-labels * scores + np.logaddexp(0.0, scores), axis=0)
def first_order(w_b, x_hat, y):
    """
    Get the first order derivative (gradient) of the objective function.

    The gradient is ``-sum_i x_hat[:, i] * (y_i - p_i)``, where ``p_i`` is
    the probability of y equals 1 for sample i. All samples are processed in
    one matrix product, and ``p_i`` is computed in a form that does not
    overflow for large scores.

    :param w_b: the combined weight and bias, dim entries
    :param x_hat: expanded training features of shape (dim, num), one column
        per sample
    :param y: training labels, num entries (shape (num,) or (num, 1))
    :return: the gradient, an array of shape (dim, 1)
    """
    dim, num = x_hat.shape
    beta = np.asarray(w_b, dtype=float).reshape([dim, ])
    labels = np.asarray(y, dtype=float)[:num].reshape([num, ])

    scores = np.dot(beta, x_hat)
    # exp(-log(1 + exp(-z))) == 1 / (1 + exp(-z)), evaluated without overflow
    probabilities = np.exp(-np.logaddexp(0.0, -scores))
    return -np.dot(x_hat, labels - probabilities).reshape([dim, 1])
def newton_optimal(x, y, max_iter, acc, learning_rate=0.001):
    """
    Learn the weight and bias of the logistic regression model.

    Despite the name, no second-order (Newton) step is taken: each iteration
    is a plain gradient-descent update ``w_b -= learning_rate * gradient``
    starting from a random vector. The loop always performs at least one
    update and stops once the iteration counter (which starts at 1 and is
    incremented after each update) reaches ``max_iter``, or once the
    objective decreased by no more than ``acc`` in the last update (this
    includes the case where it increased).

    :param x: training data features of shape (dim, num), one column per
        sample
    :param y: training data labels
    :param max_iter: bound on the iteration counter
    :param acc: minimum objective decrease required to keep iterating
    :param learning_rate: gradient-descent step size; the default keeps the
        previously hard-coded value
    :return: the learnt weight and bias w_b, an array of shape (dim + 1, 1)
    """
    def _as_float(value):
        # obj_fun may return a size-1 array; implicit array-to-scalar
        # conversion inside '%f' formatting is deprecated in recent NumPy
        return float(np.asarray(value).reshape(-1)[0])

    dim = x.shape[0]
    # initial weight and bias vector
    w_b = np.random.random([dim + 1, 1])
    # append a row of ones so the bias is learnt as the last weight
    x_hat = np.insert(x, dim, 1, 0)
    # objective value used to decide whether the iteration has converged
    obj_result = _as_float(obj_fun(w_b, x_hat, y))
    flag = 1  # mark the number of iterations
    print('------------------------------------------')
    print('---------newton optimal process-----------')
    while True:
        fir = first_order(w_b, x_hat, y)
        w_b = w_b - learning_rate * fir
        new_obj_result = _as_float(obj_fun(w_b, x_hat, y))
        # decrease of the objective achieved by this update
        accuracy = obj_result - new_obj_result
        obj_result = new_obj_result
        print('iteration %d: error: %f(objective: %f)' % (flag, accuracy, new_obj_result))
        flag += 1
        if (flag >= max_iter) or (accuracy <= acc):
            break
    return w_b
def model_test(w_b, x, y):
    """
    Use the learnt model to measure the accuracy of logistic regression.

    A sample is predicted as 1 when its score ``z = <w, x> + b`` is
    positive, which is the condition ``1 / (1 + exp(-z)) > 0.5`` without
    evaluating the exponential (the exponential overflows for very negative
    z). All samples are scored in one matrix product.

    :param w_b: the learnt weight and bias, shape (dim + 1, 1) with the bias
        in the last row
    :param x: testing data features, a column represents an instance
    :param y: testing data labels, one per instance (shape (num,) or
        (num, 1))
    :return: accuracy of the model, the fraction of correctly predicted
        instances
    :raises ZeroDivisionError: if x has no columns
    """
    dim, num = x.shape
    w = w_b[: -1, 0]
    b = w_b[-1, 0]
    z = np.dot(w, x) + b
    forecast_labels = (z > 0).astype(int)
    labels = np.asarray(y)[:num].reshape([num, ])
    # mark the right match pairs
    flag = int(np.count_nonzero(forecast_labels == labels))
    accuracy = flag / num
    return accuracy
def main():
    """
    Train the 0/1 classifier on MNIST and print its test accuracy.

    Loads the binary subset, runs the optimiser with a bound of 10000 on the
    iteration counter and a tolerance of 0.00001, then evaluates the learnt
    weights on the testing subset.
    """
    max_iterations = 10000
    tolerance = 0.00001
    train_x, train_y, test_x, test_y = extract_binary_features()
    learnt_w_b = newton_optimal(train_x, train_y, max_iterations, tolerance)
    print('model accuracy is: %f' % model_test(learnt_w_b, test_x, test_y))


# run the experiment only when executed as a script
if __name__ == '__main__':
    main()
6.通過執行程式碼,得到模型對於二分類問題的識別率為:0.998582