利用python對mnist資料集中的0，1手寫字型進行二分類

阿新 • • 發佈：2018-12-13

1. 下載資料集：點選連結（http://yann.lecun.com/exdb/mnist/），開啟頁面如下圖所示，下載對應的MNIST手寫數字識別資料集，包括訓練集影象、訓練集標籤、測試集影象與測試集標籤四個部分，下載後儲存於指定位置。

2. 分析資料集，進行預處理。由於所提供的資料集格式為 .idx3-ubyte（標籤為 .idx1-ubyte），不方便直接進行訓練，因此需要將其轉化為圖片格式，通過直接讀取圖片畫素作為訓練特徵。資料集預處理的程式碼為（程式檔名：analysisdataset.py）：

import numpy as np

import struct

import matplotlib.pyplot as plt

# Default locations of the four MNIST files, relative to the working directory.

# Training-set image file.
train_images_idx3_ubyte_file = 'user/mnist/train-images.idx3-ubyte'

# Training-set label file.
train_labels_idx1_ubyte_file = 'user/mnist/train-labels.idx1-ubyte'

# Test-set image file.
# NOTE: the directory used to be spelled 'user /mnist' (stray space), which
# pointed at a different directory than the other three paths.
test_images_idx3_ubyte_file = 'user/mnist/t10k-images.idx3-ubyte'

# Test-set label file.
test_labels_idx1_ubyte_file = 'user/mnist/t10k-labels.idx1-ubyte'

def decode_idx3_ubyte(idx3_ubyte_file):
    """Parse an IDX3 image file into a NumPy array.

    The file begins with four big-endian 32-bit integers (magic number,
    image count, rows per image, columns per image) followed by one
    unsigned byte per pixel.

    :param idx3_ubyte_file: path of the IDX3 file
    :return: float64 array of shape (num_images, num_rows, num_cols)
    :raises ValueError: if the file holds fewer pixel bytes than the header
        announces
    """
    # Read the whole file; the context manager closes the handle right away
    # instead of leaving it to the garbage collector.
    with open(idx3_ubyte_file, 'rb') as idx_file:
        bin_data = idx_file.read()

    # Header: magic number, image count, image height, image width.
    fmt_header = '>iiii'
    magic_number, num_images, num_rows, num_cols = struct.unpack_from(fmt_header, bin_data, 0)
    print('魔數:%d, 圖片數量: %d張, 圖片大小: %d*%d' % (magic_number, num_images, num_rows, num_cols))

    # Decode every pixel in one pass rather than unpacking image by image.
    pixel_count = num_images * num_rows * num_cols
    pixels = np.frombuffer(bin_data, dtype=np.uint8, count=pixel_count,
                           offset=struct.calcsize(fmt_header))
    # float64 matches the dtype of the array this function has always returned.
    images = pixels.reshape((num_images, num_rows, num_cols)).astype(np.float64)

    # Progress lines are kept so the console output stays the same as before.
    for parsed in range(10000, num_images + 1, 10000):
        print('已解析 %d' % parsed + '張')

    return images

def decode_idx1_ubyte(idx1_ubyte_file):
    """Parse an IDX1 label file into a NumPy array.

    The file begins with two big-endian 32-bit integers (magic number and
    label count) followed by one unsigned byte per label.

    :param idx1_ubyte_file: path of the IDX1 file
    :return: float64 array of shape (num_labels,)
    :raises ValueError: if the file holds fewer label bytes than the header
        announces
    """
    # Read the whole file; the context manager closes the handle right away
    # instead of leaving it to the garbage collector.
    with open(idx1_ubyte_file, 'rb') as idx_file:
        bin_data = idx_file.read()

    # Header: magic number and number of labels.
    fmt_header = '>ii'
    magic_number, num_images = struct.unpack_from(fmt_header, bin_data, 0)
    print('魔數:%d, 圖片數量: %d張' % (magic_number, num_images))

    # Decode all labels in one pass rather than one struct call per byte.
    raw_labels = np.frombuffer(bin_data, dtype=np.uint8, count=num_images,
                               offset=struct.calcsize(fmt_header))
    # float64 matches the dtype of the array this function has always returned.
    labels = raw_labels.astype(np.float64)

    # Progress lines are kept so the console output stays the same as before.
    for parsed in range(10000, num_images + 1, 10000):
        print('已解析 %d' % parsed + '張')

    return labels

def load_train_images(idx_ubyte_file=train_images_idx3_ubyte_file):
    """Load the training-set image file (train-images-idx3-ubyte).

    :param idx_ubyte_file: path of the IDX file; defaults to the module-level
        training image path
    :return: np.array of shape n * rows * cols, n being the number of images
    """
    train_images = decode_idx3_ubyte(idx_ubyte_file)
    return train_images

def load_train_labels(idx_ubyte_file=train_labels_idx1_ubyte_file):
    """Load the training-set label file (train-labels-idx1-ubyte).

    :param idx_ubyte_file: path of the IDX file; defaults to the module-level
        training label path
    :return: np.array holding one label per image
    """
    train_labels = decode_idx1_ubyte(idx_ubyte_file)
    return train_labels

def load_test_images(idx_ubyte_file=test_images_idx3_ubyte_file):
    """Load the test-set image file (t10k-images-idx3-ubyte).

    :param idx_ubyte_file: path of the IDX file; defaults to the module-level
        test image path
    :return: np.array of shape n * rows * cols, n being the number of images
    """
    test_images = decode_idx3_ubyte(idx_ubyte_file)
    return test_images

def load_test_labels(idx_ubyte_file=test_labels_idx1_ubyte_file):
    """Load the test-set label file (t10k-labels-idx1-ubyte).

    :param idx_ubyte_file: path of the IDX file; defaults to the module-level
        test label path
    :return: np.array holding one label per image
    """
    test_labels = decode_idx1_ubyte(idx_ubyte_file)
    return test_labels

3.第二步僅僅將資料集轉化為圖片檔案並返回圖片畫素矩陣，實驗需要將畫素矩陣轉化為一列向量，因此需要單獨再進行特徵提取，其程式碼為(程式檔案為：extraction_feature.py)：

import analysisdataset

import numpy as np

def load_features():
    """Load MNIST and flatten every image into one feature row.

    :return: tuple ``(training_features, training_labels, testing_features,
        testing_labels)``. Each feature matrix is float64 with one row per
        image and ``rows * cols`` columns; the label arrays are returned as
        loaded by ``analysisdataset``.
    :raises ValueError: if the test images cannot be reshaped to the feature
        width derived from the training images
    """
    # Load the training and testing images and labels.
    training_images = analysisdataset.load_train_images()
    training_labels = analysisdataset.load_train_labels()
    testing_images = analysisdataset.load_test_images()
    testing_labels = analysisdataset.load_test_labels()
    print(training_labels.shape)

    # Feature width is rows * cols. The earlier rows ** 2 only worked for
    # square images.
    dimension = int(np.prod(training_images.shape[1:]))

    # One reshape per set replaces the per-image copy loops. np.array makes
    # a copy, so the returned matrices do not alias the image arrays.
    training_features = np.array(training_images, dtype=np.float64).reshape(
        training_images.shape[0], dimension)
    testing_features = np.array(testing_images, dtype=np.float64).reshape(
        testing_images.shape[0], dimension)

    return training_features, training_labels, testing_features, testing_labels

4. 由於任務為0、1二分類任務，因此還需要從提取好的特徵中篩選出標籤為0和1的樣本，用於後續邏輯迴歸的優化問題，處理的程式碼為（程式檔案：logistregression.py）：

import extraction_feature as ef

import numpy as np

# pre-processing the dataset

def extract_binary_features():
    """Build the binary (label <= 1) subset of the MNIST features.

    :return: tuple ``(train_features, train_labels, test_features,
        test_labels)``. Features are divided by 255 and transposed so that
        each column is one sample; labels are column vectors with one row
        per sample.
    """
    # Full feature matrices and labels for every digit.
    all_train_x, all_train_y, all_test_x, all_test_y = ef.load_features()

    # Boolean masks picking the samples whose label is <= 1 (digits 0 and 1).
    train_mask = all_train_y <= 1
    test_mask = all_test_y <= 1

    # Select the matching rows and scale the pixel values by 1 / 255.
    train_x = all_train_x[train_mask, :] / 255
    test_x = all_test_x[test_mask, :] / 255

    # Shape the labels as one row, then transpose into a column vector.
    train_y = all_train_y[train_mask].reshape([1, train_x.shape[0]]).T
    test_y = all_test_y[test_mask].reshape([1, test_x.shape[0]]).T

    return train_x.T, train_y, test_x.T, test_y

5. 提取完特徵，需要進行訓練邏輯迴歸模型，這裡對邏輯迴歸模型的優化利用梯度下降演算法。其中，模型的優化目標為：

$p({y_i}|{x_i};w,b) = {y_i}{p_1}({\hat x_i};\beta ) + (1 - {y_i}){p_0}({\hat x_i};\beta )$

其中， ${p_1}({\hat x_i};\beta ) = \frac{{{e^{{\beta ^{\rm{T}}}{{\hat x}_i}}}}}{{1 + {e^{{\beta ^{\rm{T}}}{{\hat x}_i}}}}}$ ， ${p_0}({\hat x_i};\beta ) = \frac{1}{{1 + {e^{{\beta ^{\rm{T}}}{{\hat x}_i}}}}}$ ，而 $\beta {\rm{ = (}}w{\rm{;}}b{\rm{)}}$ ，w和b分別為特徵向量的權重與偏置，而 ${\hat x_i} = ({x_i};1)$ 。對以上模型利用“極大似然法”來估計w和b，對率迴歸模型最大化“對數似然”：

$l(w,b) = \sum\limits_{i = 1}^m {\ln p({y_i}|{x_i};w,b)}$

將 ${p_1}$ 、 ${p_0}$ 代入上式並取負號，最大化對數似然等價於最小化以下目標函式：

$l(\beta ) = \sum\limits_{i = 1}^m {\left( { - {y_i}{\beta ^{\rm{T}}}{{\hat x}_i} + \ln (1 + {e^{{\beta ^{\rm{T}}}{{\hat x}_i}}})} \right)}$

利用梯度下降法求解上述目標函式，並對模型進行測試的程式碼為（程式檔案：logistregression.py）：

def cond_pro(w_b, x_hat):
    """Return the conditional probability p(y = 1 | x_hat).

    :param w_b: the combined weight and bias; its first dimension fixes the
        vector length
    :param x_hat: one expanded sample (features with a trailing 1) holding
        the same number of elements as ``w_b``
    :return: the logistic function of the inner product of ``w_b`` and
        ``x_hat``, as a Python float
    """
    dim = w_b.shape[0]
    score = float(np.inner(w_b.reshape([dim, ]), x_hat.reshape([dim, ])))

    # np.math was removed in NumPy 2.0, so np.exp is used instead. Always
    # exponentiating a non-positive number keeps exp() within (0, 1], so
    # large |score| cannot overflow.
    if score >= 0:
        return float(1.0 / (1.0 + np.exp(-score)))
    e_score = np.exp(score)
    return float(e_score / (1.0 + e_score))

def obj_fun(w_b, x_hat, y):
    """Evaluate the objective: the negated log-likelihood of the model.

    Computes ``sum_i(-y_i * a_i + log(1 + exp(a_i)))`` where ``a_i`` is the
    inner product of ``w_b`` with column ``i`` of ``x_hat``.

    :param w_b: the combined weight and bias (``dim`` elements)
    :param x_hat: expanded training features of shape (dim, num), one
        sample per column
    :param y: training labels, indexed by sample along the first axis
    :return: the objective summed over the samples; shaped like ``y[0]``
        (a length-1 array for column-vector labels, a scalar for 1-D labels)
    :raises IndexError: if ``y`` holds fewer than ``num`` labels
    """
    dim, num = x_hat.shape
    weights = np.asarray(w_b, dtype=np.float64).reshape(dim)

    # Only the first `num` labels are used, one per column of x_hat.
    targets = np.asarray(y)[:num]
    if targets.shape[0] < num:
        raise IndexError('y holds %d labels but x_hat has %d samples'
                         % (targets.shape[0], num))

    # a_i for every sample at once, shaped so it lines up with the labels.
    scores = np.dot(np.asarray(x_hat).T, weights)
    scores = scores.reshape((num,) + (1,) * (targets.ndim - 1))

    # np.logaddexp(0, a) equals log(1 + exp(a)) without overflowing for
    # large a. It also avoids np.math, which was removed in NumPy 2.0.
    return np.sum(-targets * scores + np.logaddexp(0, scores), axis=0)

def first_order(w_b, x_hat, y):
    """Compute the first-order derivative (gradient) of the objective.

    The gradient is ``-sum_i(x_i * (y_i - p_i))`` with ``p_i`` the logistic
    function of the inner product of ``w_b`` and column ``i`` of ``x_hat``.

    :param w_b: the combined weight and bias (``dim`` elements)
    :param x_hat: expanded training features of shape (dim, num), one
        sample per column
    :param y: training labels, one scalar label per sample (1-D or a
        column vector)
    :return: the gradient as a float array of shape (dim, 1)
    """
    dim, num = x_hat.shape
    weights = np.asarray(w_b, dtype=np.float64).reshape(dim)
    targets = np.asarray(y, dtype=np.float64)[:num].reshape(num)

    # Scores for all samples at once, replacing the per-sample loop.
    scores = np.dot(np.asarray(x_hat).T, weights)

    # sigmoid(a) written as exp(-log(1 + exp(-a))), which cannot overflow
    # for large |a| and does not need the removed np.math module.
    prob_one = np.exp(-np.logaddexp(0, -scores))

    gradient = -np.dot(x_hat, targets - prob_one)
    return gradient.reshape(dim, 1)

def newton_optimal(x, y, max_iter, acc, learning_rate=0.001):
    """Learn the weight and bias by gradient descent on the objective.

    Despite the name, no second-order term is used: every iteration takes
    a fixed-size step along the negative gradient.

    :param x: training data features of shape (dim, num), one sample per
        column
    :param y: training data labels
    :param max_iter: maximum number of iterations; at least one iteration
        is always performed
    :param acc: stop once the objective decreases by no more than this
        amount in one iteration (this also stops on an increase)
    :param learning_rate: gradient-descent step size; the default is the
        value that used to be hard-coded
    :return: the learnt weight and bias ``w_b`` of shape (dim + 1, 1)
    """
    dim, _ = x.shape

    # Random initial weights, and features extended with a row of ones so
    # that the bias is learnt as the last weight.
    w_b = np.random.random([dim + 1, 1])
    x_hat = np.insert(x, dim, 1, 0)

    # obj_fun may return a size-1 array; .item() turns it into a plain
    # float so the '%f' formatting below needs no array-to-scalar conversion.
    obj_result = np.asarray(obj_fun(w_b, x_hat, y), dtype=float).item()

    iteration = 1
    print('------------------------------------------')
    print('---------newton optimal process-----------')

    while True:
        gradient = first_order(w_b, x_hat, y)
        w_b = w_b - learning_rate * gradient

        new_obj_result = np.asarray(obj_fun(w_b, x_hat, y), dtype=float).item()
        improvement = obj_result - new_obj_result
        obj_result = new_obj_result
        print('iteration %d: error: %f(objective: %f)' % (iteration, improvement, new_obj_result))

        # Checking before the increment allows exactly max_iter iterations.
        # The earlier check ran after it and stopped one iteration early.
        if (iteration >= max_iter) or (improvement <= acc):
            break
        iteration += 1

    return w_b

def model_test(w_b, x, y):
    """Measure the accuracy of the learnt model on labelled data.

    :param w_b: learnt weight and bias of shape (dim + 1, 1); the last row
        is the bias
    :param x: testing data features of shape (dim, num), a column
        represents an instance
    :param y: testing data labels, one scalar label per instance (1-D or a
        column vector)
    :return: fraction of instances whose predicted label equals ``y``
    :raises ZeroDivisionError: if ``x`` holds no instances
    """
    dim, num = x.shape
    w = w_b[:-1, 0]
    b = w_b[-1, 0]

    # Scores for all instances at once, replacing the per-sample loop.
    scores = np.dot(w, x) + b

    # sigmoid(z) > 0.5 is equivalent to z > 0. Testing the sign avoids
    # evaluating exp(-z), which overflowed for strongly negative scores
    # (and relied on np.math, removed in NumPy 2.0).
    forecast_labels = (scores > 0).astype(int)

    targets = np.asarray(y)[:num].reshape(num)
    matches = int(np.count_nonzero(forecast_labels == targets))
    return matches / num

def main():
    """Train the binary classifier, then print its test accuracy."""
    train_x, train_y, test_x, test_y = extract_binary_features()

    # Stopping criteria handed to the optimizer.
    max_iterations = 10000
    tolerance = 0.00001

    learnt_w_b = newton_optimal(train_x, train_y, max_iterations, tolerance)
    test_accuracy = model_test(learnt_w_b, test_x, test_y)
    print('model accuracy is: %f' % test_accuracy)


if __name__ == '__main__':
    main()

6.通過執行程式碼，得到模型對於二分類問題的識別率為:0.998582

利用python對mnist資料集中的0，1手寫字型進行二分類

利用python對mnist資料集中的0，1手寫字型進行二分類

R語言對MNIST資料集分析：探索手寫數字分類

使用OpenCV自帶的神經網路對MNIST手寫字型進行識別

機器學習Tensorflow基於MNIST資料集識別自己的手寫數字（讀取和測試自己的模型）

利用Python實現k最近鄰演算法並識別手寫數字（詳細註釋）

利用caffe訓練好的模型測試自己的手寫字型圖片

利用python對2012美國大選進行資料分析(四，時間處理)

利用Python對QQ空間資料進行分析，瞭解你的QQ好友

python打亂資料集中X，y標籤對的方法

Tensorflow學習教程------利用卷積神經網路對mnist資料集進行分類_利用訓練好的模型進行分類

利用softmax函式對mnist資料集簡單分類

python對同一個資料夾下進行遍歷操作，跳過處理過的

利用python對泰坦尼克號資料集進行分析

利用python對多個txt檔案中的資料進行篩選

memset 對每個字節進行初始化（0，-1）

利用python爬取點小圖片，滿足私欲(爬蟲)

利用Python編寫一個會員管理系統，沈迷於編程的世界裏！

利用python對WiderFace數據解析及畫框

破解密碼很難？利用Python自動編寫暴力破解字典，黑客必學技能！

mysql 如何將查詢出來的資料轉化為0，1標識的狀態碼

利用python對mnist資料集中的0，1手寫字型進行二分類

相關推薦