
Keras: Monitoring the Training Process and Selectively Saving Models During Training

Normally when we train a deep learning model, we only get to inspect metrics like loss and accuracy after training finishes. But when the model is badly designed, overfits, or fits the data in some seriously wrong way, we only find out at the end, and training a deep network usually means waiting a long time: I once spent three or four days training ResNet50 on a hundred images. At that point, do we waste time letting it keep training, or wait for the metrics before judging? We have no way to know. Fortunately, with Keras we can monitor the training process as it runs: the moment something goes wrong we can stop immediately, and we can also save the current model so that training can pick up where it left off next time.

As an example, let's train the miniVGG network written earlier on the CIFAR-10 dataset:

First, extend Keras's BaseLogger callback: trainingmonitor.py

from keras.callbacks import BaseLogger
import matplotlib.pyplot as plt
import numpy as np
import json
import os

class TrainingMonitor(BaseLogger):
	def __init__(self, figPath, jsonPath=None, startAt=0):
		# save the loss/accuracy plot to figPath and, optionally,
		# the metric history to a JSON file
		super(TrainingMonitor, self).__init__()
		self.figPath = figPath
		self.jsonPath = jsonPath
		# the epoch the (re)started training begins at
		self.startAt = startAt

	def on_train_begin(self, logs={}):
		# initialize the history dictionary of metrics
		self.H = {}
		# if a JSON path was supplied and a history file already exists,
		# load the previously recorded metrics
		if self.jsonPath is not None:
			if os.path.exists(self.jsonPath):
				self.H = json.loads(open(self.jsonPath).read())
				# if a restart epoch was supplied, trim any entries
				# recorded past that epoch
				if self.startAt > 0:
					for k in self.H.keys():
						self.H[k] = self.H[k][:self.startAt]

	def on_epoch_end(self, epoch, logs={}):
		# append the loss, accuracy, etc. for this epoch to the history
		for (k, v) in logs.items():
			l = self.H.get(k, [])
			l.append(v)
			self.H[k] = l
		# serialize the metric history to disk, but only if a
		# jsonPath was supplied
		if self.jsonPath is not None:
			f = open(self.jsonPath, 'w')
			f.write(json.dumps(self.H))
			f.close()
		# plot the loss and accuracy once at least two epochs have run
		if len(self.H["loss"]) > 1:
			N = np.arange(0, len(self.H["loss"]))
			plt.style.use("ggplot")
			plt.figure()
			plt.plot(N, self.H["loss"], label="train_loss")
			plt.plot(N, self.H["val_loss"], label="val_loss")
			plt.plot(N, self.H["acc"], label="train_acc")
			plt.plot(N, self.H["val_acc"], label="val_acc")
			plt.title("Training Loss and Accuracy [Epoch {}]".format(len(self.H["loss"])))
			plt.xlabel("Epoch #")
			plt.ylabel("Loss/Accuracy")
			plt.legend()
			plt.savefig(self.figPath)
			plt.close()
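If training was interrupted partway, the startAt argument is what lets the monitor line its history back up with a resumed run. Below is a minimal sketch of how the pieces could fit together; model_checkpoint.hdf5, output/resume.png, output/resume.json, and the training arrays are assumed to exist from an earlier session, not produced by the code above.

from keras.models import load_model
from trainingmonitor import TrainingMonitor

# reload the model saved before training was interrupted
# (hypothetical path; point it at your own checkpoint)
model = load_model("model_checkpoint.hdf5")

# resume plotting/JSON logging at epoch 30 instead of starting over;
# TrainingMonitor trims any history entries recorded past startAt
callbacks = [TrainingMonitor("output/resume.png",
	jsonPath="output/resume.json", startAt=30)]
model.fit(trainX, trainY, validation_data=(testX, testY),
	batch_size=64, epochs=70, callbacks=callbacks, verbose=1)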

cifar10_monitor.py

import matplotlib
matplotlib.use("Agg")
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from miniVGG import MiniVGGNet
from keras.optimizers import SGD
from keras.callbacks import LearningRateScheduler
from trainingmonitor import TrainingMonitor
from keras.datasets import cifar10
import matplotlib.pyplot as plt
import numpy as np
import argparse
import os

def step_decay(epoch):
	# halve the initial learning rate of 0.01 every 5 epochs
	initAlpha = 0.01
	factor = 0.5
	dropEvery = 5

	# compute the learning rate for the current epoch
	alpha = initAlpha * (factor ** np.floor((1 + epoch) / dropEvery))

	return float(alpha)

ap = argparse.ArgumentParser()
ap.add_argument("-o", "--output", required=True, help="path to the output loss/accuracy plot")
ap.add_argument("-m", "--model", required=True, help="path to save train model")
args = vars(ap.parse_args())

print("[INFO] process ID: {}".format(os.getpid()))

print("[INFO] loading CIFAR-10 dataset")
((trainX, trainY), (testX, testY)) = cifar10.load_data()
trainX = trainX.astype("float") / 255.0
testX = testX.astype("float") / 255.0

lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)
# class name strings for labels 0-9
labelNames = ['airplane', 'automobile', 'bird', 'cat', 
	'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

print("[INFO] compiling model...")
# callbacks = [LearningRateScheduler(step_decay)]
# opt = SGD(lr=0.01, decay=0.01 / 70, momentum=0.9, nesterov=True)
opt = SGD(lr=0.01, momentum=0.9, nesterov=True)
model = MiniVGGNet.build(width=32, height=32, depth=3, classes=10)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=['accuracy'])
model.summary()

figPath = os.path.sep.join([args["output"], "{}.png".format(os.getpid())])
jsonPath = os.path.sep.join([args["output"], "{}.json".format(os.getpid())])

callbacks = [TrainingMonitor(figPath, jsonPath=jsonPath)]

print("[INFO] training network Lenet-5")
H = model.fit(trainX, trainY, validation_data=(testX, testY), batch_size=64, epochs=100, 
	callbacks=callbacks, verbose=1)

model.save(args["model"])

print("[INFO] evaluating Lenet-5..")
preds = model.predict(testX, batch_size=64)
print(classification_report(testY.argmax(axis=1), preds.argmax(axis=1), 
	target_names=labelNames))
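Note that step_decay is defined above but the LearningRateScheduler line is left commented out; to actually use the schedule, pass LearningRateScheduler(step_decay) in the callbacks list next to the TrainingMonitor. As a quick sanity check (my addition, not in the original post), printing the first dozen values shows the halving behaviour:

# learning rates produced by step_decay: 0.01 for epochs 0-3,
# 0.005 for epochs 4-8, 0.0025 for epochs 9-13, and so on
for epoch in range(12):
	print("epoch {:02d}: lr = {}".format(epoch, step_decay(epoch)))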

From the resulting plot you can see that around epoch 25 the model is already clearly overfitting, so we can terminate the training right there; running a large batch of further epochs is pointless and only keeps our GPU grinding away hot and at full load...
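Incidentally, if you would rather not watch the plot yourself, Keras also ships an EarlyStopping callback that ends training automatically once a monitored metric stops improving. This is my own aside, not part of the original workflow of killing the process by hand; a minimal sketch:

from keras.callbacks import EarlyStopping

# stop training once val_loss has failed to improve for 5 straight epochs
earlystop = EarlyStopping(monitor="val_loss", patience=5, verbose=1)
callbacks = [TrainingMonitor(figPath, jsonPath=jsonPath), earlystop]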

Next, how to save models during training. Normally a single final model is saved only after a training run completes in full, but if overfitting beyond acceptance has already set in partway through, that final saved model is no longer the best one. The model from some intermediate epoch may be exactly the right one, yet we never saved it, which is really maddening. Keras lets us solve this problem nicely; compared with raw TensorFlow, Keras really is friendlier to use. A demo follows:

demo-cifar10-checkpoint.py

from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from miniVGG import MiniVGGNet
from keras.callbacks import ModelCheckpoint
from keras.optimizers import SGD
from keras.datasets import cifar10
import argparse
import os

ap = argparse.ArgumentParser()
ap.add_argument("-w", "--weights", required=True, help="path toweights directory")
args = vars(ap.parse_args())

print("[INFO] loading CIFAR-10 dataset")
((trainX, trainY), (testX, testY)) = cifar10.load_data()
trainX = trainX.astype("float") / 255.0
testX = testX.astype("float") / 255.0

lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)
# class name strings for labels 0-9
labelNames = ['airplane', 'automobile', 'bird', 'cat', 
	'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

print("[INFO] compiling model...")
opt = SGD(lr=0.01, decay=0.01 / 40, momentum=0.9, nesterov=True)
model = MiniVGGNet.build(width=32, height=32, depth=3, classes=10)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
# template for the checkpoint filenames
fname = os.path.sep.join([args["weights"], "weights={epoch:03d}-{val_loss:.4f}.hdf5"])
# let Keras handle the saving according to these arguments; for the details of
# monitor, mode, and friends, the best approach is to read the official docs
checkpoint = ModelCheckpoint(fname, monitor="val_loss", mode="min", save_best_only=True, verbose=1)
callbacks = [checkpoint]

print("[INFO] training our network MiniVGG...")
H = model.fit(trainX, trainY, validation_data=(testX, testY), batch_size=64, epochs=50, callbacks=callbacks, verbose=2)

print("[INFO] evaluating MiniVGG..")
preds = model.predict(testX, batch_size=64)
print(classification_report(testY.argmax(axis=1), preds.argmax(axis=1), 
	target_names=labelNames))
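With save_best_only=True, a new file such as weights=005-0.7123.hdf5 (a hypothetical name following the template above) is written only when val_loss improves, so the directory collects one file per improvement. If disk usage is a concern, ModelCheckpoint also accepts a save_weights_only flag that stores just the weights rather than the full model, producing smaller files; a hedged variant:

# variant (my own sketch): store only the weights at each improvement;
# the architecture must be rebuilt before calling load_weights() later
checkpoint = ModelCheckpoint(fname, monitor="val_loss", mode="min",
	save_best_only=True, save_weights_only=True, verbose=1)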

If you find it wasteful that every epoch in which the loss or accuracy moves in the right direction writes yet another model to disk, fine: you can save just the single best model of the whole run instead:

demo-cifar10-best.py

from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from miniVGG import MiniVGGNet
from keras.callbacks import ModelCheckpoint
from keras.optimizers import SGD
from keras.datasets import cifar10
import argparse
import os

ap = argparse.ArgumentParser()
ap.add_argument("-w", "--weights", required=True, help="path toweights directory")
args = vars(ap.parse_args())

print("[INFO] loading CIFAR-10 dataset")
((trainX, trainY), (testX, testY)) = cifar10.load_data()
trainX = trainX.astype("float") / 255.0
testX = testX.astype("float") / 255.0

lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)
# class name strings for labels 0-9
labelNames = ['airplane', 'automobile', 'bird', 'cat', 
	'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

print("[INFO] compiling model...")
opt = SGD(lr=0.01, decay=0.01 / 40, momentum=0.9, nesterov=True)
model = MiniVGGNet.build(width=32, height=32, depth=3, classes=10)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

# fname = os.path.sep.join([args["weights"], "weights={epoch:03d}-{val_loss:.4f}.hdf5"])
# save to a single file instead; as before, keep the model with the lowest validation loss
checkpoint = ModelCheckpoint(args["weights"], monitor="val_loss", mode="min", save_best_only=True, verbose=1)
callbacks = [checkpoint]

print("[INFO] training our network MiniVGG...")
H = model.fit(trainX, trainY, validation_data=(testX, testY), batch_size=64, epochs=50, callbacks=callbacks, verbose=2)

print("[INFO] evaluating MiniVGG..")
preds = model.predict(testX, batch_size=64)
print(classification_report(testY.argmax(axis=1), preds.argmax(axis=1), 
	target_names=labelNames))

The principle is simply that whenever the monitored metric improves, the new model overwrites the previous, worse one.
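One caveat worth adding (my note, not in the original): the evaluation at the end of the script above runs on the in-memory model from the final epoch, which is not necessarily the checkpointed best one. To evaluate the best model instead, reload it first:

from keras.models import load_model

# reload the single best model written by ModelCheckpoint and evaluate it
best = load_model(args["weights"])
preds = best.predict(testX, batch_size=64)
print(classification_report(testY.argmax(axis=1), preds.argmax(axis=1),
	target_names=labelNames))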

While we're here, one more little trick for visualizing a model, noted down for reference:

On Ubuntu 16.04 I first needed to install the libraries graphviz depends on:

sudo apt-get install graphviz
pip install graphviz
pip install pydot

demo.py

from lenet import LeNet
from miniVGG import MiniVGGNet
from keras.utils import plot_model

model1 = LeNet.build(28, 28, 1, 10)
plot_model(model1, to_file="lenet-5-mnist.png", show_shapes=True)

model12 = LeNet.build(32, 32, 3, 10)
plot_model(model12, to_file="lenet-5-cifar10.png", show_shapes=True)

model13 = LeNet.build(32, 32, 3, 100)
plot_model(model13, to_file="lenet-5-cifar100.png", show_shapes=True)

model2 = MiniVGGNet.build(28, 28, 1, 10)
plot_model(model2, to_file="miniVGGNet-mnist.png", show_shapes=True)

model21 = MiniVGGNet.build(32, 32, 3, 10)
plot_model(model21, to_file="miniVGGNet-cifar10.png", show_shapes=True)

model22 = MiniVGGNet.build(32, 32, 3, 100)
plot_model(model22, to_file="miniVGGNet-cifar100.png", show_shapes=True)

(Generated architecture diagrams: lenet-5-cifar10.png, lenet-5-cifar100.png, lenet-5-mnist.png, miniVGGNet-mnist.png)

This sort of clever trick helps us visually verify that the network architecture we designed is laid out the way we intended.