Yolov2原始碼解析(二)
阿新 • • 發佈:2018-12-30
一、資料集製作
首先從官網下載VOC2012資料集,這裡我下載到的是訓練驗證集檔案:VOCtrainval_11-May-2012。為了減輕訓練開銷,我將驗證集作為測試集:把Main資料夾下的val.txt改名為test.txt,然後將資料集製作成hdf5檔案的形式。
import os

import h5py
import numpy as np
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ElementTree

classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
]

sets_from_2012 = [('2012', 'train'), ('2012', 'test')]
train_set = [('2012', 'train')]
test_set = [('2012', 'test')]


def get_boxes_for_id(voc_path, year, image_id, height, width):
    """Read the ground-truth boxes of one image from its annotation XML.

    Args:
        voc_path: Directory that contains the ``VOCdevkit`` folder.
        year: Dataset year used in the ``VOC{year}`` folder name.
        image_id: Annotation file name without the ``.xml`` extension.
        height: Image height used to scale the y coordinates.
        width: Image width used to scale the x coordinates.

    Returns:
        A flat 1-D array holding, for every kept object, the five values
        ``class_index, xmin / width, ymin / height, xmax / width,
        ymax / height``. Objects whose label is not in ``classes`` or that
        are flagged as difficult are skipped.
    """
    fname = os.path.join(
        voc_path,
        'VOCdevkit/VOC{}/Annotations/{}.xml'.format(year, image_id))
    with open(fname) as in_file:
        xml_tree = ElementTree.parse(in_file)
    root = xml_tree.getroot()

    boxes = []
    for obj in root.iter('object'):
        label = obj.find('name').text
        # A missing <difficult> tag is treated as "not difficult" instead of
        # crashing on ``None.text``.
        difficult = obj.find('difficult')
        is_difficult = difficult is not None and int(difficult.text) == 1
        if label not in classes or is_difficult:
            continue

        xml_box = obj.find('bndbox')
        bbox = (classes.index(label),
                float(xml_box.find('xmin').text) / width,
                float(xml_box.find('ymin').text) / height,
                float(xml_box.find('xmax').text) / width,
                float(xml_box.find('ymax').text) / height)
        # extend (not append): the boxes are stored flattened.
        boxes.extend(bbox)
    return np.array(boxes)


def get_image_for_id(voc_path, year, image_id):
    """Load one JPEG image and flatten it.

    Returns:
        A tuple ``(flat_pixels, height, width)`` where ``flat_pixels`` is the
        image array flattened in C order and ``height`` / ``width`` are the
        first two dimensions of the loaded array.
    """
    fname = os.path.join(
        voc_path,
        'VOCdevkit/VOC{}/JPEGImages/{}.jpg'.format(year, image_id))
    im = plt.imread(fname)
    height = im.shape[0]
    width = im.shape[1]
    return im.flatten('C'), height, width


def get_ids(voc_path, datasets):
    """Collect the image names listed in the ``ImageSets/Main`` text files.

    Args:
        voc_path: Directory that contains the ``VOCdevkit`` folder.
        datasets: Iterable of ``(year, image_set)`` pairs.

    Returns:
        A list with one stripped, non-empty line per image.
    """
    ids = []
    for year, image_set in datasets:
        id_file = os.path.join(
            voc_path,
            'VOCdevkit/VOC{}/ImageSets/Main/{}.txt'.format(year, image_set))
        with open(id_file, 'r') as image_ids:
            # Blank lines would otherwise become empty image names.
            ids.extend(
                line.strip() for line in image_ids if line.strip())
    return ids


def add_to_dataset(voc_path, year, ids, images, boxes, image_size, start=0):
    """Write image pixels, boxes and sizes into the given datasets.

    Entry ``start + i`` of ``images``, ``boxes`` and ``image_size`` receives
    the data of ``ids[i]``; the size is stored as ``[height, width]``.
    """
    for i, voc_id in enumerate(ids):
        image_data, height, width = get_image_for_id(voc_path, year, voc_id)
        image_boxes = get_boxes_for_id(voc_path, year, voc_id, height, width)
        images[start + i] = image_data
        boxes[start + i] = image_boxes
        image_size[start + i] = [height, width]


def main(voc_path):
    """Build ``pascal_voc_2012.hdf5`` inside ``voc_path``.

    The file gets a ``train`` and a ``test`` group, each with the
    variable-length datasets ``images``, ``boxes`` and ``size``.
    """
    # Read the image names of the training and test splits.
    train_ids = get_ids(voc_path, train_set)
    test_ids = get_ids(voc_path, test_set)

    # Create the HDF5 structure.
    print('Creating HDF5 dataset structure.')
    fname = os.path.join(voc_path, 'pascal_voc_2012.hdf5')
    # The context manager closes the file even when processing fails.
    with h5py.File(fname, 'w') as voc_h5file:
        uint8_dt = h5py.special_dtype(vlen=np.dtype('uint8'))
        vlen_int_dt = h5py.special_dtype(vlen=np.dtype(int))
        vlen_float = h5py.special_dtype(vlen=np.dtype(float))
        train_group = voc_h5file.create_group('train')
        test_group = voc_h5file.create_group('test')

        # np.bytes_ is the spelling that survives NumPy 2.0 (np.string_ was
        # removed there).
        voc_h5file.attrs['classes'] = np.bytes_(','.join(classes))

        # Images are stored as variable-length uint8 arrays.
        train_images = train_group.create_dataset(
            'images', shape=(len(train_ids), ), dtype=uint8_dt)
        test_images = test_group.create_dataset(
            'images', shape=(len(test_ids), ), dtype=uint8_dt)

        # Flattened (class, xmin, ymin, xmax, ymax) values per image.
        train_boxes = train_group.create_dataset(
            'boxes', shape=(len(train_ids), ), dtype=vlen_float)
        test_boxes = test_group.create_dataset(
            'boxes', shape=(len(test_ids), ), dtype=vlen_float)

        # [height, width] per image, needed to undo the flattening.
        train_size = train_group.create_dataset(
            'size', shape=(len(train_ids), ), dtype=vlen_int_dt)
        test_size = test_group.create_dataset(
            'size', shape=(len(test_ids), ), dtype=vlen_int_dt)

        print('Processing Pascal VOC 2012 datasets for training set.')
        add_to_dataset(voc_path, '2012', train_ids, train_images,
                       train_boxes, train_size)
        print('Processing Pascal VOC 2012 test set.')
        add_to_dataset(voc_path, '2012', test_ids, test_images,
                       test_boxes, test_size)

        print('Closing HDF5 file.')
    print('Done.')


if __name__ == '__main__':
    voc_path = 'C:/Users/guesthost/Desktop/VOCtrainval_11-May-2012/'
    main(voc_path)
這裡將voc_path資料夾下的影象資訊和boxes資訊記錄在h5檔案中,最後生成pascal_voc_2012.hdf5檔案。
二、訓練
首先是讀取資料集。由於之前製作資料集使用的是h5檔案格式,這裡同樣使用h5py讀取。
def load_data(data_path,
              classes_path='C:\\Users\\guesthost\\Desktop\\zc\\yoloV2\\voc_classes.txt',
              anchors_path='C:\\Users\\guesthost\\Desktop\\zc\\yoloV2\\voc_anchor_box.txt'):
    """Load the training split, class names and anchors.

    Args:
        data_path: Path of the HDF5 file; its ``train`` group must provide
            the ``images``, ``size`` and ``boxes`` datasets.
        classes_path: Text file passed to ``get_classes``.
        anchors_path: Text file passed to ``get_anchors``.

    Returns:
        A tuple ``(train_images, train_boxes, class_names, anchors)``.
        ``train_images`` is the list produced by ``process_images`` and
        ``train_boxes`` holds the per-image box arrays read into memory.
    """
    # Open read-only and close the file once everything is in memory.
    with h5py.File(data_path, 'r') as h5_file:
        train_data = h5_file['train']
        # process_images needs the flattened pixels *and* the stored sizes
        # to restore each image to its original dimensions.
        train_images = process_images(train_data['images'],
                                      train_data['size'])
        # Materialise the boxes so they stay valid after the file is closed.
        train_boxes = train_data['boxes'][:]

    class_names = get_classes(classes_path)
    anchors = get_anchors(anchors_path)
    return train_images, train_boxes, class_names, anchors
由於在資料集上我們對影象資料進行flatten,所以這裡需要根據記錄影象的尺寸對矩陣進行恢復:
def process_images(train_images, train_size):
    """Restore flattened images to ``(height, width, 3)`` arrays.

    Args:
        train_images: Indexable collection of flattened pixel arrays.
        train_size: Iterable whose i-th entry starts with the height and
            width of the i-th image.

    Returns:
        A list with one reshaped array per entry of ``train_size``.
    """
    return [
        np.reshape(train_images[idx], (dims[0], dims[1], 3))
        for idx, dims in enumerate(train_size)
    ]
之後對影象矩陣資訊進行處理,包括調整影象大小、資料型別轉化與標準化。針對boxes,這裡將角點座標(x_min, y_min, x_max, y_max)轉換成中心點與寬高,並表示成相對於整張圖片的比值。同時找出所有影象中最大的目標數,將目標數不足的影象以全零的box填充至該數量。
def process_data(images, boxes=None):
    """Prepare images and ground-truth boxes for training.

    Args:
        images: Sequence of image arrays accepted by ``PIL.Image.fromarray``.
        boxes: Optional sequence with one array per image; each array is
            reshaped to ``(-1, 5)`` rows of
            ``[class, x_min, y_min, x_max, y_max]``.

    Returns:
        The images resized to 416x416, converted to float and divided by
        255. When ``boxes`` is given, a tuple ``(images, boxes)`` is
        returned instead, where each box row is
        ``[x_center, y_center, width, height, class]`` divided by the
        (width, height) of its own image, and every per-image array is
        zero-padded to the largest box count.

    Raises:
        ValueError: If ``boxes`` is given and its length differs from the
            number of images.
    """
    images = [PIL.Image.fromarray(i) for i in images]
    # One (1, 2) array of [width, height] per image: the images do not have
    # to share a size, so each image's boxes are scaled by its own size.
    orig_sizes = [np.array([[image.width, image.height]]) for image in images]

    # Resize, convert to float (np.float no longer exists in current NumPy)
    # and scale the pixel values.
    processed_images = [
        np.array(image.resize((416, 416), PIL.Image.BICUBIC), dtype=float) / 255.
        for image in images
    ]

    if boxes is None:
        return np.array(processed_images)

    if len(boxes) != len(images):
        raise ValueError(
            'got {} box arrays for {} images'.format(len(boxes), len(images)))

    # NOTE(review): if the boxes were already divided by the image size when
    # the dataset was built, the division below normalises them a second
    # time -- confirm which convention the caller's data uses.
    converted = []
    for box, orig_size in zip(boxes, orig_sizes):
        # box rows: [class, x_min, y_min, x_max, y_max]
        box = np.asarray(box).reshape((-1, 5))
        box_xy = 0.5 * (box[:, 3:5] + box[:, 1:3]) / orig_size
        box_wh = (box[:, 3:5] - box[:, 1:3]) / orig_size
        converted.append(
            np.concatenate((box_xy, box_wh, box[:, 0:1]), axis=1))

    # Images contain different numbers of objects; pad with all-zero rows so
    # every image has the same number of box rows.
    max_boxes = max((box.shape[0] for box in converted), default=0)
    for i, box in enumerate(converted):
        if box.shape[0] < max_boxes:
            zero_padding = np.zeros(
                (max_boxes - box.shape[0], 5), dtype=np.float32)
            converted[i] = np.vstack((box, zero_padding))

    return np.array(processed_images), np.array(converted)
獲取訓練所需的一些引數資訊
def get_detector_mask(boxes, anchors):
    """Run ``preprocess_true_boxes`` on the boxes of every image.

    Args:
        boxes: Sequence with one box array per image.
        anchors: Anchor boxes forwarded unchanged to
            ``preprocess_true_boxes``.

    Returns:
        A tuple ``(detectors_mask, matching_true_boxes)`` of arrays that
        stack the two values returned for each image, computed for a
        ``[416, 416]`` image size.
    """
    masks = []
    matched = []
    for image_boxes in boxes:
        mask, true_boxes = preprocess_true_boxes(
            image_boxes, anchors, [416, 416])
        masks.append(mask)
        matched.append(true_boxes)
    return np.array(masks), np.array(matched)
def get_classes(classes_path):
    """Load the class names, one per line, with whitespace stripped."""
    with open(classes_path) as handle:
        return [line.strip() for line in handle]
def get_anchors(anchors_path):
    """Load the anchor boxes from the first line of a text file.

    The line is split on commas, every piece is converted to float and the
    values are returned as an array with two columns.
    """
    with open(anchors_path) as handle:
        first_line = handle.readline()
    values = np.array(list(map(float, first_line.split(','))))
    return values.reshape(-1, 2)
接著我們定義模型,這裡返回兩個模型:其中yolo_model返回DarkNet-19網路輸出的特徵向量,model則是帶有loss函式的模型。
def create_model(anchors, class_names, load_pretrained=True):
    """Build the detection network and the training model that outputs its loss.

    Args:
        anchors: Anchor boxes; their count is passed to ``yolo_body`` and
            the values to ``yolo_loss``.
        class_names: Class names; only their count is used here.
        load_pretrained: When true, load ``model_data/yolo.h5`` and copy the
            weights of everything up to its second-to-last layer into the
            new network.

    Returns:
        A tuple ``(yolo_model, model)``: the network built by ``yolo_body``
        and a model taking the image, boxes, detector-mask and
        matching-boxes inputs whose single output is the ``yolo_loss``
        layer.
    """
    # NOTE(review): presumably a 13x13 grid with 5 anchors per cell for a
    # 416x416 input -- confirm against preprocess_true_boxes.
    detectors_mask_shape = (13, 13, 5, 1)
    matching_boxes_shape = (13, 13, 5, 5)

    image_input = Input(shape=(416, 416, 3))
    boxes_input = Input(shape=(None, 5))
    detectors_mask_input = Input(shape=detectors_mask_shape)
    matching_boxes_input = Input(shape=matching_boxes_shape)

    yolo_model = yolo_body(image_input, len(anchors), len(class_names))

    if load_pretrained:
        yolo_path = os.path.join('model_data', 'yolo.h5')
        pretrained = load_model(yolo_path)
        pretrained_body = Model(pretrained.inputs, pretrained.layers[-2].output)
        # Previously the loaded model was discarded, so the pretrained
        # weights never reached yolo_model. The truncated view below shares
        # its layers with yolo_model, so setting its weights initialises
        # everything except the final layer. Assumes both networks have the
        # same architecture up to that layer.
        topless_yolo = Model(yolo_model.input, yolo_model.layers[-2].output)
        topless_yolo.set_weights(pretrained_body.get_weights())

    # The loss is computed inside the graph by a Lambda layer placed on CPU.
    with tf.device('/cpu:0'):
        model_loss = Lambda(
            yolo_loss,
            output_shape=(1, ),
            name='yolo_loss',
            arguments={'anchors': anchors,
                       'num_classes': len(class_names)})([
                           yolo_model.output, boxes_input,
                           detectors_mask_input, matching_boxes_input
                       ])

    model = Model(
        [yolo_model.input, boxes_input, detectors_mask_input,
         matching_boxes_input], model_loss)
    return yolo_model, model
最後定義訓練函式,具體的引數在程式碼中已給出
def train(model, class_names, anchors, image_data, boxes, detectors_mask, matching_true_boxes, validation_split=0.1):
    """Train the model in three stages, saving the weights after each one.

    Args:
        model: Training model whose output layer is named ``yolo_loss``.
        class_names: Class names, used to rebuild the model for stage 2.
        anchors: Anchor boxes, used to rebuild the model for stage 2.
        image_data: Preprocessed images.
        boxes: Preprocessed ground-truth boxes.
        detectors_mask: Detector masks matching ``boxes``.
        matching_true_boxes: Matched true boxes matching ``boxes``.
        validation_split: Fraction of the data held out for validation in
            every stage.

    Side effects:
        Writes ``trained_stage_1.h5``, ``trained_stage_2.h5``,
        ``trained_stage_3.h5`` and, through the checkpoint callback,
        ``trained_stage_3_best.h5``.
    """
    def passthrough_loss(y_true, y_pred):
        # The model already outputs the loss value, so it is returned as-is
        # and the dummy targets are ignored.
        return y_pred

    inputs = [image_data, boxes, detectors_mask, matching_true_boxes]
    dummy_targets = np.zeros(len(image_data))

    model.compile(optimizer='adam', loss={'yolo_loss': passthrough_loss})

    tensorboard = TensorBoard()
    checkpoint = ModelCheckpoint('trained_stage_3_best.h5', monitor='val_loss',
                                 save_weights_only=True, save_best_only=True)
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0,
                                   patience=15, verbose=1, mode='auto')

    # Stage 1.
    model.fit(inputs,
              dummy_targets,
              validation_split=validation_split,
              batch_size=32,
              epochs=5,
              callbacks=[tensorboard])
    model.save_weights('trained_stage_1.h5')

    # Stage 2: rebuild the model and continue from the stage-1 weights.
    _, model = create_model(anchors, class_names, load_pretrained=False)
    model.load_weights('trained_stage_1.h5')
    model.compile(optimizer='adam', loss={'yolo_loss': passthrough_loss})

    # No epoch count is given here, so fit's default applies.
    # validation_split now follows the argument instead of a hard-coded 0.1.
    model.fit(inputs,
              dummy_targets,
              validation_split=validation_split,
              batch_size=8)
    model.save_weights('trained_stage_2.h5')

    # Stage 3: longer run with best-checkpoint saving and early stopping.
    model.fit(inputs,
              dummy_targets,
              validation_split=validation_split,
              batch_size=8,
              epochs=30,
              callbacks=[tensorboard, checkpoint, early_stopping])
    model.save_weights('trained_stage_3.h5')
最終,結合(一)所說明的程式碼,我們定義整個訓練函式
def main(data_path='C:\\Users\\guesthost\\Desktop\\VOCtrainval_11-May-2012\\pascal_voc_2012.hdf5'):
    """Run the whole pipeline: load, preprocess, build the model, train.

    Args:
        data_path: HDF5 dataset file handed to ``load_data``.
    """
    train_images, train_boxes, class_names, anchors = load_data(data_path)
    print("Loading dataset successful")

    processed_images, processed_boxes = process_data(train_images, train_boxes)
    detectors_mask, matching_true_boxes = get_detector_mask(processed_boxes, anchors)
    print("Process dataset successful")

    # Only the training model is needed here; the bare network is unused.
    _, model = create_model(anchors, class_names, False)
    train(model, class_names, anchors, processed_images, processed_boxes,
          detectors_mask, matching_true_boxes)
三、後記
我在GTX1080,16G的電腦上訓練,依然會出現記憶體不足的現象,由於個人能力欠缺,暫時還沒有什麼好的解決辦法,所以後續還是匯入別人訓練好的引數最好了。