1. 程式人生 > >Yolov2原始碼解析(二)

Yolov2原始碼解析(二)

一、資料集製作

首先從官網下載VOC2012資料集,這裡我下載到的是訓練/驗證集檔案:VOCtrainval_11-May-2012。為了減輕訓練開銷,我將驗證集當作測試集使用:把Main資料夾下的val.txt改名為test.txt,再將資料集製作成hdf5檔案。

import os
import h5py
import numpy as np
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ElementTree


# Object category names; a label's position in this list is the integer
# class id written in front of each stored box.
classes = [
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor",
]

# (year, image-set) pairs; each one names an ImageSets/Main/<image-set>.txt file.
train_set = [('2012', 'train')]
test_set = [('2012', 'test')]
sets_from_2012 = train_set + test_set

# Read the box annotations of one image from its VOC XML file, as
# (class, xmin, ymin, xmax, ymax) records.
def get_boxes_for_id(voc_path, year, image_id, height, width):
    """Return the usable ground-truth boxes of one image as a flat array.

    Objects whose label is not in ``classes`` or whose ``<difficult>`` flag
    is 1 are skipped.

    Args:
        voc_path: Directory that contains ``VOCdevkit``.
        year: Dataset year used in the ``VOC{year}`` folder name.
        image_id: Annotation file name without the ``.xml`` suffix.
        height: Value the y coordinates are divided by.
        width: Value the x coordinates are divided by.

    Returns:
        1-D ``np.ndarray`` holding, for every kept object in document order,
        ``class_index, xmin / width, ymin / height, xmax / width,
        ymax / height``. Empty when no object is kept.
    """
    fname = os.path.join(voc_path, 'VOCdevkit/VOC{}/Annotations/{}.xml'.format(year, image_id))

    # Give ElementTree the path so the XML parser decodes the bytes itself
    # instead of going through the platform's default text encoding.
    root = ElementTree.parse(fname).getroot()
    boxes = []

    for obj in root.iter('object'):
        label = obj.find('name').text
        # A missing or empty <difficult> tag counts as "not difficult".
        difficult = obj.findtext('difficult') or '0'
        if label not in classes or int(difficult) == 1:
            continue
        xml_box = obj.find('bndbox')
        bbox = (classes.index(label), float(xml_box.find('xmin').text) / width,
                float(xml_box.find('ymin').text) / height, float(xml_box.find('xmax').text) / width,
                float(xml_box.find('ymax').text) / height)
        # extend (not append) keeps the result flat: five numbers per box.
        boxes.extend(bbox)
    return np.array(boxes)
# test
'''
voc_path = 'C:/Users/guesthost/Desktop/VOCtrainval_11-May-2012/VOCdevkit/'
year = '2012'
image_id = '2007_000123'
boxes = get_boxes_for_id(voc_path, year, image_id)
'''

def get_image_for_id(voc_path, year, image_id):
    """Load one JPEG and return ``(flattened_pixels, height, width)``.

    The pixel array is flattened in C (row-major) order; height and width
    are the first two entries of the decoded image's shape.
    """
    relative = 'VOCdevkit/VOC{}/JPEGImages/{}.jpg'.format(year, image_id)
    pixels = plt.imread(os.path.join(voc_path, relative))
    height, width = pixels.shape[0], pixels.shape[1]
    return pixels.flatten('C'), height, width

# test
'''
voc_path = 'C:/Users/guesthost/Desktop/VOCtrainval_11-May-2012/VOCdevkit/'
year = '2012'
image_id = '2007_000123'
boxes = get_image_for_id(voc_path, year, image_id)
'''

# Collect the image ids listed in the ImageSets/Main text files.
def get_ids(voc_path, datasets):
    """Return the image ids listed for the given image sets.

    Args:
        voc_path: Directory that contains ``VOCdevkit``.
        datasets: Iterable of ``(year, image_set)`` pairs; each pair selects
            ``VOCdevkit/VOC{year}/ImageSets/Main/{image_set}.txt``.

    Returns:
        List of ids (one per non-blank line, surrounding whitespace removed)
        in file order, with the files concatenated in ``datasets`` order.
    """
    ids = []
    for year, image_set in datasets:
        id_file = os.path.join(voc_path, 'VOCdevkit/VOC{}/ImageSets/Main/{}.txt'.format(
            year, image_set))
        with open(id_file, 'r') as image_ids:
            # Blank lines (e.g. a trailing empty line) would otherwise become
            # an empty id and resolve to a non-existent ".jpg" / ".xml" path.
            stripped = (line.strip() for line in image_ids)
            ids.extend(image_id for image_id in stripped if image_id)
    return ids
# test
# voc_path = 'C:/Users/guesthost/Desktop/VOCtrainval_11-May-2012/'
# ids = get_ids(voc_path, train_set)

def add_to_dataset(voc_path, year, ids, images, boxes, image_size, start = 0):
    """Write pixels, boxes and size of every id into three parallel containers.

    The entry for the k-th id goes to index ``start + k`` of ``images``,
    ``boxes`` and ``image_size``; the size is stored as ``[height, width]``.
    """
    for slot, voc_id in enumerate(ids, start):
        pixels, height, width = get_image_for_id(voc_path, year, voc_id)
        annotations = get_boxes_for_id(voc_path, year, voc_id, height, width)

        images[slot] = pixels
        boxes[slot] = annotations
        image_size[slot] = [height, width]

def main(voc_path):
    """Build ``pascal_voc_2012.hdf5`` inside *voc_path* from the VOC files.

    The file gets a ``train`` and a ``test`` group, each with three
    variable-length datasets indexed by image: ``images`` (uint8 pixels),
    ``boxes`` (float box records) and ``size`` (``[height, width]``).
    The comma-joined class names are stored in the file attribute
    ``classes``.

    Args:
        voc_path: Directory that contains ``VOCdevkit``; also the output
            directory.
    """
    # Read the image ids of the training and test sets.
    train_ids = get_ids(voc_path, train_set)
    test_ids = get_ids(voc_path, test_set)

    # Create the HDF5 structure.
    print('Creating HDF5 dataset structure.')
    fname = os.path.join(voc_path, 'pascal_voc_2012.hdf5')
    uint8_dt = h5py.special_dtype(
        vlen = np.dtype('uint8'))
    vlen_int_dt = h5py.special_dtype(
        vlen=np.dtype(int))
    vlen_float = h5py.special_dtype(
            vlen=np.dtype(float))

    # The context manager closes the file even when processing an image fails.
    with h5py.File(fname, 'w') as voc_h5file:
        train_group = voc_h5file.create_group('train')
        test_group = voc_h5file.create_group('test')
        # np.bytes_ is the name that still exists in NumPy 2.x (np.string_
        # was removed there).
        voc_h5file.attrs['classes'] = np.bytes_(str.join(',', classes))

        # Image pixels, stored as variable-length uint8 arrays.
        train_images = train_group.create_dataset(
            'images', shape=(len(train_ids), ), dtype=uint8_dt)
        test_images = test_group.create_dataset(
            'images', shape=(len(test_ids), ), dtype=uint8_dt)

        # Per-image class, xmin, ymin, xmax, ymax records.
        train_boxes = train_group.create_dataset(
            'boxes', shape=(len(train_ids), ), dtype=vlen_float)
        test_boxes = test_group.create_dataset(
            'boxes', shape=(len(test_ids), ), dtype=vlen_float)

        # Per-image height and width.
        train_size = train_group.create_dataset(
            'size', shape=(len(train_ids), ), dtype=vlen_int_dt)
        test_size = test_group.create_dataset(
            'size', shape = (len(test_ids), ), dtype=vlen_int_dt)

        print('Processing Pascal VOC 2012 datasets for training set.')
        add_to_dataset(voc_path, '2012', train_ids, train_images,
                       train_boxes, train_size)
        print('Processing Pascal VOC 2012 test set.')
        add_to_dataset(voc_path, '2012', test_ids, test_images, test_boxes, test_size)
        print('Closing HDF5 file.')
    print('Done.')
    
# Script entry point: build the HDF5 file from the VOC tree at this
# hard-coded location (edit the path to match the local download).
if __name__ == '__main__':
    voc_path = 'C:/Users/guesthost/Desktop/VOCtrainval_11-May-2012/'
    main(voc_path)

這裡將voc_path資料夾下的影象資訊和boxes資訊記錄在h5檔案中,最後生成pascal_voc_2012.hdf5檔案。

 

二、訓練

首先是讀取資料集。由於之前製作資料集時使用的是h5檔案格式,這裡用h5py讀取。

def load_data(data_path,
              classes_path = 'C:\\Users\\guesthost\\Desktop\\zc\\yoloV2\\voc_classes.txt',
              anchors_path = 'C:\\Users\\guesthost\\Desktop\\zc\\yoloV2\\voc_anchor_box.txt'):
    """Load the training split, class names and anchors.

    Args:
        data_path: HDF5 file with a ``train`` group holding ``images``,
            ``boxes`` and ``size`` datasets.
        classes_path: Text file passed to ``get_classes``.
        anchors_path: Text file passed to ``get_anchors``.

    Returns:
        ``(train_images, train_boxes, class_names, anchors)`` where
        ``train_images`` is the list produced by ``process_images`` and
        ``train_boxes`` is the content of the ``boxes`` dataset read into
        memory.
    """
    # Open read-only and close the file once everything has been copied out.
    with h5py.File(data_path, 'r') as h5_file:
        train_data = h5_file['train']
        train_size = train_data['size']
        # Read the boxes now; a dataset handle is unusable after the file closes.
        train_boxes = train_data['boxes'][:]
        # The stored images are flattened, so restore their original shape.
        # process_images needs both the pixel dataset and the sizes.
        train_images = process_images(train_data['images'], train_size)
    class_names = get_classes(classes_path)
    anchors = get_anchors(anchors_path)
    return train_images, train_boxes, class_names, anchors

 

由於在資料集上我們對影象資料進行flatten,所以這裡需要根據記錄影象的尺寸對矩陣進行恢復:

def process_images(train_images,train_size):
    """Reshape every flattened image back to ``(height, width, 3)``.

    ``train_size[i]`` supplies ``(height, width)`` for ``train_images[i]``;
    the result is a list with one reshaped array per entry of ``train_size``.
    """
    return [
        np.reshape(train_images[idx], (dims[0], dims[1], 3))
        for idx, dims in enumerate(train_size)
    ]

 

之後對影象矩陣資訊進行處理,包括調整影象大小、資料型別轉換與標準化。針對boxes,這裡將座標資訊轉化成中心點與寬高,並表示為相對於整張圖片的比值。同時記錄單張影象的最大目標數,將目標數不足的圖片以零填充至該最大值。

def process_data(images, boxes = None):
    """Prepare images, and optionally their ground-truth boxes, for training.

    Args:
        images: Non-empty sequence of image arrays accepted by
            ``PIL.Image.fromarray``.
        boxes: Optional sequence of arrays, one per image, each reshapeable
            to ``(-1, 5)`` with columns ``class, x_min, y_min, x_max, y_max``.

    Returns:
        The images resized to 416x416 and scaled to ``[0, 1]`` as one float
        array. When ``boxes`` is given, a tuple of that array and a box array
        whose rows are ``x_center, y_center, width, height, class``, with
        every image zero-padded to the largest box count.
    """
    images = [PIL.Image.fromarray(i) for i in images]
    # NOTE(review): every box below is divided by the size of images[0] only.
    # If the incoming boxes are already normalised, or the images differ in
    # size, this scaling is wrong — confirm against the dataset producer.
    orig_size = np.array([images[0].width, images[0].height])
    orig_size = np.expand_dims(orig_size, axis = 0)

    # Resize, convert to float and scale the pixel values.
    processed_images = [i.resize((416, 416), PIL.Image.BICUBIC) for i in images]
    # np.float was removed from NumPy; np.float64 is the type it aliased.
    processed_images = [np.array(image, dtype = np.float64) for image in processed_images]
    processed_images = [image / 255. for image in processed_images]

    if boxes is None:
        return np.array(processed_images)

    # boxes = [class, x_min, y_min, x_max, y_max]
    boxes = [box.reshape((-1, 5)) for box in boxes]
    boxes_xy = [0.5 * (box[:, 3:5] + box[:, 1:3]) for box in boxes]
    boxes_wh = [box[:, 3:5] - box[:, 1:3] for box in boxes]
    boxes_xy = [boxxy / orig_size for boxxy in boxes_xy]
    boxes_wh = [boxwh / orig_size for boxwh in boxes_wh]

    boxes = [np.concatenate((boxes_xy[i], boxes_wh[i], box[:, 0:1]), axis = 1) for i, box in enumerate(boxes)]

    # Images contain different numbers of objects; pad each box array with
    # zero rows up to the largest count so they stack into one array.
    max_boxes = max((boxz.shape[0] for boxz in boxes), default = 0)

    for i, boxz in enumerate(boxes):
        if boxz.shape[0] < max_boxes:
            zero_padding = np.zeros((max_boxes - boxz.shape[0], 5), dtype = np.float32)
            boxes[i] = np.vstack((boxz, zero_padding))

    return np.array(processed_images), np.array(boxes)

 

獲取訓練所需的一些引數資訊

def get_detector_mask(boxes, anchors):
    """Run ``preprocess_true_boxes`` on every image's boxes.

    Each call uses the fixed image size ``[416, 416]`` and returns a pair;
    the first items and the second items are collected separately and
    returned as two arrays.
    """
    masks = []
    matches = []
    for per_image_boxes in boxes:
        mask, match = preprocess_true_boxes(per_image_boxes, anchors, [416, 416])
        masks.append(mask)
        matches.append(match)
    return np.array(masks), np.array(matches)
def get_classes(classes_path):
    """Load the class names: one per line, surrounding whitespace removed."""
    with open(classes_path) as handle:
        raw_lines = handle.readlines()
    return [line.strip() for line in raw_lines]
def get_anchors(anchors_path):
    """Load the anchor boxes from the first line of a text file.

    The line is a comma-separated list of numbers; they are returned as a
    float array reshaped to two columns.
    """
    with open(anchors_path) as handle:
        first_line = handle.readline()
    values = np.array([float(token) for token in first_line.split(',')])
    return np.reshape(values, (-1, 2))

 

接著我們定義模型。這裡返回兩個模型:yolo_model輸出DarkNet-19網路的特徵向量,model則是帶有loss函式的模型。

def create_model(anchors, class_names, load_pretrained = True):
    """Build the detection network and a training wrapper around it.

    Args:
        anchors: Anchor boxes; ``len(anchors)`` is passed to ``yolo_body``
            and the anchors themselves to ``yolo_loss``.
        class_names: Class names; only their count is used.
        load_pretrained: When true, copy the weights of
            ``model_data/yolo.h5`` (all layers except the last) into the
            new network.

    Returns:
        ``(yolo_model, model)``: the network returned by ``yolo_body`` and a
        model that takes the image, the boxes, the detector mask and the
        matching boxes and outputs the ``yolo_loss`` value.
    """
    detectors_mask_shape = (13, 13, 5, 1)
    matching_boxes_shape = (13, 13, 5, 5)
    image_input = Input(shape = (416, 416, 3))
    boxes_input = Input(shape = (None, 5))

    detectors_mask_input = Input(shape = detectors_mask_shape)
    matching_boxes_input = Input(shape = matching_boxes_shape)

    yolo_model = yolo_body(image_input, len(anchors), len(class_names))

    if load_pretrained:
        yolo_path = os.path.join('model_data', 'yolo.h5')
        pretrained = load_model(yolo_path)
        pretrained_body = Model(pretrained.inputs, pretrained.layers[-2].output)
        # Previously the loaded model was built and then dropped, so the
        # returned network kept its fresh initialisation. Copy the weights
        # through a view of yolo_model that stops before its last layer; the
        # view reuses yolo_model's layers, so yolo_model is updated too.
        # NOTE(review): assumes yolo.h5 has the same architecture as
        # yolo_body apart from the final layer — confirm before relying on it.
        new_body = Model(yolo_model.input, yolo_model.layers[-2].output)
        new_body.set_weights(pretrained_body.get_weights())

    # The loss is computed inside the graph by a Lambda layer placed on the CPU.
    with tf.device('/cpu:0'):
        model_loss = Lambda(
            yolo_loss,
            output_shape=(1, ),
            name='yolo_loss',
            arguments={'anchors': anchors,
                       'num_classes': len(class_names)})([
                           yolo_model.output, boxes_input,
                           detectors_mask_input, matching_boxes_input
                       ])
    model = Model(
        [yolo_model.input, boxes_input, detectors_mask_input,
         matching_boxes_input], model_loss)
    return yolo_model, model

 

最後定義訓練函式,具體的引數在程式碼中已給出

def train(model, class_names, anchors, image_data, boxes, detectors_mask, matching_true_boxes, validation_split = 0.1):
    """Train in three stages, saving the weights after each one.

    Stage 1 fits the given ``model`` (batch size 32, 5 epochs) and writes
    ``trained_stage_1.h5``. A fresh model is then created, loaded with those
    weights and fitted twice with batch size 8: once with the default epoch
    count (``trained_stage_2.h5``), then for up to 30 epochs with
    checkpointing and early stopping (``trained_stage_3.h5``; the best
    weights by ``val_loss`` go to ``trained_stage_3_best.h5``).

    Args:
        model: Model whose output named ``yolo_loss`` is the loss value.
        class_names: Passed to ``create_model`` for the later stages.
        anchors: Passed to ``create_model`` for the later stages.
        image_data: Image input array.
        boxes: Box input array.
        detectors_mask: Detector-mask input array.
        matching_true_boxes: Matching-boxes input array.
        validation_split: Fraction of the data held out for validation in
            every stage.
    """
    def passthrough_loss(y_true, y_pred):
        # The model's output already is the loss, so the targets are ignored.
        return y_pred

    inputs = [image_data, boxes, detectors_mask, matching_true_boxes]
    # Dummy targets; only their length matters.
    targets = np.zeros(len(image_data))

    model.compile(optimizer = 'adam',
                  loss = {'yolo_loss': passthrough_loss})

    logging = TensorBoard()
    checkpoint = ModelCheckpoint('trained_stage_3_best.h5', monitor = 'val_loss',
                                 save_weights_only = True, save_best_only = True)
    early_stopping = EarlyStopping(monitor = 'val_loss', min_delta = 0, patience = 15, verbose = 1, mode = 'auto')

    model.fit(inputs,
              targets,
              validation_split = validation_split,
              batch_size = 32,
              epochs = 5,
              callbacks = [logging])
    model.save_weights('trained_stage_1.h5')

    # Continue from the stage-1 weights on a newly built model.
    _, model = create_model(anchors, class_names, load_pretrained = False)
    model.load_weights('trained_stage_1.h5')
    model.compile(optimizer = 'adam',
                  loss = {'yolo_loss': passthrough_loss})

    # Stages 2 and 3 used a hard-coded 0.1 here, silently ignoring the
    # caller's validation_split; the default keeps the old value.
    model.fit(inputs,
              targets,
              validation_split = validation_split,
              batch_size = 8)
    model.save_weights('trained_stage_2.h5')
    model.fit(inputs,
              targets,
              validation_split = validation_split,
              batch_size = 8,
              epochs = 30,
              callbacks = [logging, checkpoint, early_stopping])
    model.save_weights('trained_stage_3.h5')

 

最終,結合(一)所說明的程式碼,我們定義整個訓練函式

def main():
    """Run the whole pipeline: load, preprocess, build the model, train."""
    data_file = 'C:\\Users\\guesthost\\Desktop\\VOCtrainval_11-May-2012\\pascal_voc_2012.hdf5'
    raw_images, raw_boxes, class_names, anchors = load_data(data_file)
    print("Loading dataset successful")

    images, boxes = process_data(raw_images, raw_boxes)
    mask, true_boxes = get_detector_mask(boxes, anchors)
    print("Process dataset successful")

    # Only the loss-wrapped model is needed for training.
    _, model = create_model(anchors, class_names, False)
    train(model, class_names, anchors, images, boxes, mask, true_boxes)

 

 

三、後記

我在GTX1080,16G的電腦上訓練,依然會出現記憶體不足的現象,由於個人能力欠缺,暫時還沒有什麼好的解決辦法,所以後續還是匯入別人訓練好的引數最好了。