1. 程式人生 > >基於 SSD: Single Shot MultiBox Detector 的人體上下半身檢測

基於 SSD: Single Shot MultiBox Detector 的人體上下半身檢測

基於 SSD 的人體上下半身檢測

這裡主要是通過將訓練資料轉換成 Pascal VOC 資料集格式來實現 SSD 檢測人體上下半身.

由於沒有對人體上下半身進行標註的資料集, 這裡利用 MPII Human Pose Dataset 來將 Pose 資料轉換成上下半身 box 資料, 故box的準確性不一定很高, 但還是可以用來測試學習使用的.

1. Pose to GTbox

將MPII Human Pose Data 轉換為 json 格式 - mpii_single.txt, 其內容如下:

mpii/060111501.jpg|{"PELVIS": [904,237], "THORAX": [858,135], "
NECK": [871.1877,180.4244], "HEAD": [835.8123,58.5756], "R_ANKLE": [980,322], "R_KNEE": [896,318], "R_HIP": [865,248], "L_HIP": [943,226], "L_KNEE": [948,290], "L_ANKLE": [881,349], "R_WRIST": [772,294], "R_ELBOW": [754,247], "R_SHOULDER": [792,147], "L_SHOULDER": [923,123], "L_ELBOW": [995,163], "L_WRIST": [961,223]}
mpii/002058449.jpg|{"PELVIS": [846,351], "THORAX": [738,259], "NECK": [795.2738,314.8937], "HEAD": [597.7262,122.1063], "R_ANKLE": [918,456], "R_KNEE": [659,518], "R_HIP": [713,413], "L_HIP": [979,288], "L_KNEE": [1222,453], "L_ANKLE": [974,399], "R_WRIST": [441,490], "R_ELBOW": [446,434], "R_SHOULDER": [599,270], "
L_SHOULDER": [877,247], "L_ELBOW": [1112,384], "L_WRIST": [1012,489]} mpii/029122914.jpg|{"PELVIS": [332,346], "THORAX": [325,217], "NECK": [326.2681,196.1669], "HEAD": [330.7319,122.8331], "R_ANKLE": [301,473], "R_KNEE": [302,346], "R_HIP": [362,345], "L_HIP": [367,470], "L_KNEE": [275,299], "L_ANKLE": [262,300], "R_WRIST": [278,220], "R_ELBOW": [371,213], "R_SHOULDER": [396,309], "L_SHOULDER": [393,290]} mpii/061185289.jpg|{"PELVIS": [533,322], "THORAX": [515.0945,277.1333], "NECK": [463.9055,148.8667], "HEAD": [353,172], "R_ANKLE": [426,239], "R_KNEE": [513,288], "R_HIP": [552,355]} mpii/013949386.jpg|{"PELVIS": [159,370], "THORAX": [189,228], "NECK": [191.1195,227.0916], "HEAD": [326.8805,168.9084], "R_ANKLE": [110,385], "R_KNEE": [208,355], "R_HIP": [367,363], "L_HIP": [254,429], "L_KNEE": [166,303], "L_ANKLE": [212,153], "R_WRIST": [319,123], "R_ELBOW": [376,39]} ....

定義上下半身關節點:

upper = ['HEAD', 'NECK', 'L_SHOULDER', 'L_ELBOW', 'L_WRIST', 'R_WRIST', 'R_ELBOW', 'R_SHOULDER', 'THORAX']
lower = ['PELVIS', 'L_HIP', 'L_KNEE', 'L_ANKLE', 'R_ANKLE', 'R_KNEE', 'R_HIP']

以關節點影象中的位置, 設定外擴 50 個 畫素,以使得 gtbox 儘可能準確.

get_gtbox.py

#!/usr/bin/env python
import json
import cv2
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import scipy.misc as scm

upper = ['HEAD', 'NECK', 'L_SHOULDER', 'L_ELBOW', 'L_WRIST', 'R_WRIST', 'R_ELBOW', 'R_SHOULDER', 'THORAX']
lower = ['PELVIS', 'L_HIP', 'L_KNEE', 'L_ANKLE', 'R_ANKLE', 'R_KNEE', 'R_HIP']


datas = open('mpii_single.txt').readlines()
print 'Length of datas: ', len(datas)

f = open('mpii_gtbox.txt', 'w')
for data in datas:
    # print data
    datasplit = data.split('|')
    imgname, posedict = datasplit[0], json.loads(datasplit[1])
    img = np.array(Image.open(imgname), dtype=np.uint8)
    height, width, _ = np.shape(img)

    if len(posedict.keys()) == 16: # only joints of full body used to get gtbox 
        x_upper, y_upper = [], []
        for joint in upper:
            x_upper.append(posedict[joint][0])
            y_upper.append(posedict[joint][1])
        upper_x1, upper_y1 = int(max(min(x_upper) - 50, 0)),     int(max(min(y_upper) - 50, 0))
        upper_x2, upper_y2 = int(min(max(x_upper) + 50, width)), int(min(max(y_upper) + 50, height))
        img = cv2.rectangle(img, (upper_x1, upper_y1), (upper_x2, upper_y2), (0, 255, 0), 2)

        x_lower, y_lower = [], []
        for joint in lower:
            x_lower.append(posedict[joint][0])
            y_lower.append(posedict[joint][1])
        lower_x1, lower_y1 = int(max(min(x_lower) - 50, 0)),     int(max(min(y_lower) - 50, 0))
        lower_x2, lower_y2 = int(min(max(x_lower) + 50, width)), int(min(max(y_lower) + 50, height))
        img = cv2.rectangle(img, (lower_x1, lower_y1), (lower_x2, lower_y2), (255, 0, 0), 2)

        tempstr_upper = str(upper_x1) + ',' + str(upper_y1) + ',' + str(upper_x2) + ',' + str(upper_y2) + ',upper'
        tempstr_lower = str(lower_x1) + ',' + str(lower_y1) + ',' + str(lower_x2) + ',' + str(lower_y2) + ',lower'
        tempstr = imgname + '|' + tempstr_upper + '|' + tempstr_lower + '\n'
        f.write(tempstr)
        # plt.imshow(img)
        # plt.show()
f.close()
print 'Done.'

得到的 gtbox 如下:
這裡寫圖片描述
這裡寫圖片描述

2. GTbox - txt2xml

由於Pascal VOC 的 image-xml 的格式, 即一張圖片對應一個 xml 標註資訊, 因此這裡也將得到的 人體上下半身的 gtbox 轉換成 xml 標註的形式.

這裡每張圖片都是有兩個標註資訊的, 上半身 gtbox 和 下半身 gtbox.

txt2xml.py

#! /usr/bin/python
import os
from PIL import Image

datas = open("mpii_gtbox.txt").readlines()

imgpath = "mpii/"
ann_dir = 'gtboxs/'
for data in datas:
    datasplit = datas.split('|')
    img_name = datasplit[0]
    im = Image.open(imgpath + img_name)
    width, height = im.size

    gts = datasplit[1:]
    # write in xml file
    if os.path.exists(ann_dir + os.path.dirname(img_name)):
        pass
    else:
        os.makedirs(ann_dir + os.path.dirname(img_name))
        os.mknod(ann_dir + img_name[:-4] + '.xml')
    xml_file = open((ann_dir + img_name[:-4] + '.xml'), 'w')
    xml_file.write('<annotation>\n')
    xml_file.write('    <folder>gtbox</folder>\n')
    xml_file.write('    <filename>' + img_name + '</filename>\n')
    xml_file.write('    <size>\n')
    xml_file.write('        <width>' + str(width) + '</width>\n')
    xml_file.write('        <height>' + str(height) + '</height>\n')
    xml_file.write('        <depth>3</depth>\n')
    xml_file.write('     </size>\n')

    # write the region of text on xml file
    for img_each_label in gts:
        spt = img_each_label.split(',')
        xml_file.write('    <object>\n')
        xml_file.write('        <name>'+ spt[4].strip() + '</name>\n')
        xml_file.write('        <pose>Unspecified</pose>\n')
        xml_file.write('        <truncated>0</truncated>\n')
        xml_file.write('        <difficult>0</difficult>\n')
        xml_file.write('        <bndbox>\n')
        xml_file.write('            <xmin>' + str(spt[0]) + '</xmin>\n')
        xml_file.write('            <ymin>' + str(spt[1]) + '</ymin>\n')
        xml_file.write('            <xmax>' + str(spt[2]) + '</xmax>\n')
        xml_file.write('            <ymax>' + str(spt[3]) + '</ymax>\n')
        xml_file.write('        </bndbox>\n')
        xml_file.write('    </object>\n')

    xml_file.write('</annotation>')
    xml_file.close() #

print 'Done.'

gtbox - xml 內容格式如:

<annotation>
    <folder>gtbox</folder>
    <filename>mpii/000004812.jpg</filename>
    <size>
        <width>1920</width>
        <height>1080</height>
        <depth>3</depth>
     </size>
    <object>
        <name>upper</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>1408</xmin>
            <ymin>573</ymin>
            <xmax>1848</xmax>
            <ymax>1025</ymax>
        </bndbox>
    </object>
    <object>
        <name>lower</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>1310</xmin>
            <ymin>475</ymin>
            <xmax>1460</xmax>
            <ymax>1042</ymax>
        </bndbox>
    </object>
</annotation>

3. Create LMDB

生成 trainval.txt 和 test.txt, 其內容格式為:

mpii/038796633.jpg gtboxs/038796633.xml
mpii/081305121.jpg gtboxs/081305121.xml
mpii/016047648.jpg gtboxs/016047648.xml
mpii/078242581.jpg gtboxs/078242581.xml
mpii/027364042.jpg gtboxs/027364042.xml
mpii/090828862.jpg gtboxs/090828862.xml
......

labelmap_gtbox.prototxt 定義如下:

item {
  name: "none_of_the_above"
  label: 0
  display_name: "background"
}
item {
  name: "upper"
  label: 1
  display_name: "upper"
}
item {
  name: "lower"
  label: 2
  display_name: "lower"
}

test_name_size.py 來生成 test_name_size.txt:

#! /usr/bin/python

import os
from PIL import Image

img_lists = open('test.txt').readlines()
img_lists = [item.split(' ')[0] for item in img_lists]

test_name_size = open('test_name_size.txt', 'w')

imgpath = "mpii/"
for item in img_lists:
    img = Image.open(imgpath + item)
    width, height = img.size
    temp1, temp2 = os.path.splitext(item)
    test_name_size.write(temp1 + ' ' + str(height) + ' ' + str(width) + '\n')

print 'Done.'

利用 create_data.sh 建立 trainval 和 test 的 lmdb —— gtbox_trainval_lmdb 和 gtbox_test_lmdb.

cur_dir=$(cd $( dirname ${BASH_SOURCE[0]} ) && pwd )
root_dir="mpii/data"est
ssd_dir="/path/to/caffe-ssd"

cd $root_dir

redo=1
data_root_dir="mpii/"
dataset_name="gtbox"
mapfile="$root_dir/labelmap_gtbox.prototxt"
anno_type="detection"
db="lmdb"
min_dim=0
max_dim=0
width=0
height=0

extra_cmd="--encode-type=jpg --encoded"
if [ $redo ]
then
  extra_cmd="$extra_cmd --redo"
fi
for subset in test trainval
do
  python $ssd_dir/scripts/create_annoset.py --anno-type=$anno_type --label-map-file=$mapfile --min-dim=$min_dim --max-dim=$max_dim --resize-width=$width --resize-height=$height --check-label $extra_cmd $data_root_dir $root_dir/$subset.txt $root_dir/$dataset_name/$db/$dataset_name"_"$subset"_"$db ddbox/$dataset_name
done

4. Train/Eval

訓練得到的測試精度接近 90%,還可以.

檢測程式碼 —— ssd_detect.py

#!/usr/bin/env/python
import numpy as np
import matplotlib.pyplot as plt

caffe_root = '/path/to/caffe-ssd/'
import sys
sys.path.insert(0, caffe_root + 'python')

import caffe
caffe.set_device(0)
caffe.set_mode_gpu()

from google.protobuf import text_format
from caffe.proto import caffe_pb2

# load labels
labelmap_file = 'gtbox/labelmap_gtbox.prototxt'
file = open(labelmap_file, 'r')
labelmap = caffe_pb2.LabelMap()
text_format.Merge(str(file.read()), labelmap)

def get_labelname(labelmap, labels):
    num_labels = len(labelmap.item)
    labelnames = []
    if type(labels) is not list:
        labels = [labels]
    for label in labels:
        found = False
        for i in xrange(0, num_labels):
            if label == labelmap.item[i].label:
                found = True
                labelnames.append(labelmap.item[i].display_name)
                break
        assert found == True
    return labelnames


model_def     = 'deploy.prototxt'
model_weights = 'VGG_gtbox_SSD_300x300_iter_120000.caffemodel'
net = caffe.Net(model_def, model_weights, caffe.TEST)

image_resize = 300
net.blobs['data'].reshape(1, 3, image_resize, image_resize)


transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
transformer.set_transpose('data', (2, 0, 1))
transformer.set_mean('data', np.array([104,117,123])) # mean pixel
transformer.set_raw_scale('data', 255)  # the reference model operates on images in [0,255] range instead of [0,1]
transformer.set_channel_swap('data', (2,1,0))  # the reference model has channels in BGR order instead of RGB


image = caffe.io.load_image('images/000000011.jpg')

transformed_image = transformer.preprocess('data', image)
net.blobs['data'].data[...] = transformed_image

# Forward pass.
detections = net.forward()['detection_out']

# Parse the outputs.
det_label = detections[0,0,:,1]
det_conf = detections[0,0,:,2]
det_xmin = detections[0,0,:,3]
det_ymin = detections[0,0,:,4]
det_xmax = detections[0,0,:,5]
det_ymax = detections[0,0,:,6]

# Get detections with confidence higher than 0.6.
top_indices = [i for i, conf in enumerate(det_conf) if conf >= 0.6]

top_conf = det_conf[top_indices]
top_label_indices = det_label[top_indices].tolist()
top_labels = get_labelname(labelmap, top_label_indices)
top_xmin = det_xmin[top_indices]
top_ymin = det_ymin[top_indices]
top_xmax = det_xmax[top_indices]
top_ymax = det_ymax[top_indices]

colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist()

plt.imshow(image)
plt.axis('off')
currentAxis = plt.gca()

for i in xrange(top_conf.shape[0]):
    xmin = int(round(top_xmin[i] * image.shape[1]))
    ymin = int(round(top_ymin[i] * image.shape[0]))
    xmax = int(round(top_xmax[i] * image.shape[1]))
    ymax = int(round(top_ymax[i] * image.shape[0]))
    score = top_conf[i]
    label = int(top_label_indices[i])
    label_name = top_labels[i]
    display_txt = '%s: %.2f'%(label_name, score)
    coords = (xmin, ymin), xmax-xmin+1, ymax-ymin+1
    color = colors[label]
    currentAxis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2))
    currentAxis.text(xmin, ymin, display_txt, bbox={'facecolor':color, 'alpha':0.5})

plt.show()

5. Results

這裡寫圖片描述
這裡寫圖片描述
這裡寫圖片描述

6. Reference