基於 SSD: Single Shot MultiBox Detector 的人體上下半身檢測
阿新 • • 發佈:2019-01-31
基於 SSD 的人體上下半身檢測
這裡主要是通過將訓練資料轉換成 Pascal VOC 資料集格式來實現 SSD 檢測人體上下半身.
由於沒有對人體上下半身進行標註的資料集, 這裡利用 MPII Human Pose Dataset 來將 Pose 資料轉換成上下半身 box 資料, 故box的準確性不一定很高, 但還是可以用來測試學習使用的.
1. Pose to GTbox
將 MPII Human Pose Data 的標註轉換為文字檔案 mpii_single.txt, 每行格式為 "圖片路徑|關節點座標的 JSON 字典", 其內容如下:
mpii/060111501.jpg|{"PELVIS": [904,237], "THORAX": [858,135], "NECK": [871.1877,180.4244], "HEAD": [835.8123,58.5756], "R_ANKLE": [980,322], "R_KNEE": [896,318], "R_HIP": [865,248], "L_HIP": [943,226], "L_KNEE": [948,290], "L_ANKLE": [881,349], "R_WRIST": [772,294], "R_ELBOW": [754,247], "R_SHOULDER": [792,147], "L_SHOULDER": [923,123], "L_ELBOW": [995,163], "L_WRIST": [961,223]}
mpii/002058449.jpg|{"PELVIS": [846,351], "THORAX": [738,259], "NECK": [795.2738,314.8937], "HEAD": [597.7262,122.1063], "R_ANKLE": [918,456], "R_KNEE": [659,518], "R_HIP": [713,413], "L_HIP": [979,288], "L_KNEE": [1222,453], "L_ANKLE": [974,399], "R_WRIST": [441,490], "R_ELBOW": [446,434], "R_SHOULDER": [599,270], "L_SHOULDER": [877,247], "L_ELBOW": [1112,384], "L_WRIST": [1012,489]}
mpii/029122914.jpg|{"PELVIS": [332,346], "THORAX": [325,217], "NECK": [326.2681,196.1669], "HEAD": [330.7319,122.8331], "R_ANKLE": [301,473], "R_KNEE": [302,346], "R_HIP": [362,345], "L_HIP": [367,470], "L_KNEE": [275,299], "L_ANKLE": [262,300], "R_WRIST": [278,220], "R_ELBOW": [371,213], "R_SHOULDER": [396,309], "L_SHOULDER": [393,290]}
mpii/061185289.jpg|{"PELVIS": [533,322], "THORAX": [515.0945,277.1333], "NECK": [463.9055,148.8667], "HEAD": [353,172], "R_ANKLE": [426,239], "R_KNEE": [513,288], "R_HIP": [552,355]}
mpii/013949386.jpg|{"PELVIS": [159,370], "THORAX": [189,228], "NECK": [191.1195,227.0916], "HEAD": [326.8805,168.9084], "R_ANKLE": [110,385], "R_KNEE": [208,355], "R_HIP": [367,363], "L_HIP": [254,429], "L_KNEE": [166,303], "L_ANKLE": [212,153], "R_WRIST": [319,123], "R_ELBOW": [376,39]}
....
定義上下半身關節點:
upper = ['HEAD', 'NECK', 'L_SHOULDER', 'L_ELBOW', 'L_WRIST', 'R_WRIST', 'R_ELBOW', 'R_SHOULDER', 'THORAX']
lower = ['PELVIS', 'L_HIP', 'L_KNEE', 'L_ANKLE', 'R_ANKLE', 'R_KNEE', 'R_HIP']
以各關節點在影象中的位置為基準, 取其最小外接矩形並向四周外擴 50 個畫素 (超出影象邊界時截斷), 以使得 gtbox 儘可能完整覆蓋上/下半身.
get_gtbox.py
#!/usr/bin/env python
import json
import cv2
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import scipy.misc as scm
# Joint names that make up the upper-body and lower-body boxes.
upper = ['HEAD', 'NECK', 'L_SHOULDER', 'L_ELBOW', 'L_WRIST', 'R_WRIST', 'R_ELBOW', 'R_SHOULDER', 'THORAX']
lower = ['PELVIS', 'L_HIP', 'L_KNEE', 'L_ANKLE', 'R_ANKLE', 'R_KNEE', 'R_HIP']

# Pixels added on every side of the tightest box around a joint group.
MARGIN = 50
# A pose is only converted when this many joints are present in its dict.
NUM_JOINTS = 16


def joints_to_box(posedict, joints, width, height, margin=MARGIN):
    """Return the (x1, y1, x2, y2) box enclosing ``joints``.

    The box is the tightest rectangle around the listed joints, grown by
    ``margin`` pixels on every side and clamped to ``[0, width]`` horizontally
    and ``[0, height]`` vertically. Coordinates are truncated to ``int``.

    Raises ``KeyError`` if a joint in ``joints`` is missing from ``posedict``.
    """
    xs = [posedict[joint][0] for joint in joints]
    ys = [posedict[joint][1] for joint in joints]
    x1 = int(max(min(xs) - margin, 0))
    y1 = int(max(min(ys) - margin, 0))
    x2 = int(min(max(xs) + margin, width))
    y2 = int(min(max(ys) + margin, height))
    return x1, y1, x2, y2


def format_box(box, label):
    """Serialize a box as ``x1,y1,x2,y2,label``."""
    return ','.join(str(value) for value in box) + ',' + label


def main(show=False):
    """Convert mpii_single.txt poses into mpii_gtbox.txt upper/lower boxes.

    Each output line is ``imgname|<upper box>|<lower box>``. Poses that do not
    carry all ``NUM_JOINTS`` joints are skipped. When ``show`` is true the
    boxes are drawn on the image and displayed (debugging aid).
    """
    with open('mpii_single.txt') as src:
        datas = src.readlines()
    print('Length of datas: %d' % len(datas))

    with open('mpii_gtbox.txt', 'w') as f:
        for data in datas:
            # Skip blank or malformed lines that lack the "image|json" separator.
            if '|' not in data:
                continue
            imgname, posejson = data.split('|', 1)
            # Tolerate stray whitespace around joint names (e.g. " NECK").
            posedict = dict((key.strip(), value)
                            for key, value in json.loads(posejson).items())
            if len(posedict) != NUM_JOINTS:  # only joints of full body used to get gtbox
                continue

            # Image.size works for every image mode (greyscale, RGBA, ...),
            # unlike unpacking a 3-tuple from the pixel array's shape.
            im = Image.open(imgname)
            width, height = im.size

            upper_box = joints_to_box(posedict, upper, width, height)
            lower_box = joints_to_box(posedict, lower, width, height)

            if show:
                img = np.array(im.convert('RGB'), dtype=np.uint8)
                img = cv2.rectangle(img, upper_box[:2], upper_box[2:], (0, 255, 0), 2)
                img = cv2.rectangle(img, lower_box[:2], lower_box[2:], (255, 0, 0), 2)
                plt.imshow(img)
                plt.show()

            f.write(imgname + '|' + format_box(upper_box, 'upper')
                    + '|' + format_box(lower_box, 'lower') + '\n')
    print('Done.')


if __name__ == '__main__':
    main()
得到的 gtbox 如下:
2. GTbox - txt2xml
由於Pascal VOC 的 image-xml 的格式, 即一張圖片對應一個 xml 標註資訊, 因此這裡也將得到的 人體上下半身的 gtbox 轉換成 xml 標註的形式.
這裡每張圖片都是有兩個標註資訊的, 上半身 gtbox 和 下半身 gtbox.
txt2xml.py
#! /usr/bin/python
import os
from PIL import Image
# NOTE(review): the image names written by get_gtbox.py already start with
# "mpii/" (see the <filename> in the sample XML), so images are opened from
# "mpii/mpii/..." here -- confirm this matches the on-disk layout.
imgpath = "mpii/"
ann_dir = 'gtboxs/'


def build_annotation_xml(img_name, width, height, gts):
    """Render one Pascal-VOC style annotation document as a string.

    ``gts`` is an iterable of ``"xmin,ymin,xmax,ymax,label"`` strings; one
    ``<object>`` element is emitted per entry. The label is stripped so a
    trailing newline on the last field does not leak into ``<name>``.
    """
    parts = [
        '<annotation>\n',
        ' <folder>gtbox</folder>\n',
        ' <filename>' + img_name + '</filename>\n',
        ' <size>\n',
        ' <width>' + str(width) + '</width>\n',
        ' <height>' + str(height) + '</height>\n',
        ' <depth>3</depth>\n',
        ' </size>\n',
    ]
    # write the region of each box
    for img_each_label in gts:
        spt = img_each_label.split(',')
        parts.extend([
            ' <object>\n',
            ' <name>' + spt[4].strip() + '</name>\n',
            ' <pose>Unspecified</pose>\n',
            ' <truncated>0</truncated>\n',
            ' <difficult>0</difficult>\n',
            ' <bndbox>\n',
            ' <xmin>' + str(spt[0]) + '</xmin>\n',
            ' <ymin>' + str(spt[1]) + '</ymin>\n',
            ' <xmax>' + str(spt[2]) + '</xmax>\n',
            ' <ymax>' + str(spt[3]) + '</ymax>\n',
            ' </bndbox>\n',
            ' </object>\n',
        ])
    parts.append('</annotation>')
    return ''.join(parts)


def main():
    """Write one XML annotation file per line of mpii_gtbox.txt."""
    with open("mpii_gtbox.txt") as src:
        datas = src.readlines()

    for data in datas:
        if not data.strip():
            continue
        # Split the current line (the original split the whole list, which
        # raised AttributeError on the first iteration).
        datasplit = data.split('|')
        img_name = datasplit[0]
        im = Image.open(imgpath + img_name)
        width, height = im.size
        gts = datasplit[1:]

        # splitext copes with extensions of any length, unlike img_name[:-4].
        xml_path = ann_dir + os.path.splitext(img_name)[0] + '.xml'
        xml_dir = os.path.dirname(xml_path)
        if not os.path.isdir(xml_dir):
            os.makedirs(xml_dir)

        # open(..., 'w') creates the file itself; the former os.mknod call was
        # redundant and failed when the XML already existed from an earlier run.
        with open(xml_path, 'w') as xml_file:
            xml_file.write(build_annotation_xml(img_name, width, height, gts))
    print('Done.')


if __name__ == '__main__':
    main()
gtbox - xml 內容格式如:
<annotation>
<folder>gtbox</folder>
<filename>mpii/000004812.jpg</filename>
<size>
<width>1920</width>
<height>1080</height>
<depth>3</depth>
</size>
<object>
<name>upper</name>
<pose>Unspecified</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>1408</xmin>
<ymin>573</ymin>
<xmax>1848</xmax>
<ymax>1025</ymax>
</bndbox>
</object>
<object>
<name>lower</name>
<pose>Unspecified</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>1310</xmin>
<ymin>475</ymin>
<xmax>1460</xmax>
<ymax>1042</ymax>
</bndbox>
</object>
</annotation>
3. Create LMDB
生成 trainval.txt 和 test.txt, 其內容格式為:
mpii/038796633.jpg gtboxs/038796633.xml
mpii/081305121.jpg gtboxs/081305121.xml
mpii/016047648.jpg gtboxs/016047648.xml
mpii/078242581.jpg gtboxs/078242581.xml
mpii/027364042.jpg gtboxs/027364042.xml
mpii/090828862.jpg gtboxs/090828862.xml
......
labelmap_gtbox.prototxt 定義如下:
# Label map with three entries: a background entry plus the two box classes.
# The "upper"/"lower" names are the same strings written into the <name>
# elements of the XML annotations.
# Label 0: background entry.
item {
name: "none_of_the_above"
label: 0
display_name: "background"
}
# Label 1: upper-body boxes.
item {
name: "upper"
label: 1
display_name: "upper"
}
# Label 2: lower-body boxes.
item {
name: "lower"
label: 2
display_name: "lower"
}
test_name_size.py 來生成 test_name_size.txt:
#! /usr/bin/python
import os
from PIL import Image
imgpath = "mpii/"


def format_name_size(item, width, height):
    """Return the ``"<name without extension> <height> <width>\\n"`` line for one image."""
    name = os.path.splitext(item)[0]
    return name + ' ' + str(height) + ' ' + str(width) + '\n'


def main():
    """Write test_name_size.txt from the image paths listed in test.txt."""
    # The first whitespace-separated field of each line is the image path;
    # blank lines are skipped so they are not mistaken for image names.
    with open('test.txt') as src:
        img_lists = [line.split()[0] for line in src if line.strip()]

    # The with-block guarantees the output is flushed and closed.
    with open('test_name_size.txt', 'w') as test_name_size:
        for item in img_lists:
            img = Image.open(imgpath + item)
            width, height = img.size
            test_name_size.write(format_name_size(item, width, height))
    print('Done.')


if __name__ == '__main__':
    main()
利用 create_data.sh 建立 trainval 和 test 的 lmdb —— gtbox_trainval_lmdb 和 gtbox_test_lmdb.
#!/bin/bash
# Build the trainval and test LMDBs for the "gtbox" dataset by calling
# create_annoset.py from the caffe-ssd checkout once per subset.

cur_dir=$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
root_dir="mpii/data"
ssd_dir="/path/to/caffe-ssd"

# Stop if the data directory is missing, then make root_dir absolute so the
# "$root_dir/..." paths below stay valid after the directory change.
cd "$root_dir" || exit 1
root_dir=$(pwd)

# Set redo=1 to pass --redo to create_annoset.py, 0 to omit it.
redo=1
data_root_dir="mpii/"
dataset_name="gtbox"
mapfile="$root_dir/labelmap_gtbox.prototxt"
anno_type="detection"
db="lmdb"
min_dim=0
max_dim=0
width=0
height=0

extra_cmd="--encode-type=jpg --encoded"
# Compare numerically: a bare [ $redo ] is true for any non-empty value, 0 included.
if [ "$redo" -eq 1 ]
then
  extra_cmd="$extra_cmd --redo"
fi

for subset in test trainval
do
  # $extra_cmd is left unquoted on purpose so it splits into separate options.
  # NOTE(review): the last argument ("ddbox/...") is presumably the example
  # directory expected by create_annoset.py -- confirm.
  python "$ssd_dir/scripts/create_annoset.py" \
    --anno-type="$anno_type" \
    --label-map-file="$mapfile" \
    --min-dim="$min_dim" \
    --max-dim="$max_dim" \
    --resize-width="$width" \
    --resize-height="$height" \
    --check-label $extra_cmd \
    "$data_root_dir" \
    "$root_dir/$subset.txt" \
    "$root_dir/$dataset_name/$db/${dataset_name}_${subset}_$db" \
    "ddbox/$dataset_name"
done
4. Train/Eval
訓練得到的測試精度接近 90%,還可以.
檢測程式碼 —— ssd_detect.py
#!/usr/bin/env python
import numpy as np
import matplotlib.pyplot as plt
caffe_root = '/path/to/caffe-ssd/'
import sys
# The caffe-ssd python bindings must be on sys.path before "import caffe".
sys.path.insert(0, caffe_root + 'python')
import caffe
caffe.set_device(0)
caffe.set_mode_gpu()
from google.protobuf import text_format
from caffe.proto import caffe_pb2
# load labels
labelmap_file = 'gtbox/labelmap_gtbox.prototxt'
labelmap = caffe_pb2.LabelMap()
# Read the label map inside a with-block so the handle is closed, and avoid
# binding the name "file", which shadows the Python 2 builtin.
with open(labelmap_file, 'r') as labelmap_fh:
    text_format.Merge(str(labelmap_fh.read()), labelmap)
def get_labelname(labelmap, labels):
    """Return the display names for ``labels`` looked up in ``labelmap``.

    ``labelmap`` is an object whose ``item`` sequence holds entries with
    ``label`` and ``display_name`` attributes. ``labels`` is a list of label
    values, or a single value which is treated as a one-element list. The
    result has one name per input label, in input order; the first entry whose
    ``label`` compares equal wins.

    Raises ``AssertionError`` if a label has no entry in ``labelmap``.
    """
    if not isinstance(labels, list):
        labels = [labels]
    labelnames = []
    for label in labels:
        for item in labelmap.item:
            if label == item.label:
                labelnames.append(item.display_name)
                break
        else:
            # Raised explicitly (rather than via ``assert``) so the check is
            # not stripped under ``python -O``; a silently dropped label would
            # misalign names with their detections.
            raise AssertionError('label %r not found in labelmap' % (label,))
    return labelnames
# ---- Load the trained SSD network -------------------------------------
model_def = 'deploy.prototxt'
model_weights = 'VGG_gtbox_SSD_300x300_iter_120000.caffemodel'
net = caffe.Net(model_def, model_weights, caffe.TEST)

# ---- Configure the input blob and its preprocessing --------------------
image_resize = 300
net.blobs['data'].reshape(1, 3, image_resize, image_resize)

transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
transformer.set_transpose('data', (2, 0, 1))
transformer.set_mean('data', np.array([104, 117, 123]))  # mean pixel
# the reference model operates on images in [0,255] range instead of [0,1]
transformer.set_raw_scale('data', 255)
# the reference model has channels in BGR order instead of RGB
transformer.set_channel_swap('data', (2, 1, 0))

# ---- Run one image through the network ----------------------------------
image = caffe.io.load_image('images/000000011.jpg')
net.blobs['data'].data[...] = transformer.preprocess('data', image)
detections = net.forward()['detection_out']

# ---- Keep detections with confidence >= 0.6 -----------------------------
# Columns read from each detection row:
# 1 = label, 2 = confidence, 3..6 = xmin, ymin, xmax, ymax.
rows = detections[0, 0]
confident = rows[rows[:, 2] >= 0.6]

top_conf = confident[:, 2]
top_label_indices = confident[:, 1].tolist()
top_labels = get_labelname(labelmap, top_label_indices)
top_xmin = confident[:, 3]
top_ymin = confident[:, 4]
top_xmax = confident[:, 5]
top_ymax = confident[:, 6]

# ---- Draw the kept boxes on the image -----------------------------------
colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist()

plt.imshow(image)
plt.axis('off')
currentAxis = plt.gca()

# Box corners are multiplied by the image size to obtain pixel coordinates.
img_h, img_w = image.shape[0], image.shape[1]
for label_value, label_name, score, x0, y0, x1, y1 in zip(
        top_label_indices, top_labels, top_conf,
        top_xmin, top_ymin, top_xmax, top_ymax):
    xmin = int(round(x0 * img_w))
    ymin = int(round(y0 * img_h))
    xmax = int(round(x1 * img_w))
    ymax = int(round(y1 * img_h))

    color = colors[int(label_value)]
    display_txt = '%s: %.2f' % (label_name, score)

    box = plt.Rectangle((xmin, ymin), xmax - xmin + 1, ymax - ymin + 1,
                        fill=False, edgecolor=color, linewidth=2)
    currentAxis.add_patch(box)
    currentAxis.text(xmin, ymin, display_txt, bbox={'facecolor': color, 'alpha': 0.5})

plt.show()
5. Results