
OpenPose training code (1)

OpenPose itself is quite sprawling: it bundles body pose estimation, hand keypoint estimation, face keypoint extraction, and even 3D pose, all built as a wrapping layer on top of caffe. If we actually want to use it, though, much of that is unnecessary, e.g. OpenPose's multithreading, the GUI, and so on; we only need to focus on the core pieces.
Here we only care about the body keypoint estimation in OpenPose. As the previous post showed, Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields is essentially CPM (CVPR 2016) plus PAFs. Inference is straightforward: extract keypoint candidates, score candidate limbs with a line integral over the PAFs, then assign the keypoints to groups (i.e., decide which ones belong to the same person) to obtain multi-person poses.
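To make the grouping step concrete, here is a minimal sketch (my own illustration, not code from the paper or the repo) of the PAF line-integral score: for two candidate keypoints, sample the predicted 2D vector field along the segment between them and average its dot product with the segment's unit direction; a high score means the two candidates very likely belong to the same limb of the same person.

import numpy as np

def paf_limb_score(paf_x, paf_y, p1, p2, n_samples=10):
    # paf_x, paf_y: the two predicted channels (H x W arrays) of one limb's PAF
    # p1, p2: (x, y) pixel coordinates of two candidate keypoints
    p1 = np.asarray(p1, dtype=float)
    p2 = np.asarray(p2, dtype=float)
    direction = p2 - p1
    norm = np.linalg.norm(direction)
    if norm < 1e-6:
        return 0.0
    direction = direction / norm
    # sample the field at evenly spaced points on the segment p1 -> p2
    xs = np.round(np.linspace(p1[0], p2[0], n_samples)).astype(int)
    ys = np.round(np.linspace(p1[1], p2[1], n_samples)).astype(int)
    sampled = np.stack([paf_x[ys, xs], paf_y[ys, xs]], axis=1)
    # mean projection onto the limb direction = a discretized line integral
    return float(np.mean(sampled.dot(direction)))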

The training code mostly comes down to data preparation and data loading, spread across a few files.
Data-loading files:

cpm_data_layer.cpp
cpm_data_transformer.cpp

Data-preparation files:

genCOCOMask.m
genJSON.m
genLMDB.py
getANNO.m

cpm_data_layer and cpm_data_transformer are both implemented inside caffe. To make sense of those two files, we first have to look at how the data preparation works. Here I only cover how the LMDB file is generated, since everything else is fairly simple and can be read on your own (generating the LMDB is simple enough too, but the author's code for this part is somewhat messy and takes some patience to untangle).
In genLMDB.py, the preprocessed data is written into an LMDB. The core is the function writeLMDB, which packs each sample row by row and plane by plane (a "plane" here can be understood as a channel, since the reader later walks them with pointer offsets):

import json
import os
import struct

import cv2
import lmdb
import numpy as np

import caffe


def float2bytes(floats):
    # helper defined earlier in genLMDB.py: pack a float or a list of
    # floats into a string of 4-byte (native-endian) IEEE floats
    if type(floats) is float:
        floats = [floats]
    return struct.pack('%sf' % len(floats), *floats)


def writeLMDB(datasets, lmdb_path, validation):
    # create an empty directory for the LMDB first; it needs roughly 140 GB
    env = lmdb.open(lmdb_path, map_size=int(1e12))
    txn = env.begin(write=True)
    data = []
    numSample = 0

    for d in range(len(datasets)):
        if datasets[d] == "MPI":
            print datasets[d]
            with open('MPI.json') as data_file:
                data_this = json.load(data_file)
                data_this = data_this['root']
                data = data + data_this
            numSample = len(data)
            print numSample
        elif datasets[d] == "COCO":
            # read the JSON annotation file
            print datasets[d]
            with open('dataset/COCO/json/COCO.json') as data_file:
                data_this = json.load(data_file)
                data_this = data_this['root']
                data = data + data_this
            numSample = len(data)
            print numSample

    random_order = np.random.permutation(numSample).tolist()
    isValidationArray = [data[i]['isValidation'] for i in range(numSample)]
    if validation == 1:
        totalWriteCount = isValidationArray.count(0.0)
    else:
        totalWriteCount = len(data)
    print totalWriteCount
    writeCount = 0

    for count in range(numSample):
        idx = random_order[count]
        if data[idx]['isValidation'] != 0 and validation == 1:
            print '%d/%d skipped' % (count, idx)
            continue

        if "MPI" in data[idx]['dataset']:
            path_header = 'dataset/MPI/images/'
        elif "COCO" in data[idx]['dataset']:
            path_header = '/proj/Sunjiarui/fcm_pose_train/training/dataset/COCO/images/'
        img = cv2.imread(os.path.join(path_header, data[idx]['img_paths']))
        img_idx = data[idx]['img_paths'][-16:-3]

        # mask_all / mask_miss exist because some (small) people are present
        # in the image but carry no annotation; they must be masked out
        if "COCO_val" in data[idx]['dataset']:
            mask_all = cv2.imread(path_header + 'mask2014/val2014_mask_all_' + img_idx + 'png', 0)
            mask_miss = cv2.imread(path_header + 'mask2014/val2014_mask_miss_' + img_idx + 'png', 0)
        elif "COCO" in data[idx]['dataset']:
            mask_all = cv2.imread(path_header + 'mask2014/train2014_mask_all_' + img_idx + 'png', 0)
            mask_miss = cv2.imread(path_header + 'mask2014/train2014_mask_miss_' + img_idx + 'png', 0)
        elif "MPI" in data[idx]['dataset']:
            img_idx = data[idx]['img_paths'][-13:-3]
            mask_miss = cv2.imread('dataset/MPI/masks/mask_' + img_idx + 'jpg', 0)

        height = img.shape[0]
        width = img.shape[1]
        if width < 64:
            # pad only the width; height is left untouched to keep all information
            img = cv2.copyMakeBorder(img, 0, 0, 0, 64 - width,
                                     cv2.BORDER_CONSTANT, value=(128, 128, 128))
            cv2.imwrite('padded_img.jpg', img)
            width = 64

        # the metadata is packed into one extra uint8 channel, one row at a time
        meta_data = np.zeros(shape=(height, width, 1), dtype=np.uint8)
        clidx = 0  # current line (row) index
        # dataset name (string)
        for i in range(len(data[idx]['dataset'])):
            meta_data[clidx][i] = ord(data[idx]['dataset'][i])
        clidx = clidx + 1
        # image height (float), image width (float)
        height_binary = float2bytes(data[idx]['img_height'])
        for i in range(len(height_binary)):
            meta_data[clidx][i] = ord(height_binary[i])
        width_binary = float2bytes(data[idx]['img_width'])
        for i in range(len(width_binary)):
            meta_data[clidx][4 + i] = ord(width_binary[i])
        clidx = clidx + 1
        # (a) isValidation (uint8), numOtherPeople (uint8), people_index (uint8),
        #     annolist_index (float), writeCount (float), totalWriteCount (float)
        meta_data[clidx][0] = data[idx]['isValidation']
        meta_data[clidx][1] = data[idx]['numOtherPeople']
        meta_data[clidx][2] = data[idx]['people_index']
        annolist_index_binary = float2bytes(data[idx]['annolist_index'])
        for i in range(len(annolist_index_binary)):  # bytes 3..6
            meta_data[clidx][3 + i] = ord(annolist_index_binary[i])
        count_binary = float2bytes(float(writeCount))  # note: writeCount, not count!
        for i in range(len(count_binary)):
            meta_data[clidx][7 + i] = ord(count_binary[i])
        totalWriteCount_binary = float2bytes(float(totalWriteCount))
        for i in range(len(totalWriteCount_binary)):
            meta_data[clidx][11 + i] = ord(totalWriteCount_binary[i])
        nop = int(data[idx]['numOtherPeople'])
        clidx = clidx + 1
        # (b) objpos_x (float), objpos_y (float)
        objpos_binary = float2bytes(data[idx]['objpos'])
        for i in range(len(objpos_binary)):
            meta_data[clidx][i] = ord(objpos_binary[i])
        clidx = clidx + 1
        # (c) scale_provided (float)
        scale_provided_binary = float2bytes(data[idx]['scale_provided'])
        for i in range(len(scale_provided_binary)):
            meta_data[clidx][i] = ord(scale_provided_binary[i])
        clidx = clidx + 1
        # (d) joint_self (3*16 floats, 3 rows)
        joints = np.asarray(data[idx]['joint_self']).T.tolist()  # transpose to 3*16
        for i in range(len(joints)):
            row_binary = float2bytes(joints[i])
            for j in range(len(row_binary)):
                meta_data[clidx][j] = ord(row_binary[j])
            clidx = clidx + 1
        # (e) check nop and normalize the "other people" fields to lists
        if nop != 0:
            if nop == 1:
                joint_other = [data[idx]['joint_others']]
                objpos_other = [data[idx]['objpos_other']]
                scale_provided_other = [data[idx]['scale_provided_other']]
            else:
                joint_other = data[idx]['joint_others']
                objpos_other = data[idx]['objpos_other']
                scale_provided_other = data[idx]['scale_provided_other']
            # (f) objpos_other_x (float), objpos_other_y (float) (nop rows)
            for i in range(nop):
                objpos_binary = float2bytes(objpos_other[i])
                for j in range(len(objpos_binary)):
                    meta_data[clidx][j] = ord(objpos_binary[j])
                clidx = clidx + 1
            # (g) scale_provided_other (nop floats in one row)
            scale_provided_other_binary = float2bytes(scale_provided_other)
            for j in range(len(scale_provided_other_binary)):
                meta_data[clidx][j] = ord(scale_provided_other_binary[j])
            clidx = clidx + 1
            # (h) joint_others (3*16 floats, nop*3 rows)
            for n in range(nop):
                joints = np.asarray(joint_other[n]).T.tolist()  # transpose to 3*16
                for i in range(len(joints)):
                    row_binary = float2bytes(joints[i])
                    for j in range(len(row_binary)):
                        meta_data[clidx][j] = ord(row_binary[j])
                    clidx = clidx + 1
        # the metadata occupies 7 + 4*nop rows in total

        # the channel order written here must be remembered exactly: the C++
        # reader recovers each part via pointer offsets
        if "COCO" in data[idx]['dataset']:
            img4ch = np.concatenate((img, meta_data, mask_miss[..., None], mask_all[..., None]), axis=2)
        elif "MPI" in data[idx]['dataset']:
            img4ch = np.concatenate((img, meta_data, mask_miss[..., None]), axis=2)
        img4ch = np.transpose(img4ch, (2, 0, 1))

        datum = caffe.io.array_to_datum(img4ch, label=0)
        key = '%07d' % writeCount
        txn.put(key, datum.SerializeToString())
        if writeCount % 1000 == 0:
            txn.commit()
            txn = env.begin(write=True)
        print '%d/%d/%d/%d' % (count, writeCount, idx, numSample)
        writeCount = writeCount + 1

    txn.commit()
    env.close()
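Before moving on, it is worth sanity-checking what was written. The snippet below is my own sketch (not part of the repo): it reads one sample back and splits it into the channels written above, assuming the 6-channel COCO layout of image (3) + metadata (1) + mask_miss (1) + mask_all (1).

import lmdb
import caffe

env = lmdb.open('dataset/COCO/lmdb', readonly=True)  # example path
with env.begin() as txn:
    datum = caffe.proto.caffe_pb2.Datum()
    datum.ParseFromString(txn.get('0000000'))
    arr = caffe.io.datum_to_array(datum)  # shape (6, height, width)
    img = arr[0:3].transpose(1, 2, 0)     # the BGR image
    meta = arr[3]                         # the packed metadata rows
    mask_miss, mask_all = arr[4], arr[5]
    # row 0 of the metadata holds the dataset name as raw characters
    print ''.join(chr(c) for c in meta[0] if c != 0)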

After the Python code above has run, the LMDB file needed for training exists. To actually consume it, a new caffe data layer has to be written; for background on caffe data layers, see my earlier post: http://mp.blog.csdn.net/mdeditor/77987504
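For orientation, the layer is hooked into the training prototxt roughly as follows. This is a sketch from memory of the released training prototxt, using the hyperparameters referenced later in this post, so treat the exact field set as an assumption and check your own copy:

layer {
  name: "data"
  type: "CPMData"
  top: "data"
  top: "label"
  data_param {
    source: "lmdb"        # path to the LMDB generated above
    batch_size: 10
    backend: LMDB
  }
  cpm_transform_param {
    stride: 8             # label resolution = crop size / stride
    crop_size_x: 368
    crop_size_y: 368
    num_parts: 56         # 18 keypoint heatmaps + 38 PAF channels
  }
}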

Now for cpm_data_layer and cpm_data_transformer. cpm_data_layer is essentially just layer setup; the real data transformation all happens in cpm_data_transformer.
First, the setup function of cpm_data_layer (I may have tweaked a few small details of the code):

template <typename Dtype>
void CPMDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  cpm_data_transformer_.reset(
     new CPMDataTransformer<Dtype>(cpm_transform_param_, this->phase_));
  cpm_data_transformer_->InitRand();


  // Read a data point, and use it to initialize the top blob.
  Datum& datum = *(reader_.full().peek());
  LOG(INFO) << datum.height() << " " << datum.width() << " " << datum.channels();

  bool force_color = this->layer_param_.data_param().force_encoded_color();
  if ((force_color && DecodeDatum(&datum, true)) ||
      DecodeDatumNative(&datum)) {
    LOG(INFO) << "Decoding Datum";
  }

  // image
  const int crop_size = this->layer_param_.cpm_transform_param().crop_size();
  const int batch_size = this->layer_param_.data_param().batch_size();
  if (crop_size > 0) {
    // top[0]->Reshape(batch_size, datum.channels(), crop_size, crop_size);
    // for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
    //   this->prefetch_[i].data_.Reshape(batch_size, datum.channels(), crop_size, crop_size);
    // }
    // //this->transformed_data_.Reshape(1, 4, crop_size, crop_size);
    // this->transformed_data_.Reshape(1, 6, crop_size, crop_size);
  } 
  else {
    const int height = this->phase_ != TRAIN ? datum.height() :
      this->layer_param_.cpm_transform_param().crop_size_y();
    const int width = this->phase_ != TRAIN ? datum.width() :
      this->layer_param_.cpm_transform_param().crop_size_x();
    LOG(INFO) << "PREFETCH_COUNT is " << this->PREFETCH_COUNT;  // asynchronously if to GPU memory
    top[0]->Reshape(batch_size, datum.channels(), height, width);
    for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
      this->prefetch_[i].data_.Reshape(batch_size, datum.channels(), height, width);   // 10,6,368,368
    }
    //this->transformed_data_.Reshape(1, 4, height, width);
    this->transformed_data_.Reshape(1, datum.channels(), height, width);  // 1,6,368,368
  }
  LOG(INFO) << "output data size: " << top[0]->num() << ","              
      << top[0]->channels() << "," << top[0]->height() << ","
      << top[0]->width();   // 10,6,368,368

  // label
  if (this->output_labels_) {
    const int stride = this->layer_param_.cpm_transform_param().stride();  // 8, important
    const int height = this->phase_ != TRAIN ? datum.height() :
      this->layer_param_.cpm_transform_param().crop_size_y();
    const int width = this->phase_ != TRAIN ? datum.width() :
      this->layer_param_.cpm_transform_param().crop_size_x();

    int num_parts = this->layer_param_.cpm_transform_param().num_parts();  // 56
    top[1]->Reshape(batch_size, 2*(num_parts+1), height/stride, width/stride);
    for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
      this->prefetch_[i].label_.Reshape(batch_size, 2*(num_parts+1), height/stride, width/stride);  // 10,114,46,46
    }
    this->transformed_label_.Reshape(1, 2*(num_parts+1), height/stride, width/stride);  // 1,114,46,46
  }
}

This function mostly just reads a few hyperparameters and pins down the shapes of the output blobs.
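With the numbers quoted in the comments above (batch size 10, a 368×368 crop, stride 8, 56 parts), the shapes work out as follows. The factor 2*(num_parts+1) is worth pausing on: as far as I can tell from the transformer code, half of the 114 label channels are per-pixel loss weights derived from mask_miss and the other half are the actual heatmap/PAF targets, the +1 being the background channel.

batch_size = 10
crop_size = 368       # crop_size_x == crop_size_y
stride = 8
num_parts = 56        # 18 keypoint heatmaps + 38 PAF channels

grid = crop_size // stride            # 368 / 8 = 46
label_channels = 2 * (num_parts + 1)  # 114: one weight plane per target plane
print (batch_size, label_channels, grid, grid)  # (10, 114, 46, 46)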
The key piece is the load_batch function, of which here is an excerpt:

    // Apply data transformations (mirror, scale, crop...)
    timer.Start();
    const int offset_data = batch->data_.offset(item_id);
    const int offset_label = batch->label_.offset(item_id);
    this->transformed_data_.set_cpu_data(top_data + offset_data);
    this->transformed_label_.set_cpu_data(top_label + offset_label);
    if (datum.encoded()) {
      this->cpm_data_transformer_->Transform(cv_img, &(this->transformed_data_));
    } else {
      this->cpm_data_transformer_->Transform_nv(datum, 
        &(this->transformed_data_),
        &(this->transformed_label_), cnt);
      ++cnt;
    }
    // if (this->output_labels_) {
    //   top_label[item_id] = datum.label();
    // }
    trans_time += timer.MicroSeconds();
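The heavy lifting is hidden inside Transform_nv: it parses the packed metadata channel back out of the datum, applies the augmentations (scale, rotation, crop-pad, flip), and rasterizes the heatmap and PAF ground truth into transformed_label_. That transformer is what the next part will walk through.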