1. 程式人生 > >caffe1原始碼解析從入門到放棄1):記憶體管理syncedmem.hpp / syncedmem.cpp

caffe1原始碼解析從入門到放棄1):記憶體管理syncedmem.hpp / syncedmem.cpp

/*這些程式碼都是本人在linux-nsight-eclipse環境下純手打。
  文章結尾都會丟擲一些本人尚未解決的問題,歡迎各路大神拍磚。
  文章屬於學習交流性質,隨著本人學力的提升,此blog將會長期修正更新。
 * syncedmem.hpp
 *  Created on: Jun 4, 2017
 *      Author: pan
 */
#ifndef SYNCEDMEM_HPP_
#define SYNCEDMEM_HPP_
#include <cstdlib>
#include "caffe/common.hpp"
/*定義了caffe名稱空間,內部封裝了caffe所有的類和方法,
 * eg:using namespace caffe / using namespace std*/
namespace caffe { // If CUDA is available and in GPU mode, host memory will be allocated pinned, // using cudaMallocHost. It avoids dynamic pinning for transfers (DMA). // The improvement in performance seems negligible in the single GPU case, // but might be more significant for parallel training. Most importantly,
// it improved stability for large models on many GPUs. /*如果主機支援CUDA並且工作在GPU模式下,主機記憶體將會 allocated(分配) pinned, 使用cudaMallocHost(). * 它避免了dynamic pinning for transfers (DMA).在單GPU情況下使用cudaMallocHost(),這個操作在效能 * 上的提高看起來幾乎可以忽視。但是在多GPU並行訓練的情況下,cudaMallocHost()可能會顯的更重要。最重要的是, * cudaMallocHost()的使用提高了在多GPU環境下大模型的穩定性。 * caffe工作在GPU模式下使用cudaMallocHost()在主機上分配記憶體將會比使用malloc()方法有效能和穩定性的提高。 */
/*在主機上分配記憶體,CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);方法使用二級指標 cpu_ptr_分配記憶體*/ inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) { #ifndef CPU_ONLY if(Caffe::mode() == Caffe::GPU) { CUDA_CHECK(cudaMallocHost(ptr, size));//***************** *use_cuda = true; return ;//在void型別的函式中,return用於返回空,不是返回 0 值 } #endif /*這裡分配了size個位元組的記憶體,由於使用的是void*最後要強制型別轉換成特定型別的 * 指標eg: static_cast<int*> cpu_ptr_ 。這點在Blob中會詳細陳述*/ *ptr = malloc(size); *use_cuda = false; CHECK(*ptr)<<"host allocation of size "<< size <<" failed";//********************** } /*記憶體釋放方法,由於在cuda環境下有兩種主機分配記憶體的方法,所以在這裡做了一個巨集定義處理,分別是 cudaFreeHost()和 free()*/ inline void CaffeFreeHost(void* ptr, bool use_cuda) { #ifndef CPU_ONLY if(use_cuda) { CUDA_CHECK(cudaFreeHost(ptr));//*************** return ; } #endif free(ptr); } /** * @brief Manages memory allocation and synchronization between the host (CPU) * and device (GPU). * * TODO(dox): more thorough description. */ /* *SyncedMemory類 @簡單的用於在主機(CPU)和 裝置(GPU)之間進行記憶體分配和同步工作,也就是說在CPU和GPU *之間管理記憶體。 *TODO(dox): more thorough description. 
* */ class SyncedMemory { public: /*建構函式將初始化各種指標*/ SyncedMemory() : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), own_cpu_data_(false), cpu_malloc_use_cuda_(false),own_gpu_data_(false), gpu_device_(-1){} /*建構函式將初始化各種指標 * explicit 表示建構函式不接受隱式轉換 eg: ********************/ explicit SyncedMemory(size_t size) : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false), gpu_device_(-1){} /*解構函式中定義了釋放堆區記憶體的操作,在caffe的資料容器Blob中,定義了shared_ptr<syncedmemory> data_ * 定義了shared_ptr<syncedmemory> diff_ 的智慧指標,通過reset方法控制記憶體的釋放。由於nvcc編譯器 * 對C++11支援的不好,暫且不能夠使用unique_ptr智慧指標,目前只能呼叫boost庫的shared_ptr*/ ~SyncedMemory(){}; public: /*cpu_data()和gpu_data()返回值為const void* 表示cpu_ptr_和gpu_ptr_所指向的記憶體空間不允許被修改 * 與此相反void* mutable_cpu_data() 和 void* mutable_gpu_data(); 返回的是void* 的指標,也即記憶體返回的 * 記憶體空間是允許修改的*/ const void* cpu_data(); void set_cpu_data(void* data); const void* gpu_data(); void set_gpu_data(void* data); void* mutable_cpu_data(); void* mutable_gpu_data(); /*此處定義了一個列舉型別SyncedHead主要作用是標誌頭指標狀態,其中SYNCED表示記憶體已經同步*/ enum SyncedHead {UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED}; SyncedHead head() {return head_;} //size_t是標準C庫中定義的,應為unsigned int,在64位系統中為 long unsigned int size_t size() {return size_;} #ifndef CPU_ONLY void async_gpu_push(const cudaStream_t& stream);//***************** #endif private: SyncedHead head_;//頭指標位置 /*控制記憶體同步的方法,如果head在cpu上執行to_cpu()表示記憶體已經同步,否則要呼叫caffe_gpu_memcpy()方法 * 實質上呼叫的是cudaMemcpy(Y, X, N, cudaMemcpyDefault),caffe_gpu_memcpy()做了一層封裝而已。 * 同理to_gpu()*/ void to_cpu(); void to_gpu(); void* cpu_ptr_; void* gpu_ptr_; size_t size_; bool own_cpu_data_; bool cpu_malloc_use_cuda_; bool own_gpu_data_; int gpu_device_; DISABLE_COPY_AND_ASSIGN(SyncedMemory);//*************** };// class SyncedMemory };//namespace caffe #endif /* SYNCEDMEM_HPP_ */ --------------------------------------------- --------------------------------------------- 
--------------------------------------------- /* * syncedmem.cpp * * Created on: Jun 4, 2017 * Author: pan */ #include "common.hpp" #include "syncedmem.hpp" #include "util/math_functions.hpp" namespace caffe { SyncedMemory::~SyncedMemory() { /*cpu_ptr_不為NULL,不能釋放NULL指標, own_cpu_data_標誌位不為 0這個標誌位不知道如何理解 ??????????????????*/ if(cpu_ptr_ && own_cpu_data_) { CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_); } #ifndef CPU_ONLY if(gpu_ptr_ && own_gpu_data_) { int initial_device; cudaGetDevice(&initial_device); if (gpu_device_ != -1) { CUDA_CHECK(cudaSetDevice(gpu_device_));//???????????????? } CUDA_CHECK(cudaFree(gpu_ptr_)); cudaSetDevice(initial_device); } #endif } //同步記憶體到CPU 即設定cpu_ptr_ inline void SyncedMemory::to_cpu() { switch (head_) { case UNINITIALIZED: CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_); caffe_memset(size_, 0, cpu_ptr_); head_ = HEAD_AT_CPU; own_cpu_data_ = true; break; case HEAD_AT_GPU: #ifndef CPU_ONLY//Makefile.config中定義 if (cpu_ptr_ == NULL) { CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_); own_cpu_data_ = true; } caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_); head_ = SYNCED; #else NO_GPU;//Makefile.config中定義 #endif break; case HEAD_AT_CPU://頭指標指向CPU記憶體已經同步 case SYNCED: break; } }//to_cpu() //同步記憶體到CPU 即設定gpu_ptr_ inline void SyncedMemory::to_gpu() { #ifndef CPU_ONLY switch (head_) { case UNINITIALIZED://???????????????????/ head_ = HEAD_AT_GPU; own_gpu_data_ = true; break; case HEAD_AT_CPU: if(gpu_ptr_ == NULL) { CUDA_CHECK(cudaGetDevice(&gpu_device_)); CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); own_gpu_data_ = true; } caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_); head_ = SYNCED; break; case HEAD_AT_GPU: case SYNCED: break; } #else NO_GPU #endif }//to_gpu() //獲取cpu 堆區記憶體頭指標 const void* SyncedMemory::cpu_data() { to_cpu(); return (const void*)cpu_ptr_; } void SyncedMemory::set_cpu_data(void* data) { CHECK(data);//??????????????????????? //????if(data == NULL) return -1;????????????????? 
if(own_cpu_data_) { CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_); } cpu_ptr_ = data; head_ = HEAD_AT_CPU; own_cpu_data_ = false; } const void* SyncedMemory::gpu_data() { #ifndef CPU_ONLY to_gpu(); return (const void*)gpu_ptr_; #else NO_GPU; return NULL; #endif } void SyncedMemory::set_gpu_data(void* data) { #ifndef CPU_ONLY CHECK(data); if (own_gpu_data_) { int initial_device; cudaGetDevice(&initial_device); if (gpu_device_ != -1) { CUDA_CHECK(cudaSetDevice(gpu_device_)); } CUDA_CHECK(cudaFree(gpu_ptr_)); cudaSetDevice(initial_device); } gpu_ptr_ = data; head_ = HEAD_AT_GPU; own_gpu_data_ = false; #else NO_GPU; #endif } void* SyncedMemory::mutable_cpu_data() { to_cpu(); head_ = HEAD_AT_CPU; return cpu_ptr_; } void* SyncedMemory::mutable_gpu_data() { #ifndef CPU_ONLY to_gpu(); head_ = HEAD_AT_GPU; return gpu_ptr_; #else NO_GPU; return NULL; #endif } #ifndef CPU_ONLY void SyncedMemory::async_gpu_push(const cudaStream_t& stream) { CHECK(head_ == HEAD_AT_CPU); if (gpu_ptr_ == NULL) { CUDA_CHECK(cudaGetDevice(&gpu_device_)); CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); own_gpu_data_ = true; } const cudaMemcpyKind put = cudaMemcpyHostToDevice; CUDA_CHECK(cudaMemcpyAsync(gpu_ptr_, cpu_ptr_, size_, put, stream)); // Assume caller will synchronize on the stream before use head_ = SYNCED; } #endif };//namespace caffe
自己寫的測試程式碼分析建構函式和解構函式的行為
/*
 * caffe.cpp
 *
 *  Created on: Jun 5, 2017
 *      Author: pan
 */
#include <iostream>
#include <climits>
#include <cstdlib>
#include <boost/shared_ptr.hpp>
using namespace std;
using boost::shared_ptr;
// inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda)

// Allocate `size` bytes of host memory into *ptr. CPU-only stand-in for
// caffe's CaffeMallocHost (the CUDA branch is stubbed out below).
// Bug fixed: the failure check used to test `ptr` — the address of the
// caller's pointer, which is never NULL — instead of `*ptr`, malloc's result.
inline void CaffeMallocoHost(void** ptr, size_t size)
{
// #ifndef CPU_ONLY
//    cudaMallocHost(ptr, size);
//      void* ptr = (void*)(new char[size]);
    *ptr = malloc(size);
    if(*ptr == NULL)
    {
    std::cout<<"malloc error in fuction CaffeMallocoHost !"<<std::endl;
    }
}

// Release a host buffer obtained from CaffeMallocoHost(). Always emits a
// trace marker; the actual free (plus its trace line) happens only for a
// non-NULL pointer.
inline void CaffeFreeHost(void* ptr)
{
  std::cout << ">>>>>>>>>>>>>CaffeFreeHost";
  if (ptr == NULL)
    return;
  free(ptr);
  std::cout << ">>>>>>>>>>>now free cpu_ptr_ " << std::endl;
}

// Minimal mock of caffe's SyncedMemory used to study constructor/destructor
// behaviour. Only the CPU path is functional; the gpu_* members are
// placeholders kept to mirror the real class's layout.
class synced
{
public:
  // size: number of bytes to allocate lazily on first cpu_data();
  // num: instance tag printed in the constructor/destructor traces.
  // Note: the initializer list now follows the member declaration order
  // (the old order triggered -Wreorder; members initialize in declaration
  // order regardless of how the list is written).
  synced(size_t size, int num)
      : cpu_ptr_(NULL), own_cpu_data_(false), gpu_ptr_(NULL),
        own_gpu_data_(false), cpu_malloc_use_cuda_(0), size_(size),
        num_(num)
  {
    std::cout << "constructor " << num_ << " called !\n";
  }

  ~synced()
  {
    // Safe even if cpu_data() was never called: cpu_ptr_ stays NULL and
    // CaffeFreeHost() skips the free for a NULL pointer.
    CaffeFreeHost(cpu_ptr_);
    std::cout << "destructor " << num_ << " called!\n";
  }

  // Lazily allocate the host buffer. Bug fixed: the old version allocated
  // unconditionally, so each cpu_data() call leaked the previous buffer;
  // now allocation happens only on first use.
  void to_cpu()
  {
    if (cpu_ptr_ == NULL)
    {
      CaffeMallocoHost(&cpu_ptr_, size_);
    }
  }

  // Return the host buffer, allocating it on first use.
  void* cpu_data()
  {
    to_cpu();
    return cpu_ptr_;
  }

private:
  void* cpu_ptr_;
  bool own_cpu_data_;           // unused in this mock; mirrors SyncedMemory
  void* gpu_ptr_;               // placeholder, never allocated
  bool own_gpu_data_;           // unused in this mock
  bool cpu_malloc_use_cuda_;    // unused in this mock
  size_t size_;

  int num_;                     // instance tag for trace output
};


// Drive the synced mock: construct it through a boost::shared_ptr, touch the
// lazily allocated buffer, and let the destructor run when the smart pointer
// goes out of scope at the end of main.
int main()
{
  shared_ptr<synced> data(new synced(10 * sizeof(int), 1));
  int* buf = static_cast<int*>(data->cpu_data());
  buf[9] = 10;  // last valid element of the 10-int buffer
  return 0;
}
丟擲問題: caffe記憶體管理如何使用 new/delete 形式重寫 CaffeMallocHost() 與 CaffeFreeHost()?主要困難是 C++ 中
如何分配並釋放一個 void*。提示:`delete` 一個 `void*` 是未定義行為(編譯器不知道要呼叫哪個解構函式、釋放多少位元組);
若要分配未型別化的原始記憶體,可以配對使用 `::operator new(size)` 與 `::operator delete(ptr)`,
或像 caffe 一樣保留 malloc()/free(),也可以分配 `new char[size]` 並以 `delete[] static_cast<char*>(ptr)` 釋放。