caffe原始碼解析從入門到放棄(1):記憶體管理 syncedmem.hpp / syncedmem.cpp
阿新 • • 發佈:2019-02-01
/*這些程式碼都是本人在linux-nsight-eclipse環境下純手打。
文章結尾都會丟擲一些本人尚未解決的問題,歡迎各路大神拍磚。
文章屬於學習交流性質,隨著本人學力的提升,此blog將會長期修正更新。
* syncedmem.hpp
* Created on: Jun 4, 2017
* Author: pan
*/
#ifndef SYNCEDMEM_HPP_
#define SYNCEDMEM_HPP_
#include <cstdlib>
#include "caffe/common.hpp"
/*定義了caffe名稱空間,內部封裝了caffe所有的類和方法,
* eg:using namespace caffe / using namespace std*/
namespace caffe
{
// If CUDA is available and in GPU mode, host memory will be allocated pinned,
// using cudaMallocHost. It avoids dynamic pinning for transfers (DMA).
// The improvement in performance seems negligible in the single GPU case,
// but might be more significant for parallel training. Most importantly,
// it improved stability for large models on many GPUs.
/*如果主機支援CUDA並且工作在GPU模式下,主機記憶體將會 allocated(分配) pinned, 使用cudaMallocHost().
* 它避免了dynamic pinning for transfers (DMA).在單GPU情況下使用cudaMallocHost(),這個操作在效能
* 上的提高看起來幾乎可以忽視。但是在多GPU並行訓練的情況下,cudaMallocHost()可能會顯的更重要。最重要的是,
* cudaMallocHost()的使用提高了在多GPU環境下大模型的穩定性。
* caffe工作在GPU模式下使用cudaMallocHost()在主機上分配記憶體將會比使用malloc()方法有效能和穩定性的提高。
*/
/* Allocates `size` bytes of host memory into *ptr (a pointer-to-pointer out
 * parameter). In GPU mode on CUDA builds the memory is page-locked via
 * cudaMallocHost, avoiding dynamic pinning for DMA transfers; *use_cuda
 * records which allocator was used so CaffeFreeHost can release it with the
 * matching deallocator. In CPU mode (or CPU_ONLY builds) plain malloc is
 * used. The caller casts the void* to the element type it needs
 * (e.g. static_cast<float*>). */
inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda)
{
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU)
  {
    // Pinned (page-locked) allocation; CUDA_CHECK aborts on any CUDA error.
    CUDA_CHECK(cudaMallocHost(ptr, size));
    *use_cuda = true;
    return;  // void function: bare return, no value
  }
#endif
  // CPU path: ordinary heap allocation, checked for failure via glog CHECK.
  *use_cuda = false;
  *ptr = malloc(size);
  CHECK(*ptr) << "host allocation of size " << size << " failed";
}
/* Releases host memory obtained from CaffeMallocHost. The use_cuda flag
 * selects the matching deallocator -- cudaFreeHost for pinned memory,
 * free for plain malloc'd memory -- because mixing allocator/deallocator
 * pairs is undefined behaviour. */
inline void CaffeFreeHost(void* ptr, bool use_cuda)
{
#ifndef CPU_ONLY
  if (use_cuda)
  {
    CUDA_CHECK(cudaFreeHost(ptr));
    return;
  }
#endif
  free(ptr);
}
/**
* @brief Manages memory allocation and synchronization between the host (CPU)
* and device (GPU).
*
* TODO(dox): more thorough description.
*/
/*
*SyncedMemory類 @簡單的用於在主機(CPU)和 裝置(GPU)之間進行記憶體分配和同步工作,也就是說在CPU和GPU
*之間管理記憶體。
*TODO(dox): more thorough description.
* */
/**
 * @brief Manages memory allocation and synchronization between the host (CPU)
 *        and device (GPU).
 *
 * head_ records which side currently holds the freshest copy of the buffer;
 * memory is allocated lazily on first access and copied only when the two
 * sides are out of sync. A Blob holds its data/diff through
 * shared_ptr<SyncedMemory>.
 */
class SyncedMemory
{
public:
  // Default constructor: empty buffer (size 0), nothing allocated yet.
  SyncedMemory()
      : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
        own_cpu_data_(false), cpu_malloc_use_cuda_(false),
        own_gpu_data_(false), gpu_device_(-1) {}
  // explicit forbids the implicit conversion size_t -> SyncedMemory
  // (e.g. `SyncedMemory m = 10;` will not compile).
  explicit SyncedMemory(size_t size)
      : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
        own_cpu_data_(false), cpu_malloc_use_cuda_(false),
        own_gpu_data_(false), gpu_device_(-1) {}
  // Declaration only: the destructor is defined out-of-line in syncedmem.cpp,
  // where it frees any CPU/GPU memory this object owns. Defining an empty
  // inline body here (`~SyncedMemory(){};`, as the original did) clashes with
  // that out-of-line definition (redefinition error) -- and an empty body
  // alone would leak the buffers.
  ~SyncedMemory();
public:
  // cpu_data()/gpu_data() return const void*: the pointed-to memory must not
  // be modified through them. mutable_cpu_data()/mutable_gpu_data() return
  // plain void* and additionally mark that side as holding the newest copy.
  const void* cpu_data();
  void set_cpu_data(void* data);
  const void* gpu_data();
  void set_gpu_data(void* data);
  void* mutable_cpu_data();
  void* mutable_gpu_data();
  // Synchronization state of the two copies; SYNCED means both sides agree.
  enum SyncedHead {UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED};
  SyncedHead head() {return head_;}
  // size_t is the unsigned size type from the C standard library
  // (64 bits wide on LP64 systems).
  size_t size() {return size_;}
#ifndef CPU_ONLY
  // Asynchronous host->device copy on the given stream (CUDA builds only).
  void async_gpu_push(const cudaStream_t& stream);
#endif
private:
  // Move the freshest copy to the named side, allocating and copying as
  // needed (copies go through caffe_gpu_memcpy, a cudaMemcpy wrapper).
  void to_cpu();
  void to_gpu();
  // NOTE: members are declared in the same order the constructors initialize
  // them; C++ always constructs members in declaration order, so a mismatch
  // (as in the original) draws -Wreorder and misleads readers.
  void* cpu_ptr_;             // host buffer (possibly pinned)
  void* gpu_ptr_;             // device buffer
  size_t size_;               // buffer size in bytes
  SyncedHead head_;           // which side holds the newest data
  bool own_cpu_data_;         // true if we allocated cpu_ptr_ ourselves
  bool cpu_malloc_use_cuda_;  // true if cpu_ptr_ came from cudaMallocHost
  bool own_gpu_data_;         // true if we allocated gpu_ptr_ ourselves
  int gpu_device_;            // device gpu_ptr_ was allocated on (-1: none)
  DISABLE_COPY_AND_ASSIGN(SyncedMemory);  // owning handle: non-copyable
};  // class SyncedMemory
};//namespace caffe
#endif /* SYNCEDMEM_HPP_ */
---------------------------------------------
---------------------------------------------
---------------------------------------------
/*
* syncedmem.cpp
*
* Created on: Jun 4, 2017
* Author: pan
*/
#include "common.hpp"
#include "syncedmem.hpp"
#include "util/math_functions.hpp"
namespace caffe
{
// Destructor: releases whatever this object still owns.
// own_cpu_data_/own_gpu_data_ mean "this SyncedMemory allocated the buffer
// itself"; memory adopted via set_cpu_data()/set_gpu_data() belongs to the
// caller and is deliberately NOT freed here.
SyncedMemory::~SyncedMemory()
{
  // Free the host buffer only if non-null and owned; CaffeFreeHost picks
  // cudaFreeHost vs free based on how the buffer was allocated.
  if(cpu_ptr_ && own_cpu_data_)
  {
    CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
  }
#ifndef CPU_ONLY
  if(gpu_ptr_ && own_gpu_data_)
  {
    int initial_device;
    cudaGetDevice(&initial_device);
    // Switch to the device recorded at allocation time (gpu_device_) so the
    // free happens on the allocating device, then restore the caller's
    // current device afterwards. -1 means no device was ever recorded.
    if (gpu_device_ != -1)
    {
      CUDA_CHECK(cudaSetDevice(gpu_device_));
    }
    CUDA_CHECK(cudaFree(gpu_ptr_));
    cudaSetDevice(initial_device);
  }
#endif
}
// Ensure the freshest copy of the buffer is available on the host:
// allocates/sets cpu_ptr_ and updates head_ accordingly.
inline void SyncedMemory::to_cpu()
{
  switch (head_)
  {
  case UNINITIALIZED:
    // First touch: allocate host memory and zero-fill it.
    CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
    caffe_memset(size_, 0, cpu_ptr_);
    head_ = HEAD_AT_CPU;
    own_cpu_data_ = true;  // we allocated it, so the destructor must free it
    break;
  case HEAD_AT_GPU:
    // GPU holds the newest data: lazily allocate a host buffer if needed,
    // then copy device -> host (caffe_gpu_memcpy wraps cudaMemcpy).
#ifndef CPU_ONLY  // CPU_ONLY comes from the build configuration (Makefile.config)
    if (cpu_ptr_ == NULL)
    {
      CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
      own_cpu_data_ = true;
    }
    caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
    head_ = SYNCED;
#else
    NO_GPU;  // CPU-only build: reaching a GPU state is a fatal error
#endif
    break;
  case HEAD_AT_CPU:  // host copy already newest -- nothing to do
  case SYNCED:       // both sides already agree
    break;
  }
}  // to_cpu()
//同步記憶體到CPU 即設定gpu_ptr_
inline void SyncedMemory::to_gpu()
{
#ifndef CPU_ONLY
switch (head_)
{
case UNINITIALIZED://???????????????????/
head_ = HEAD_AT_GPU;
own_gpu_data_ = true;
break;
case HEAD_AT_CPU:
if(gpu_ptr_ == NULL)
{
CUDA_CHECK(cudaGetDevice(&gpu_device_));
CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
own_gpu_data_ = true;
}
caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
head_ = SYNCED;
break;
case HEAD_AT_GPU:
case SYNCED:
break;
}
#else
NO_GPU
#endif
}//to_gpu()
// Returns a read-only pointer to the host buffer, first synchronizing from
// the GPU if the device holds the newest copy.
const void* SyncedMemory::cpu_data()
{
  to_cpu();
  return cpu_ptr_;  // void* converts to const void* implicitly; no cast needed
}
// Adopt an externally allocated host buffer.
// CHECK(data) is a glog assertion: it aborts the program if the caller
// passes NULL, which is why this function needs no error return value.
void SyncedMemory::set_cpu_data(void* data)
{
  CHECK(data);
  // If we currently own a host buffer, release it before adopting the new one.
  if(own_cpu_data_)
  {
    CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
  }
  cpu_ptr_ = data;
  head_ = HEAD_AT_CPU;
  // The adopted buffer belongs to the caller: the destructor must not free it,
  // hence ownership is recorded as false.
  own_cpu_data_ = false;
}
// Returns a read-only pointer to the device buffer, first synchronizing from
// the host if the CPU holds the newest copy. Fatal in CPU-only builds.
const void* SyncedMemory::gpu_data()
{
#ifndef CPU_ONLY
  to_gpu();
  return gpu_ptr_;  // implicit void* -> const void* conversion
#else
  NO_GPU;
  return NULL;  // unreachable after NO_GPU, but keeps the compiler satisfied
#endif
}
// Adopt an externally allocated device buffer (GPU analogue of set_cpu_data).
// Any buffer we currently own is freed first; ownership of the new pointer
// stays with the caller.
void SyncedMemory::set_gpu_data(void* data)
{
#ifndef CPU_ONLY
  CHECK(data);  // glog assertion: aborts if the caller passes NULL
  if (own_gpu_data_)
  {
    int initial_device;
    cudaGetDevice(&initial_device);
    // Switch to the device recorded at allocation time before freeing,
    // then restore the previously current device.
    if (gpu_device_ != -1)
    {
      CUDA_CHECK(cudaSetDevice(gpu_device_));
    }
    CUDA_CHECK(cudaFree(gpu_ptr_));
    cudaSetDevice(initial_device);
  }
  gpu_ptr_ = data;
  head_ = HEAD_AT_GPU;
  own_gpu_data_ = false;  // caller retains ownership of the adopted buffer
#else
  NO_GPU;
#endif
}
// Returns a writable host pointer; marks the CPU side as holding the newest
// copy so the next GPU access re-synchronizes.
void* SyncedMemory::mutable_cpu_data()
{
  to_cpu();
  head_ = HEAD_AT_CPU;  // caller is expected to write through the pointer
  return cpu_ptr_;
}
// Returns a writable device pointer; marks the GPU side as holding the
// newest copy so the next CPU access re-synchronizes. Fatal in CPU-only builds.
void* SyncedMemory::mutable_gpu_data()
{
#ifndef CPU_ONLY
  to_gpu();
  head_ = HEAD_AT_GPU;  // caller is expected to write through the pointer
  return gpu_ptr_;
#else
  NO_GPU;
  return NULL;  // unreachable after NO_GPU, but keeps the compiler satisfied
#endif
}
#ifndef CPU_ONLY
// Asynchronously copy the host buffer to the GPU on the given CUDA stream.
// Precondition (enforced by CHECK): the CPU copy is the newest.
void SyncedMemory::async_gpu_push(const cudaStream_t& stream)
{
  CHECK(head_ == HEAD_AT_CPU);
  if (gpu_ptr_ == NULL)
  {
    // Lazy device allocation, recording which device the buffer lives on.
    CUDA_CHECK(cudaGetDevice(&gpu_device_));
    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
    own_gpu_data_ = true;
  }
  const cudaMemcpyKind put = cudaMemcpyHostToDevice;
  CUDA_CHECK(cudaMemcpyAsync(gpu_ptr_, cpu_ptr_, size_, put, stream));
  // Assume caller will synchronize on the stream before use
  head_ = SYNCED;
}
#endif
};//namespace caffe
自己寫的測試程式碼分析建構函式和解構函式的行為
/*
* caffe.cpp
*
* Created on: Jun 5, 2017
* Author: pan
*/
#include <iostream>
#include <climits>
#include <cstdlib>
#include <boost/shared_ptr.hpp>
using namespace std;
using boost::shared_ptr;
// Simplified stand-in for caffe's CaffeMallocHost (CPU path only):
// allocates `size` bytes into *ptr with malloc and reports failure on stdout.
// (Original signature for reference:
//  inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda).)
inline void CaffeMallocoHost(void** ptr, size_t size)
{
  *ptr = malloc(size);
  // BUG FIX: the original tested `ptr` -- the address of the out-parameter,
  // which is never NULL here -- instead of `*ptr`, the malloc result, so an
  // allocation failure was never detected.
  if (*ptr == NULL)
  {
    std::cout << "malloc error in fuction CaffeMallocoHost !" << std::endl;
  }
}
// Frees a pointer obtained from CaffeMallocoHost, tracing the call so the
// constructor/destructor ordering of the demo class is visible on stdout.
// free(NULL) would be safe, but the guard keeps the second trace line from
// printing for a never-allocated buffer.
inline void CaffeFreeHost(void* ptr)
{
  std::cout << ">>>>>>>>>>>>>CaffeFreeHost";
  if (ptr != NULL)
  {
    free(ptr);
    std::cout << ">>>>>>>>>>>now free cpu_ptr_ " << std::endl;
  }
}
// Minimal mock of caffe's SyncedMemory, used to study constructor/destructor
// behaviour under boost::shared_ptr. Only the CPU path is modelled.
class synced
{
public:
  // FIX: the member-initializer list below now matches the declaration order
  // of the members. C++ initializes members in declaration order regardless
  // of list order, so the original (out-of-order) list drew -Wreorder and
  // misled readers; cpu_malloc_use_cuda_ also now uses `false`, not `0`.
  synced(size_t size, int num)
      : cpu_ptr_(NULL), own_cpu_data_(false), gpu_ptr_(NULL),
        own_gpu_data_(false), cpu_malloc_use_cuda_(false),
        size_(size), num_(num)
  {
    cout << "constructor " << num_ << " called !\n";
  }
  // Frees the host buffer (if allocated) and traces destruction order.
  ~synced()
  {
    CaffeFreeHost(cpu_ptr_);
    cout << "destructor " << num_ << " called!\n";
  }
  // Allocates the host buffer (no laziness check: each call reallocates).
  void to_cpu()
  {
    CaffeMallocoHost(&cpu_ptr_, size_);
  }
  // Returns a writable pointer to the host buffer, allocating it first.
  void* cpu_data()
  {
    to_cpu();
    return cpu_ptr_;
  }
private:
  void* cpu_ptr_;             // host buffer
  bool own_cpu_data_;         // ownership flag (unused in this mock)
  void* gpu_ptr_;             // placeholder, never allocated here
  bool own_gpu_data_;         // placeholder
  bool cpu_malloc_use_cuda_;  // placeholder
  size_t size_;               // buffer size in bytes
  int num_;                   // instance id used in the trace output
};
// Demonstrates that resetting/destroying a shared_ptr-managed object runs the
// class destructor, which in turn frees the buffer handed out by cpu_data().
int main()
{
  shared_ptr<synced> data;
  data.reset(new synced(10 * sizeof(int), 1));
  int* buf = static_cast<int*>(data->cpu_data());
  buf[9] = 10;  // last valid element of the 10-int buffer
  return 0;
}
丟擲問題:caffe 的記憶體管理如何改用 new / delete 的形式重寫 CaffeMallocHost() 與 CaffeFreeHost()?主要困難是在 C++ 中如何以 new 分配一塊以 void* 持有的記憶體,並正確地 delete 一個 void* 指標。