CUDFF 影象傅立葉變換
阿新 • • 發佈:2019-02-20
cudff 快速傅立葉變換
利用cuda進行fft變換時,會有一些引數設定的規則,一下舉例進行說明: float *h_Data; //"h_": host,表示CPU記憶體 float *d_Data; //"d_":device,表示GPU記憶體 fComplex *d_DataSpectrum, //fComplex:為float複數形式,x為實數,y為複數 cufftHandle fftPlanFwd, fftPlanInv; //Fwd表示正變換,Inv變換反變換 const int dataH = 1024; //二維影象的高度 const int dataW = 512; //二維影象的寬度 int fftH = dataH; //若dataH 不為2的冪次數,需要進行影象擴充套件 int fftW = dataW; //若dataW 不為2的冪次數,需要進行影象擴充套件 h_Data = (float *)malloc(dataH * dataW * sizeof(float)); //CPU端記憶體分配方式 checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float))); //GPU端記憶體開盤方式 checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex))); //cufft中R2C變換要求輸入為H*W,輸出為H*(W/2+1),W必須保證為2的倍數 for (int i = 0; i < dataH * dataW; i++) { h_Data[i] = getRand(); } //對cpu中的記憶體塊h_Data賦值,這裡僅為說明,因此用getRand()函式進行隨機數產生 checkCudaErrors(cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C)); //設定fft變換的控制代碼,這裡需要注意,傳入的引數先為fftH,fftW,即先傳高度值,再傳寬度值,CUFFT_R2C表示從real實數變換到complex複數 checkCudaErrors(cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R)); //設定fft反變換的控制代碼,雷同正變換,傳入的引數先為fftH,fftW,即先傳高度值,再傳寬度值,CUFFT_C2R表示從complex複數變換到real實數 checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), cudaMemcpyHostToDevice));//將CPU記憶體的資料塊copy到GPU的視訊記憶體中,cudaMemcpyHostToDevice表示從host端copy到device端,cudaMemcpyDeviceToHost表示從device端copy到host端 checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_Data, (cufftComplex *)d_DataSpectrum));//按照FFT正變換的控制代碼,進行影象資料的FFT變換,注意這裡的控制代碼使用的是fftPlanFwd //此處為其它在頻域中操作的函式 checkCudaErrors(cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, (cufftReal *)d_Data));//將在頻域中處理的結果,再反變換回空間域中,注意這裡的控制代碼使用的是fftPlanInv //Release checkCudaErrors(cufftDestroy(fftPlanInv)); //銷燬控制代碼,對應cufftPlan2d checkCudaErrors(cufftDestroy(fftPlanFwd));//銷燬控制代碼,對應cufftPlan2d checkCudaErrors(cudaFree(d_DataSpectrum));//GPU中記憶體釋放,對應cudaMalloc checkCudaErrors(cudaFree(d_Data));//GPU中記憶體釋放,對應cudaMalloc free(h_Data);//CPU中記憶體釋放,對應malloc
CPU 實現與GPU實現對比
使用小圖時CPU更快,使用大圖時GPU更快
#include <cuda_runtime.h> #include <cufft.h> #include <iostream> #include <time.h> #include <opencv2/opencv.hpp> using namespace std; using namespace cv; cv::Mat real(cv::Mat img) { std::vector<cv::Mat> planes; cv::split(img, planes); return planes[0]; } int main(int argc, char ** argv) { Mat img = imread("01.jpg", CV_LOAD_IMAGE_GRAYSCALE); //int NX = 2560; //int NY = 2560; int NN = 1000; //if (argc == 4) //{ // NX = atoi(argv[1]); // NY = atoi(argv[2]); // NN = atoi(argv[3]); //} //std::cout << "NX=" << NX << " ; NY=" << NY << " ; NN=" << NN << std::endl; resize(img, img, Size(24, 24)); cout << "img: " << img.channels() << " " << img.rows << " " << img.cols << endl; imshow("img", img); waitKey(500); int rows = img.rows; int cols = img.cols; //normalize(img, img, 0, 1, CV_MINMAX); img.convertTo(img, CV_32FC1, 1.0f / 255); //cout << img(Rect(0, 0, 10, 96)) << endl; cufftHandle planFwd; cufftHandle planInv; float *data; float *data2; cufftComplex*res; cudaMalloc((float**)&data, sizeof(float)*rows*cols); cudaMalloc((float**)&data2, sizeof(float)*rows*cols); cudaMalloc((void**)&res, sizeof(cufftComplex)*rows*(cols/2+1)); /* Try to do the same thing than cv::randu() */ float* host_data; host_data = (float *)malloc(sizeof(float)*rows*cols); //host_data = (float *)img.data; //srand(time(NULL)); for (int i = 0; i < rows; i++) { float *ptrs = img.ptr<float>(i); for (size_t j = 0; j < cols; j++) { host_data[i*cols + j] = ptrs[j]; } //cout << img(Rect(0, 0, 1, 2)) << endl; //cout << sizeof(img.data[0])<< " " << (float)img.data[1]<< endl; //host_data[i] = make_cuComplex(rand() % 256, rand() % 256); ////host_data[i].x = rand() % 256; ////host_data[i].y = rand() % 256; } //cout << host_data[0]<< endl; /* Warm up ? */ /* Create a 3D FFT plan. */ cufftPlan2d(&planFwd, rows, cols, CUFFT_R2C); cufftPlan2d(&planInv, rows, cols, CUFFT_C2R); //cufftPlan2d(&planInv, rows, cols, CUFFT_C2C); cufftComplex* dftRes = (cufftComplex *)malloc(sizeof(cufftComplex)*rows*(cols / 2 + 1)); double t = cv::getTickCount(); for (size_t i = 0; i < NN; i++) { cudaMemcpy(data, host_data, sizeof(float)*rows*cols, cudaMemcpyHostToDevice); /* Transform the first signal in place. */ cufftExecR2C(planFwd, (cufftReal*)data, (cufftComplex *)res); cudaMemcpy(dftRes, res, sizeof(cufftComplex)*rows*(cols / 2 + 1), cudaMemcpyDeviceToHost); //for (size_t i = 0; i < 5; i++) //{ // cout << "gpu: " << i << " " << dftRes[i].x << " " << dftRes[i].y << endl; //} //逆變換 cufftExecC2R(planInv, (cufftComplex *)res, (cufftReal *)data2); float * new_data = (float *)malloc(sizeof(float)*rows*cols); cudaMemcpy(new_data, data2, sizeof(float)*rows*cols, cudaMemcpyDeviceToHost); //cout << new_data[0] / (rows*cols) << " " << new_data[1] / (rows*cols) << endl; //for (size_t i = 0; i < rows*cols; i++) //{ // new_data[i] = new_data[i] / (rows*cols); //} //Mat gpu_res = Mat(rows, cols, CV_32FC1, new_data); //imshow("gpu:", gpu_res); //waitKey(2); } t = 1000 * ((double)cv::getTickCount() - t) / cv::getTickFrequency() / NN; std::cout << "Cuda time=" << t << " ms" << std::endl; //use CPU cout << "cpu img:" << img(Rect(0, 0, 1, 2)) << endl; Mat img2, newimg2, dst; double t1 = cv::getTickCount(); bool backwards = false; for (size_t i = 0; i < NN; i++) { if (img.channels() == 1) { cv::Mat planes[] = { cv::Mat_<float>(img), cv::Mat_<float>::zeros(img.size()) }; //cv::Mat planes[] = {cv::Mat_<double> (img), cv::Mat_<double>::zeros(img.size())}; cv::merge(planes, 2, img2); } cv::dft(img2, dst, backwards ? (cv::DFT_INVERSE | cv::DFT_SCALE) : 0); // 0.01ms //cout << "cpu dst :" << dst(Rect(0, 0, 5, 1)) << endl; backwards = true; cv::dft(dst, newimg2, backwards ? (cv::DFT_INVERSE | cv::DFT_SCALE) : 0); // 0.01ms //Mat cpu_res = real(newimg2); //cout << "cpu newImg2:" << cpu_res(Rect(0, 0, 1, 2)) << endl; //imshow("cpu:", cpu_res); //waitKey(0); } t1 = 1000 * ((double)cv::getTickCount() - t1) / cv::getTickFrequency() / NN; std::cout << "cpu time=" << t1 << " ms" << std::endl; //double t = cv::getTickCount(); //for (int i = 0; i < NN; i++) //{ // /* Create a 2D FFT plan. */ // cufftPlan2d(&plan, cols, rows, CUFFT_C2C); // /* Transform the first signal in place. */ // cufftExecC2C(plan, data, res, CUFFT_FORWARD); //} //t = 1000 * ((double)cv::getTickCount() - t) / cv::getTickFrequency() / NN; //std::cout << "Cuda time=" << t << " ms" << std::endl; /* Destroy the cuFFT plan. */ cufftDestroy(planFwd); cufftDestroy(planInv); cudaFree(res); cudaFree(data); free(host_data); getchar(); std::system("PAUSE"); return 0; }
備註
CUFFT庫實現了快速傅立葉變換,由於新手加上英文水平有限,被這玩意小小折磨了一下。我把其中需要注意到的幾點問題,列在這裡,以供後來的讀者能夠少犯錯誤。
(1) 在影象正變換時,使用函式R2C,或者D2Z。從影象資料變換到複數資料,影象為N*N時,其實複數資料只有N*(N/2+1), 因為另外的N*(N/2-1)可以根據中心對稱和共軛關係算出來,CUFFT為了節省儲存空間,所以略去了那些資料。
(2) 在反變換時使用C2R變換,也可以是Z2D變換,在建立cufftPlan控制代碼時要注意:
cufftPlan2d(&plan_IFFT,nHeight,nWidth,CUFFT_C2R);
上面的語句中引數並不是(nWidth/2+1)而是nWidth。別以為得到的複數資料為N*(N/2+1)就用引數(nWidth/2+1)。
(3)反變換出來的資料是實數,但是被放大了nWidth*nHeight倍,所以你要得到影象資料的話,需要每個畫素值除以 nWidth*nHeight。