CUDA牽手OpenCV
官方的pdf文件中已經對CUDA怎麼牽手OpenCV做了很好的說明。詳情請下載文件1,文件2。
我也是博採眾家之長為己所用,總結下CUDA牽手OpenCV的方法。
用IplImage
包含兩個檔案,kernel.cu以及app.cpp。
kernel.cu
#ifndef _KERNEL_CU_
#define _KERNEL_CU_
#include<time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define BYTE unsigned char
// Per-byte image inversion: pImgOut = 255 - pImgIn.
// Launch with a 2D grid covering (nWidth, nHeight); the caller passes
// nWidth already expanded to bytes (width * channels), and nWidthStep is
// the padded row stride in bytes.
__global__ void InverseImg_kernel(BYTE* pImgOut, BYTE* pImgIn, int nWidth, int nHeight, int nWidthStep)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Guard the grid tail: the grid rarely divides the image exactly.
    if (x >= nWidth || y >= nHeight)
        return;
    const int idx = y * nWidthStep + x;
    pImgOut[idx] = 255 - pImgIn[idx];
}
extern "C"
// Inverts an 8-bit image on the GPU: pImgOut = 255 - pImgIn, byte-wise.
// nWidthStep is the padded row stride in bytes (IplImage::widthStep);
// only the first nWidth * nChannels bytes of each row are inverted, the
// padding bytes stay zero, as in the original version.
// Returns the elapsed host time in seconds, or -1.0 if any CUDA call fails
// (the original ignored every error and returned garbage output silently).
// Timing deliberately covers the whole round trip (alloc + copies + kernel).
double cudaInverseImg(BYTE* pImgOut, BYTE* pImgIn, int nWidth, int nHeight, int nWidthStep, int nChannels)
{
    clock_t start = clock();
    size_t nBytes = (size_t)nWidthStep * nHeight;
    BYTE* d_pImgOut = 0;
    BYTE* d_pImgIn = 0;
    // Checked call chain: the first failure short-circuits everything below.
    cudaError_t err = cudaMalloc((void**)&d_pImgOut, nBytes);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_pImgIn, nBytes);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_pImgIn, pImgIn, nBytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMemset(d_pImgOut, 0, nBytes);
    if (err == cudaSuccess)
    {
        // One thread per byte; x covers the row in bytes, y covers rows.
        dim3 ts(16, 16);
        dim3 bs((nWidth * nChannels + 15) / 16, (nHeight + 15) / 16);
        InverseImg_kernel<<<bs, ts>>>(d_pImgOut, d_pImgIn, nWidth * nChannels, nHeight, nWidthStep);
        // Kernel launches do not return errors; pick them up explicitly.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess)
        // Blocking copy: also synchronizes with the kernel before reading.
        err = cudaMemcpy(pImgOut, d_pImgOut, nBytes, cudaMemcpyDeviceToHost);
    // cudaFree(NULL) is a no-op, so this is safe on any failure path.
    cudaFree(d_pImgOut);
    cudaFree(d_pImgIn);
    if (err != cudaSuccess)
        return -1.0;
    clock_t finish = clock();
    return (double)(finish - start) / CLOCKS_PER_SEC;
}
#endif
app.cpp
#include <opencv2/opencv.hpp>
using namespace cv;
#define BYTE unsigned char
extern "C"
double cudaInverseImg(BYTE* pImgOut, BYTE* pImgIn, int nWidth, int nHeight, int nWidthStep, int nChannels);
void main(void)
{
IplImage* img = cvLoadImage("1.jpg", CV_LOAD_IMAGE_GRAYSCALE);
cvShowImage("原始圖", img);
BYTE* pImgIn = (BYTE* ) img->imageData;
BYTE* pImgOut = (BYTE*)img->imageData;
int nWidth = img->width;
int nHeight = img->height;
int nDepth = img->depth;
int nWidthStep = img->widthStep;
int nChannels = img->nChannels;
double time = cudaInverseImg(pImgOut, pImgIn, nWidth, nHeight, nWidthStep, nChannels);
printf("time : %f", time);
IplImage* imgOut = cvCreateImageHeader(cvSize(nWidth, nHeight), nDepth, nChannels);
cvSetData(imgOut, pImgOut, nWidthStep);
cvShowImage("反相圖", imgOut);
cvWaitKey(0);
}
牽牽
用Mat
包含兩個檔案,kernel.cu以及app.cpp。這個例子分別編寫了CPU以及GPU處理影象的程式碼。最讓我弄不明白的是,GPU運算的時間比CPU運算的時間長,另外GPU映象時也有問題。但是自行利用Mat編寫CUDA核心函式的思路還是在這裡的(目前是這樣,因為這個方法並不是主流的方法,所以我也沒有深究。原諒我的不求甚解……)。
kernel.cu
#include <opencv2/opencv.hpp>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// 180-degree rotation of a 1-channel 8-bit image stored as a dense
// row-major rows x cols array (no row padding -- TODO confirm the Mat
// passed in is continuous). Thread x covers rows and thread y covers
// columns, matching the grid set up in gpuMirrorImg.
__global__ void kernel_channel_1(uchar* srcData, uchar* dstData, int rows, int cols)
{
    const int ix = blockIdx.x * blockDim.x + threadIdx.x; // row index
    const int iy = blockIdx.y * blockDim.y + threadIdx.y; // column index
    if (ix < rows && iy < cols)
    {
        // Row-major addressing: (row, col) lives at row * cols + col.
        // The original used ix + iy * rows (column-major), which scrambles
        // row-major cv::Mat data and produced the broken "mirror" output
        // the author describes.
        dstData[ix * cols + iy] = srcData[(rows - 1 - ix) * cols + (cols - 1 - iy)];
    }
}
// 180-degree rotation of a 3-channel 8-bit image (one uchar3 per pixel)
// stored as a dense row-major rows x cols array (no row padding -- TODO
// confirm the Mat passed in is continuous). Thread x covers rows and
// thread y covers columns, matching the grid set up in gpuMirrorImg.
__global__ void kernel_channel_3(uchar3* srcData, uchar3* dstData, int rows, int cols)
{
    const int ix = blockIdx.x * blockDim.x + threadIdx.x; // row index
    const int iy = blockIdx.y * blockDim.y + threadIdx.y; // column index
    if (ix < rows && iy < cols)
    {
        // Row-major addressing: (row, col) lives at row * cols + col.
        // The original used ix + iy * rows (column-major), which scrambles
        // row-major cv::Mat data.
        dstData[ix * cols + iy] = srcData[(rows - 1 - ix) * cols + (cols - 1 - iy)];
    }
}
extern "C"
// Integer ceiling division: the smallest k with k * b >= a (for a, b > 0).
// Used to size the launch grid so every pixel is covered.
static int iDivUp(int a, int b)
{
    int q = a / b;
    return (a % b == 0) ? q : q + 1;
}
extern "C"
// Rotates src by 180 degrees into dst on the GPU (1- or 3-channel 8-bit).
// dst must be pre-allocated with the same size and type as src.
// NOTE(review): the kernels index the data as a dense rows x cols array,
// so src/dst are assumed continuous (no row padding) -- TODO confirm.
void gpuMirrorImg(const cv::Mat& src, cv::Mat& dst)
{
    int rowNumber = src.rows;
    int colNumber = src.cols;
    dim3 threads(16, 16);
    // Plain ceil-divide: the original passed rowNumber + 15 *and* used
    // iDivUp, double-counting the round-up and launching extra empty blocks.
    // Grid x covers rows, grid y covers columns, matching the kernels.
    dim3 grid(iDivUp(rowNumber, threads.x), iDivUp(colNumber, threads.y));
    size_t memSize = sizeof(uchar3) * rowNumber * colNumber;
    switch (src.channels())
    {
    case 1:
    {
        size_t graySize = sizeof(uchar) * rowNumber * colNumber;
        uchar* uSrcData;
        uchar* uDstData;
        cudaMalloc((void**)&uSrcData, graySize);
        cudaMalloc((void**)&uDstData, graySize);
        cudaMemcpy(uSrcData, src.data, graySize, cudaMemcpyHostToDevice);
        cudaMemset(uDstData, 0, graySize);
        kernel_channel_1<<<grid, threads>>>(uSrcData, uDstData, rowNumber, colNumber);
        // Blocking copy also synchronizes with the kernel.
        cudaMemcpy(dst.data, uDstData, graySize, cudaMemcpyDeviceToHost);
        cudaFree(uSrcData);
        cudaFree(uDstData);
        break; // was missing: the 1-channel case fell through into the
               // 3-channel path and read/wrote 3x the buffer out of bounds
    }
    case 3:
    {
        uchar3* vSrcData;
        uchar3* vDstData;
        cudaMalloc((void**)&vSrcData, memSize);
        cudaMalloc((void**)&vDstData, memSize);
        cudaMemcpy(vSrcData, src.data, memSize, cudaMemcpyHostToDevice);
        cudaMemset(vDstData, 0, memSize);
        kernel_channel_3<<<grid, threads>>>(vSrcData, vDstData, rowNumber, colNumber);
        cudaMemcpy(dst.data, vDstData, memSize, cudaMemcpyDeviceToHost);
        cudaFree(vSrcData);
        cudaFree(vDstData);
        break; // was missing: fell through into default
    }
    default:
        break;
    }
}
extern "C"
// Mirrors src horizontally (left-right flip) into dst on the CPU.
// dst must be pre-allocated with the same size and type as src.
// NOTE(review): this flips horizontally only, while the GPU kernels also
// flip vertically (a 180-degree rotation) -- the two paths do not produce
// the same image; confirm which behavior is intended.
void cpuMirrorImg(const cv::Mat& src, cv::Mat& dst)
{
    int rowNumber = src.rows;
    int colNumber = src.cols;
    switch (src.channels())
    {
    case 1:
    {
        for (int i = 0; i < rowNumber; i++)
        {
            const uchar* uSrcData = src.ptr<uchar>(i);
            uchar* uDstData = dst.ptr<uchar>(i);
            for (int j = 0; j < colNumber; j++)
                uDstData[j] = uSrcData[colNumber - 1 - j];
        }
        break; // was missing: 1-channel images fell through and were then
               // re-processed as 3-channel, reading past the end of each row
    }
    case 3:
    {
        for (int i = 0; i < rowNumber; i++)
        {
            const cv::Vec3b* vSrcData = src.ptr<cv::Vec3b>(i);
            cv::Vec3b* vDstData = dst.ptr<cv::Vec3b>(i);
            for (int j = 0; j < colNumber; j++)
                vDstData[j] = vSrcData[colNumber - 1 - j];
        }
        break; // was missing: fell through into default
    }
    default:
        break;
    }
}
app.cpp
#include <iostream>
#include <opencv2/opencv.hpp>
using namespace cv;
using namespace std;
extern "C"
void cpuMirrorImg(const cv::Mat& src, cv::Mat& dst);
extern "C"
void gpuMirrorImg(const cv::Mat& src, cv::Mat& dst);
void main()
{
Mat srcImage = imread("1.jpg");
Mat dstImageCpu = srcImage.clone();
const int64 startCpu = getTickCount();
cpuMirrorImg(srcImage, dstImageCpu);
const double timeSecCpu = (getTickCount() - startCpu) / getTickFrequency();
cout << "CPU Time : " << timeSecCpu * 1000 << " ms" << endl;
Mat dstImageGpu = Mat::zeros(srcImage.size(), srcImage.type());
const int64 startGpu = getTickCount();
gpuMirrorImg(srcImage, dstImageGpu);
const double timeSecGpu = (getTickCount() - startGpu) / getTickFrequency();
cout << "GPU Time : " << timeSecGpu * 1000 << " ms" << endl;
imshow("source", srcImage);
imshow("mirror [CPU]", dstImageCpu);
imshow("mirror [GPU]", dstImageGpu);
waitKey(0);
}
尷尬
利用GpuMat
這個例程來源開頭提過的官方文件。很不錯,一級棒!!!該例程由三個檔案組成kernel.cu,swap_rb.cpp,app.cpp。
kernel.cu
#include <opencv2/opencv.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace cv;
using namespace cv::cuda;
// Swaps the red and blue channels of every pixel (BGR <-> RGB).
// PtrStepSz/PtrStep carry the pitch, so padded GpuMat rows are handled.
__global__ void swap_rb_kernel(const PtrStepSz<uchar3> src, PtrStep<uchar3> dst)
{
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;
    // Guard the grid tail.
    if (col >= src.cols || row >= src.rows)
        return;
    const uchar3 px = src(row, col); // read from GPU memory -- we are on the GPU
    dst(row, col) = make_uchar3(px.z, px.y, px.x);
}
// Host-side launcher for swap_rb_kernel on the given CUDA stream.
void swap_rb_caller(const PtrStepSz<uchar3>& src, PtrStep<uchar3> dst, cudaStream_t stream)
{
    const dim3 block(32, 8);
    const unsigned gx = (src.cols + block.x - 1) / block.x; // ceil-div over columns
    const unsigned gy = (src.rows + block.y - 1) / block.y; // ceil-div over rows
    const dim3 grid(gx, gy);
    swap_rb_kernel<<<grid, block, 0, stream>>>(src, dst);
    // On the default stream, block until the kernel has finished.
    if (stream == 0)
        cudaDeviceSynchronize();
}
swap_rb.cpp
#include <opencv2/opencv.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
using namespace cv;
using namespace cv::cuda;
void swap_rb_caller(const PtrStepSz<uchar3>& src, PtrStep<uchar3> dst, cudaStream_t stream);
void swap_rb(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
{
CV_Assert(src.type() == CV_8UC3);
dst.create(src.size(), src.type()); // create if not allocated yet
cudaStream_t s = StreamAccessor::getStream(stream);
swap_rb_caller(src, dst, s);
}
app.cpp
#include <iostream>
#include "opencv2/core.hpp"
#include <opencv2/core/utility.hpp>
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/cudaimgproc.hpp"
using namespace std;
using namespace cv;
using namespace cv::cuda;
void swap_rb(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
void main()
{
Mat srcImage = imread("1.jpg");
Mat dstImage = Mat::zeros(srcImage.size(), srcImage.type());
GpuMat srcImageGpu(srcImage);
GpuMat dstImageGpu;
dstImageGpu.create(srcImageGpu.size(), srcImageGpu.type());
swap_rb(srcImageGpu, dstImageGpu);
dstImageGpu.download(dstImage);
imshow("source image", srcImage);
imshow("gpu image", dstImage);
waitKey(0);
}
真相
OpenCV自帶的CUDA庫
例子來源於OpenCV,Google下關鍵字“OpenCV CUDA”就能搜索到。包含一個檔案app.cpp。
程式碼
#include <cmath>
#include <iostream>
#include "opencv2/core.hpp"
#include <opencv2/core/utility.hpp>
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/cudaimgproc.hpp"
using namespace std;
using namespace cv;
using namespace cv::cuda;
// Prints the usage banner (shown when the input image cannot be opened).
static void help()
{
    cout << "This program demonstrates line finding with the Hough transform." << endl
         << "Usage:" << endl
         << "./gpu-example-houghlines <image_name>, Default is ../data/pic1.png\n" << endl;
}
int main(int argc, const char* argv[])
{
const string filename = argc >= 2 ? argv[1] : "1.jpg";
Mat src = imread(filename, IMREAD_GRAYSCALE);
if (src.empty())
{
help();
cout << "can not open " << filename << endl;
return -1;
}
Mat mask;
cv::Canny(src, mask, 100, 200, 3);
Mat dst_cpu;
cv::cvtColor(mask, dst_cpu, COLOR_GRAY2BGR);
Mat dst_gpu = dst_cpu.clone();
vector<Vec4i> lines_cpu;
{
const int64 start = getTickCount();
cv::HoughLinesP(mask, lines_cpu, 1, CV_PI / 180, 50, 60, 5);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "CPU Time : " << timeSec * 1000 << " ms" << endl;
cout << "CPU Found : " << lines_cpu.size() << endl;
}
for (size_t i = 0; i < lines_cpu.size(); ++i)
{
Vec4i l = lines_cpu[i];
line(dst_cpu, Point(l[0], l[1]), Point(l[2], l[3]), Scalar(0, 0, 255), 3, LINE_AA);
}
GpuMat d_src(mask);
GpuMat d_lines;
{
const int64 start = getTickCount();
Ptr<cuda::HoughSegmentDetector> hough = cuda::createHoughSegmentDetector(1.0f, (float)(CV_PI / 180.0f), 50, 5);
hough->detect(d_src, d_lines);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "GPU Time : " << timeSec * 1000 << " ms" << endl;
cout << "GPU Found : " << d_lines.cols << endl;
}
vector<Vec4i> lines_gpu;
if (!d_lines.empty())
{
lines_gpu.resize(d_lines.cols);
Mat h_lines(1, d_lines.cols, CV_32SC4, &lines_gpu[0]);
d_lines.download(h_lines);
}
for (size_t i = 0; i < lines_gpu.size(); ++i)
{
Vec4i l = lines_gpu[i];
line(dst_gpu, Point(l[0], l[1]), Point(l[2], l[3]), Scalar(0, 0, 255), 3, LINE_AA);
}
imshow("source", src);
imshow("detected lines [CPU]", dst_cpu);
imshow("detected lines [GPU]", dst_gpu);
waitKey();
return 0;
}
一線牽
尾巴
看過官方提供的文件之後,我才將CUDA正確的牽手OpenCV。官方已經說的很好了,感覺說啥都是多餘的。就到這裡吧!今天陽光不錯,等下配眼鏡。注意身體,尤其是眼睛,不光在說我自己,也在說正在閱讀的你。
參考:
《GPGPU程式設計技術——從GLSL、CUDA到OpenCL》♥♥♥♥♥
《數字影象處理高階應用——基於MATLAB與CUDA的實現》♥♥♥
《基於CUDA的並行程式設計》♥♥♥
《CUDA專家手冊》♥♥♥♥♥
《高效能CUDA應用設計與開發》♥♥♥♥
轉:https://blog.csdn.net/xx116213/article/details/50704335