1. 程式人生 > >linux利用CMakeLists編譯cuda程序

linux利用CMakeLists編譯cuda程序

pro sum rar source tar sta cut urn ner

文件目錄:

cudaTest

|--utils.cu

|--utils.h

|--squaresum.cu

|--squaresum.h

|--test.cpp

|--CMakeLists.txt

編譯命令:

$cd /root/cudaTest

$mkdir build

$cd build

$cmake ..

$make

調用關係:

utils:提供常用工具,這裏提供查詢設備信息功能;

squaresum:計算平方和功能,為cuda運行的核心函數實現

test:調用平方和函數

CMakeLists.txt:組織所有文件編譯生成可執行文件

注意:調用cu文件中的函數時要在頭文件聲明成extern "C"

文件內容:

CMakeLists.txt

# CMakeLists.txt to build the squaresum executable from test.cpp, squaresum.cu and utils.cu
cmake_minimum_required(VERSION 2.8)
# Locate the CUDA toolkit; REQUIRED aborts configuration if it is not found,
# QUIET suppresses the informational find messages.
find_package(CUDA QUIET REQUIRED)
 
# Specify binary name and source files to build it from
# (disabled alternative: build utils as a separate library and link it)
#add_library(utils utils.cpp)
cuda_add_executable(
    squaresum
    test.cpp squaresum.cu utils.cu)
#target_link_libraries(squaresum utils)

test.cpp

#include <iostream>
#include "squaresum.h"

//extern "C" int squaresum();

// Program entry point: runs the square-sum demo declared in squaresum.h.
// The demo's return value is deliberately discarded; the process always
// exits with status 0.
int main()
{
    (void)squaresum();
    return 0;
}

squaresum.h

#include "utils.h"
#include <cuda_runtime.h>

// Entry point of the square-sum demo (defined in squaresum.cu).
// Declared with C linkage so the symbol name is not C++-mangled.
extern "C" int squaresum();

squaresum.cu

#include <stdio.h>
#include <stdlib.h>
//#include "utils.h"
#include <iostream>
#include "squaresum.h"

// ======== define area ========
#define DATA_SIZE 1048576 // 1M elements

// ======== global area ========
// Host-side input buffer shared by the GPU and CPU passes.
int data[DATA_SIZE];

/**
 * Computes the sum of squares of DATA_SIZE ints.
 *
 * Launch layout: intended for <<<1, 1>>>. Every launched thread runs the full
 * loop and writes the same outputs, so extra threads add no parallelism.
 * Uses no shared memory.
 *
 * @param input  Device pointer to DATA_SIZE ints.
 * @param sum    Device pointer receiving the sum of squares.
 * @param time   Device pointer receiving the elapsed device clock() ticks.
 */
__global__ static void squaresSum(const int *input, int *sum, clock_t *time)
{
    int sum_t = 0;
    clock_t start = clock();
    for (int i = 0; i < DATA_SIZE; ++i) {
        sum_t += input[i] * input[i];
    }
    *sum = sum_t;
    *time = clock() - start;
}

// ======== used to generate rand datas ========
// Fills data[0..size) with pseudo-random values in [0, 9] taken from rand().
void generateData(int *data, int size)
{
    for (int i = 0; i < size; ++i) {
        data[i] = rand() % 10;
    }
}

// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool checkCuda(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

/**
 * Runs the square-sum demo: generates random input, computes the sum of
 * squares on the GPU and on the CPU, and prints both results with timings.
 *
 * @return 0 on success, and also 0 when InitCUDA() reports no usable device
 *         (legacy behaviour kept for existing callers); -1 when a CUDA
 *         runtime call (allocation, copy or kernel launch) fails.
 */
int squaresum()
{
    // init CUDA device
    if (!InitCUDA()) {
        return 0;
    }
    printf("CUDA initialized.\n");

    // generate rand datas
    generateData(data, DATA_SIZE);

    // Device buffers start as NULL so the cleanup below is safe on every path.
    int *gpuData = NULL;
    int *sum = NULL;
    clock_t *time = NULL;

    int result = 0;
    clock_t time_used = 0;

    // malloc space for datas in GPU and upload the input; stop at the first failure
    bool ok =
        checkCuda(cudaMalloc((void**) &gpuData, sizeof(int) * DATA_SIZE), "cudaMalloc(gpuData)") &&
        checkCuda(cudaMalloc((void**) &sum, sizeof(int)), "cudaMalloc(sum)") &&
        checkCuda(cudaMalloc((void**) &time, sizeof(clock_t)), "cudaMalloc(time)") &&
        checkCuda(cudaMemcpy(gpuData, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice),
                  "cudaMemcpy(host -> device)");

    if (ok) {
        // calculate the squares' sum
        squaresSum<<<1, 1, 0>>>(gpuData, sum, time);

        // A launch does not return an error directly; query it explicitly.
        // The copies below run after the kernel on the same (default) stream,
        // so they also surface any error raised while the kernel executed.
        ok = checkCuda(cudaGetLastError(), "squaresSum launch") &&
             // copy the result from GPU to HOST
             checkCuda(cudaMemcpy(&result, sum, sizeof(int), cudaMemcpyDeviceToHost),
                       "cudaMemcpy(sum)") &&
             checkCuda(cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost),
                       "cudaMemcpy(time)");
    }

    // free GPU spaces (cudaFree on a null pointer performs no operation)
    cudaFree(gpuData);
    cudaFree(sum);
    cudaFree(time);

    if (!ok) {
        return -1;
    }

    // print result
    printf("(GPU) sum:%d time:%ld\n", result, (long) time_used);

    // CPU calculate
    result = 0;
    clock_t start = clock();
    for (int i = 0; i < DATA_SIZE; ++i) {
        result += data[i] * data[i];
    }
    time_used = clock() - start;
    printf("(CPU) sum:%d time:%ld\n", result, (long) time_used);

    return 0;
}

utils.h

#include <stdio.h>
#include <cuda_runtime.h>

// Device initialisation helper (defined in utils.cu); returns false when no
// usable CUDA device is found. Declared with C linkage.
extern "C" bool InitCUDA();

utils.cu

#include "utils.h"
#include <cuda_runtime.h>
#include <iostream>

/**
 * Prints selected fields of a CUDA device's properties to stdout, one per line.
 *
 * The size_t members (totalGlobalMem, sharedMemPerBlock, memPitch,
 * totalConstMem, textureAlignment) are printed with %zu. Passing them to %d
 * is a format/argument mismatch that yields wrong values on 64-bit platforms.
 *
 * @param prop  Device properties, e.g. as filled in by cudaGetDeviceProperties().
 */
void printDeviceProp(const cudaDeviceProp &prop)
{
    printf("Device Name : %s.\n", prop.name);
    printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
    printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
    printf("regsPerBlock : %d.\n", prop.regsPerBlock);
    printf("warpSize : %d.\n", prop.warpSize);
    printf("memPitch : %zu.\n", prop.memPitch);
    printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
    printf("maxThreadsDim[0 - 2] : %d %d %d.\n",
           prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("maxGridSize[0 - 2] : %d %d %d.\n",
           prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("totalConstMem : %zu.\n", prop.totalConstMem);
    printf("major.minor : %d.%d.\n", prop.major, prop.minor);
    printf("clockRate : %d.\n", prop.clockRate);
    printf("textureAlignment : %zu.\n", prop.textureAlignment);
    printf("deviceOverlap : %d.\n", prop.deviceOverlap);
    printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

/**
 * Selects the first CUDA device whose compute-capability major version is
 * at least 1, prints its properties, and makes it the current device.
 *
 * Also prints the detected device count to stdout.
 *
 * @return true when a device was found and selected; false when the device
 *         count cannot be queried, no device exists, no device qualifies,
 *         or cudaSetDevice() fails. A diagnostic is written to stderr in
 *         every failure case.
 */
bool InitCUDA()
{
    // used to count the device numbers; initialised so it is never read
    // uninitialised if the query below fails
    int count = 0;

    // get the cuda device count
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return false;
    }
    std::cout << count << std::endl;
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // find the device >= 1.X; devices whose properties cannot be read are skipped
    int i;
    for (i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                printDeviceProp(prop);
                break;
            }
        }
    }

    // if can't find the device (loop ran to completion without a break)
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    // set cuda device; report failure instead of claiming success
    err = cudaSetDevice(i);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", i, cudaGetErrorString(err));
        return false;
    }

    return true;
}

//int main(){
//  InitCUDA();
//}

linux利用CMakeLists編譯cuda程序