1. 程式人生 > >CUDA學習(三)之使用GPU進行兩個陣列相加

CUDA學習(三)之使用GPU進行兩個陣列相加

 傳入兩個陣列,在GPU中將兩個陣列對應索引位置的元素相加,並將結果存入第三個陣列

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <stdio.h>

using namespace std;

// Detect a usable CUDA device and make it the current device.
//
// Returns true when a device with compute capability major >= 1 was found
// and selected; returns false (after printing a message) otherwise.
bool CheckCUDA(void) {
    int count = 0;

    // A failed query is treated the same as "no device present".
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        printf("找不到支援CUDA的裝置!\n");
        return false;
    }

    // Find the first device whose properties can be read and whose
    // compute capability major version is at least 1.
    cudaDeviceProp prop;
    int i = 0;
    for (; i < count; i++) {
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess && prop.major >= 1) {
            break;
        }
    }
    if (i == count) {
        printf("找不到支援CUDA的裝置!\n");
        return false;
    }

    // `prop` still holds the properties of device i, the device that passed
    // the check above. Report and select that device rather than always
    // falling back to device 0.
    printf("GPU is: %s\n", prop.name);
    cudaError_t status = cudaSetDevice(i);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", i, cudaGetErrorString(status));
        return false;
    }

    printf("CUDA initialized success.\n");
    return true;
}

// Element-wise addition of two 1-D arrays: c[i] = a[i] + b[i] for i in [0, N).
__global__ void addForOneDim(double *a, double *b, double *c, int N);

// Fill a 1-D host array with a[i] = (i + 1) * b.
void InitOneDimArray(double *a, double b, int N);

// Report a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool CudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Adds two N-element arrays on the GPU and prints "a + b = c" per element.
// Returns EXIT_SUCCESS on success, EXIT_FAILURE when no device is available
// or any CUDA call fails.
int main() {
    // Detect the GPU.
    if (!CheckCUDA()) {
        std::cout << "No CUDA device.";
        return EXIT_FAILURE;
    }

    //**** Array addition ****
    std::cout << "****************************************陣列相加*********************************************************************" << std::endl;

    const int N = 20;                         // number of elements
    const size_t bytes = sizeof(double) * N;  // size of each array in bytes

    // Host-side arrays.
    double *h_a_one = new double[N];
    double *h_b_one = new double[N];
    double *h_c_one = new double[N];

    // Device-side arrays; kept NULL until allocated so cleanup is always safe.
    double *d_a_one = NULL;
    double *d_b_one = NULL;
    double *d_c_one = NULL;

    // Initialise the inputs on the host.
    InitOneDimArray(h_a_one, 1.1, N);
    InitOneDimArray(h_b_one, 2.2, N);

    // Allocate device memory, then copy the inputs host -> device.
    // The && chain stops at the first failing call.
    bool ok = CudaOk(cudaMalloc((void **)&d_a_one, bytes), "cudaMalloc(d_a_one)")
           && CudaOk(cudaMalloc((void **)&d_b_one, bytes), "cudaMalloc(d_b_one)")
           && CudaOk(cudaMalloc((void **)&d_c_one, bytes), "cudaMalloc(d_c_one)")
           && CudaOk(cudaMemcpy(d_a_one, h_a_one, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a_one)")
           && CudaOk(cudaMemcpy(d_b_one, h_b_one, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b_one)");

    if (ok) {
        // Enough blocks to cover N elements (ceil-div), so the launch stays
        // valid when N is larger than one block; the kernel's bounds check
        // discards the surplus threads.
        const int threadsPerBlock = 256;
        const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

        // The kernel must receive the device pointers. Passing the host
        // pointers h_a_one / h_b_one instead, e.g.
        //   addForOneDim<<<1, N>>>(h_a_one, h_b_one, d_c_one, N);
        // gives the wrong output shown in Figure 1; the call below gives
        // the correct output shown in Figure 2.
        addForOneDim<<<blocks, threadsPerBlock>>>(d_a_one, d_b_one, d_c_one, N);

        // A kernel launch returns nothing; launch errors are fetched here.
        // The blocking device -> host copy then returns the computed result.
        ok = CudaOk(cudaGetLastError(), "addForOneDim launch")
          && CudaOk(cudaMemcpy(h_c_one, d_c_one, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_c_one)");
    }

    // Print the result only when every step succeeded.
    if (ok) {
        for (int i = 0; i < N; i++) {
            std::cout << h_a_one[i] << " + " << h_b_one[i] << " = " << h_c_one[i] << std::endl;
        }
        std::cout << std::endl << std::endl;
    }

    // Release device and host memory on every path.
    cudaFree(d_a_one);
    cudaFree(d_b_one);
    cudaFree(d_c_one);
    delete[] h_a_one;
    delete[] h_b_one;
    delete[] h_c_one;

#ifdef _WIN32
    // Keep the console window open; "pause" is a Windows shell command.
    std::system("pause");
#endif

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}

// Element-wise addition of two 1-D arrays: c[i] = a[i] + b[i] for i in [0, N).
//
// Launch layout: 1-D grid of 1-D blocks, one element per thread. The grid
// must supply at least N threads in total; threads whose index is >= N do
// nothing. a, b and c must be pointers the device can dereference (see the
// note at the launch site about host pointers).
__global__ void addForOneDim(double *a, double *b, double *c, int N) {
    // Global thread index across all blocks.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

// Fill a 1-D host array with a[i] = (i + 1) * b for i in [0, N).
void InitOneDimArray(double *a, double b, int N) {
    for (int i = 0; i < N; i++) {
        a[i] = (i + 1) * b;
    }
}

 

 圖一 (錯誤結果:核函式直接使用主機指標 h_a_one、h_b_one)

 

 圖二 (正確結果:核函式使用裝置指標 d_a_one、d_b_one、d_c_one)