C語言和CUDA C實現的向量加法程式碼(教科書上的格式)
阿新 • • 發佈:2018-11-13
一般教科書都這麼寫,但感覺不如用 STL 的寫法好。記下以便查閱。
#include <cuda_runtime.h>

#include <cstdlib>
#include <iostream>
#include <stdio.h>

// Evaluates a CUDA runtime call and aborts the program with a file/line
// diagnostic if the call did not return cudaSuccess.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Element-wise vector addition: d_c[i] = d_a[i] + d_b[i] for 0 <= i < n.
//
// Launch layout: 1D grid of 1D blocks, any size. Each thread starts at its
// flat global index and advances by the total number of launched threads, so
// the result is correct for every launch configuration (including <<<1, 10>>>).
// No shared memory is used.
//
// Parameters:
//   d_a, d_b  device pointers to the input vectors, at least n floats each
//   d_c       device pointer to the output vector, at least n floats
//   n         number of elements; n <= 0 makes the kernel a no-op
__global__ void vector_add_gpu_2(float* d_a, float* d_b, float* d_c, int n)
{
    // Index in size_t so that neither blockIdx.x * blockDim.x nor the running
    // index can wrap around when n is close to INT_MAX.
    const size_t count = n > 0 ? (size_t)n : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride)
    {
        d_c[i] = d_a[i] + d_b[i];
    }
}

// Reads an element count n from stdin, fills two host vectors with 0..n-1,
// adds them on the GPU and prints the inputs and the result to stdout.
// Returns 0 on success, EXIT_FAILURE on invalid input or allocation failure;
// any CUDA runtime error terminates the program via CUDA_CHECK.
int main()
{
    int n = 0;
    if (!(std::cin >> n) || n < 0)
    {
        fprintf(stderr, "expected a non-negative element count on stdin\n");
        return EXIT_FAILURE;
    }
    if (n == 0)
    {
        // Nothing to compute or print; also avoids a zero-block launch.
        return 0;
    }

    // Compute the byte count in size_t: an int would overflow for large n.
    const size_t nBytes = (size_t)n * sizeof(float);

    float* h_a = (float*)std::malloc(nBytes);
    float* h_b = (float*)std::malloc(nBytes);
    float* h_c = (float*)std::malloc(nBytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL)
    {
        fprintf(stderr, "host allocation of %zu bytes failed\n", nBytes);
        std::free(h_a);
        std::free(h_b);
        std::free(h_c);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < n; i++)
    {
        h_a[i] = (float)i;
        h_b[i] = (float)i;
        std::cout << "h_a[" << i << "]= " << h_a[i] << "\t";
        std::cout << "h_b[" << i << "]= " << h_b[i] << "\n";
    }

    float* d_a = NULL;
    float* d_b = NULL;
    float* d_c = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_a, nBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_b, nBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_c, nBytes));

    CUDA_CHECK(cudaMemcpy(d_a, h_a, nBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, nBytes, cudaMemcpyHostToDevice));

    // Ceil-div written as (n - 1) / threads + 1 so it cannot overflow int.
    const int threads = 256;
    const int blocks = (n - 1) / threads + 1;
    vector_add_gpu_2<<<blocks, threads>>>(d_a, d_b, d_c, n);
    // A launch does not return a status; query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy on the default stream: waits for the kernel and reports
    // any error raised while it was executing.
    CUDA_CHECK(cudaMemcpy(h_c, d_c, nBytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < n; i++)
    {
        std::cout << "h_c[" << i << "]= " << h_c[i] << "\t";
    }

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    std::free(h_a);
    std::free(h_b);
    std::free(h_c);
    return 0;
}