1. 程式人生 > >【CUDA並行程式設計之四】矩陣相乘

【CUDA並行程式設計之四】矩陣相乘

前面介紹了基本的Cuda程式設計的相關知識,那麼這一篇在此基礎之上來看看GPU在處理資料計算上的高效能,我們拿矩陣相乘來作為例子。

1.CPU上執行矩陣相乘以及效能。

在CPU上進行矩陣相乘運算的程式碼:

mat_mul.cc:

//a[i]*b[i] + c[i] = d[i]
#include<iostream>
#include<vector>
#include<map>
#include<fstream>
#include"wtime.h" 

using namespace std;

const int N = 320;

//矩陣有兩種表達的方法用二維矩陣或者用一維矩陣表示
int a[N+1][N+1],b[N+1][N+1],c[N+1][N+1],d[N+1][N+1];
int aa[(N+1)*(N+1)],bb[(N+1)*(N+1)],cc[(N+1)*(N+1)],dd[(N+1)*(N+1)];

void init()
{
	for(int i=0;i<N;i++)
		for(int j=0;j<N;j++)
		{
			a[i][j] = 1;
			b[i][j] = 2;
			c[i][j] = 3;
		}
}

void init1()
{
	for(int i=0;i<N;i++)
		for(int j=0;j<N;j++)
		{
			aa[i*N+j] = 1;
			bb[i*N+j] = 2;
			cc[i*N+j] = 3;
		}
}

void mul()
{
	for(int i=0;i<N;i++)	
	  for(int j=0;j<N;j++)
	  {
		for(int k=0;k<N;k++)
		{
			d[i][j] += a[i][k] * b[k][j];
		}
		d[i][j] += c[i][j];
	  }
}

void mul1()
{
	for(int i=0;i<N;i++)	
	  for(int j=0;j<N;j++)
	  {
		for(int k=0;k<N;k++)
		{
			dd[i*N+j] += aa[i*N+k] * bb[k*N+j];
		}
		dd[N*i+j] += cc[N*i+j];
	  }
}

void print()
{
	ofstream fout;
	fout.open("result.txt");
	if(!fout)
	{
		perror("can not open the file");
	}
	for(int i=0;i<N;i++)
	{
	  for(int j=0;j<N;j++)
	  {
	  	  fout<<d[i][j]<<" ";
	  }
	  fout<<endl;
	}
	fout.close();
}

int main()
{
	init1();	
	
	double t = wtime();
	mul1();
	t = wtime()-t;
	printf("computation timing = %10.10f sec\n",t);
	
	//print();

	return 0;
}
wtime.h:
#ifndef _WTIME_
#define _WTIME_

double wtime();

#endif
wtime.cc:
#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

double wtime(void)
{
	double now_time;
	struct timeval etstart;
	struct timezone tzp;

	if(gettimeofday(&etstart,&tzp)==-1)
	{
		perror("Error:calling gettimeofday() not successfully.\n");
	}

	now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;

	return now_time;
}

#if 0
int main()
{
	double time;
	time = wtime();

	printf("time of day = %10.4f\n",time);

	return 0;
}
#endif

makefile:

target:
	g++ mat_mul.cc wtime.cc
	./a.out
結果:



2.GPU上執行矩陣相乘以及效能。

程式碼:

cuda_mat_mul_v1.cu:

//matrix multiplication with global memory 
#include<iostream>
#include<fstream>
#include "wtime.h"

using namespace std;


const int BLOCK_SIZE = 16;
const int GRID_SIZE = 20;

//D = A * B + C;
__global__ void mat_mul(int *da,int *db,int *dc,int *dd,int N)
{
	int row = blockIdx.y * blockDim.y + threadIdx.y;
	int col = blockIdx.x * blockDim.x + threadIdx.x;

	int sum = 0;
	for(int i=0;i<N;i++)
	{
		sum += da[row*N + i] * db[row*i+col];
	}
	dd[row*N + col] = sum + dc[row*N + col];
}

int main()
{
	int N = BLOCK_SIZE * GRID_SIZE;
	int *ha,*hb,*hc,*hd;
	int *da,*db,*dc,*dd;
	double time;
	ha = new int[N*N];
	hb = new int[N*N];
	hc = new int[N*N];
	hd = new int[N*N];
	cudaError_t err;

	//initialize
	for(int i=0;i<N;i++)
		for(int j=0;j<N;j++)
		{
			ha[i*N+j] = 1;
			hb[i*N+j] = 2;
			hc[i*N+j] = 3;
		}

	//malloc</strong>
	cudaMalloc(&da,N*N*sizeof(int));
	cudaMalloc(&db,N*N*sizeof(int));
	cudaMalloc(&dc,N*N*sizeof(int));
	err = cudaMalloc(&dd,N*N*sizeof(int));
	printf("Cuda Malloc C : %s\n",cudaGetErrorString(err));

	//host to device
	cudaMemcpy(da,ha,N*N*sizeof(int),cudaMemcpyHostToDevice);
	cudaMemcpy(db,hb,N*N*sizeof(int),cudaMemcpyHostToDevice);
	cudaMemcpy(dc,hc,N*N*sizeof(int),cudaMemcpyHostToDevice);
	cudaMemcpy(dd,hd,N*N*sizeof(int),cudaMemcpyHostToDevice);

	dim3 threadBlock(BLOCK_SIZE,BLOCK_SIZE);
	dim3 grid(GRID_SIZE,GRID_SIZE);
	//kernel
	time = wtime();
	mat_mul<<<grid,threadBlock>>>(da,db,dc,dd,N);
	printf("Computation time is %10.10f\n",wtime()-time);

	//device to host
	cudaMemcpy(hd,dd,N*N*sizeof(int),cudaMemcpyDeviceToHost);

	//print result to file
	ofstream fout;
	fout.open("result_v1.txt");
	if(!fout)  
	{
		cerr<<"open the file error"<<endl;
		exit(-1);
	}
	for(int i=0;i<N;i++)	
	{
		for(int j=0;j<N;j++)
		{
			fout<<hd[i*N+j]<<" ";
		}
		fout<<endl;
	}
	
	delete []ha;delete []hb;delete []hc;delete []hd;
	cudaFree(da);cudaFree(db);cudaFree(dc);cudaFree(dd);

	return 0;
}
cuda_wtime.cu:
#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

double wtime(void)
{
	double now_time;
	struct timeval etstart;
	struct timezone tzp;

	if(gettimeofday(&etstart,&tzp)==-1)
	{
		perror("Error:calling gettimeofday() not successfully.\n");
	}

	now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;

	return now_time;
}

#if 0
int main()
{
	double time;
	time = wtime();

	printf("time of day = %10.4f\n",time);

	return 0;
}
#endif
wtime.h:
#ifndef _WTIME_
#define _WTIME_

double wtime();

#endif

cuda_wtime.cu:

#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

double wtime(void)
{
	double now_time;
	struct timeval etstart;
	struct timezone tzp;

	if(gettimeofday(&etstart,&tzp)==-1)
	{
		perror("Error:calling gettimeofday() not successfully.\n");
	}

	now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;

	return now_time;
}

#if 0
int main()
{
	double time;
	time = wtime();

	printf("time of day = %10.4f\n",time);

	return 0;
}
#endif

makefile:

cu:
	nvcc cuda_mat_mul_v1.cu cuda_wtime.cu
	./a.out

結果:


3.計算效能對比:

矩陣大小 1600*1600 1200*1200 800*800 320*320
序列時間/s 30.9 11.49865 2.597987 0.162311
並行時間 grid=100/block=16 grid=75/block=16 grid=50/block=16 grid=20/block=16
kernel執行時間/s 0.0000319 0.0000309944 0.0000309944 0.0000231266
平行計算總時間(分配記憶體加+資料拷貝+計算)/s 0.70796 0.439213 0.310214 0.237676

可見,在矩陣規模大的時候非常明顯的體現出了GPU強大的計算能力。