CUDA程式設計--實現並行矩陣乘法【80行程式碼】

CUDA程式設計--實現並行矩陣乘法【80行程式碼】

簡述

這裡只寫了方陣之間的乘法,但是本質上都是一樣的。

  • 我測試過100規模的方陣之間的乘法,沒有問題。

程式碼

  • 讀取檔案data.txt
  • 資料格式是一個數值N,然後是連續的兩個N*N的矩陣,數值之間用空格隔開。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <fstream>
#include <stdio.h>
// Kernel: C = A * B for N x N integer matrices; one thread per output element.
// Expects a 2D launch; threads past the matrix edge simply return.
__global__ void MatrixMultiply(int *a, int *b, int *c, int N) {
    int col = threadIdx.x + blockIdx.x * blockDim.x;  // x varies fastest within a warp
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    if (row < N && col < N) {                         // guard the grid tail
        int sum = 0;
        for (int k = 0; k < N; ++k) {
            // b[k*N+col] is coalesced: adjacent lanes read adjacent addresses;
            // a[row*N+k] is one address for the whole warp (broadcast).
            sum += a[row * N + k] * b[k * N + col];
        }
        c[row * N + col] = sum;
    }
}

// Host wrapper: allocates device buffers, copies the inputs, launches the
// kernel, and copies the product back. Returns the first CUDA error seen.
cudaError_t matrixMultiplyWithCuda(int *a, int *b, int *c, size_t N);

int main() {
    std::ifstream in("data.txt");
    int N = 0;
    in >> N;
    if (in.fail() || N <= 0) {
        // Bail out: without a valid N the buffer sizes below would be garbage.
        printf("Something wrong\n");
        return 1;
    }
    printf("Success read\n");

    // Host buffers: two inputs and the product, row-major N x N.
    int *a = new int[N * N];
    int *b = new int[N * N];
    int *c = new int[N * N];

    // data.txt layout: N, then matrix A, then matrix B (whitespace-separated).
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j) in >> a[i * N + j];
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j) in >> b[i * N + j];

    cudaError_t cudaStatus = matrixMultiplyWithCuda(a, b, c, N);
    if (cudaStatus == cudaSuccess) {
        // Only print results that were actually computed.
        for (int i = 0; i < N; ++i) {
            for (int j = 0; j < N; ++j) std::cout << c[i * N + j] << " ";
            std::cout << std::endl;
        }
    } else {
        printf("Something wrong\n");
    }

    // cudaThreadExit() is deprecated; cudaDeviceReset() replaces it.
    cudaDeviceReset();

    // host free
    delete[] a;
    delete[] b;
    delete[] c;
    return cudaStatus == cudaSuccess ? 0 : 1;
}

cudaError_t matrixMultiplyWithCuda(int *a, int *b, int *c, size_t N) {
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    // Launch configuration is declared before any goto so no jump skips a
    // dim3 initialization (jumping over one is ill-formed C++).
    // Ceil-division covers N not divisible by 32 without a wasted extra block.
    dim3 threadPerBlock(32, 32);
    dim3 numBlocks((N + threadPerBlock.x - 1) / threadPerBlock.x,
                   (N + threadPerBlock.y - 1) / threadPerBlock.y);
    cudaError_t cudaStatus;

    // Check every allocation: a failed cudaMalloc leaves a null pointer and
    // makes every later call fail mysteriously.
    cudaStatus = cudaMalloc((void**)&dev_a, N * N * sizeof(int));
    if (cudaStatus != cudaSuccess) goto Error;
    cudaStatus = cudaMalloc((void**)&dev_b, N * N * sizeof(int));
    if (cudaStatus != cudaSuccess) goto Error;
    cudaStatus = cudaMalloc((void**)&dev_c, N * N * sizeof(int));
    if (cudaStatus != cudaSuccess) goto Error;

    cudaStatus = cudaMemcpy(dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) goto Error;
    cudaStatus = cudaMemcpy(dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("Something wrong\n");
        goto Error;
    }

    // kernel invocation
    MatrixMultiply<<<numBlocks, threadPerBlock>>>(dev_a, dev_b, dev_c, (int)N);
    // A kernel launch returns nothing; fetch launch/config errors explicitly.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        printf("Calculate wrong\n");
        goto Error;
    }

    // This blocking copy also synchronizes with the kernel before reading dev_c.
    cudaStatus = cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost);
Error:
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return cudaStatus;
}

寫入檔案的 版本

(也改成了浮點數運算了)

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <fstream>
#include <stdio.h>
// Kernel: C = A * B for N x N float matrices; one thread per output element.
// Expects a 2D launch; threads past the matrix edge simply return.
__global__ void MatrixMultiply(float *a, float * b, float *c, int N) {
	int col = threadIdx.x + blockIdx.x * blockDim.x;  // x varies fastest within a warp
	int row = threadIdx.y + blockIdx.y * blockDim.y;
	if (row < N && col < N) {                         // guard the grid tail
		float sum = 0.0f;
		for (int k = 0; k < N; ++k) {
			// b[k*N+col] is coalesced: adjacent lanes read adjacent addresses;
			// a[row*N+k] is one address for the whole warp (broadcast).
			// (The original used threadIdx.x as the row, making the a[] read a
			// stride-N scatter across the warp; the output is identical.)
			sum += a[row * N + k] * b[k * N + col];
		}
		c[row * N + col] = sum;
	}
}

cudaError_t matrixMultiplyWithCuda(float *a, float *b, float *c, size_t size);

// Reads N and two N x N float matrices from data.txt, multiplies them on the
// GPU, and writes the product to output.txt.
int main()
{
	std::ifstream in("data.txt");
	int N = 0;
	in >> N;
	if (in.fail() || N <= 0) {
		// Bail out: the original kept running here, then allocated buffers
		// with an indeterminate N (undefined behavior).
		printf("Something wrong\n");
		return 1;
	}
	printf("Success read\n");

	// Host buffers: two inputs and the product, row-major N x N.
	float *a = new float[N * N];
	float *b = new float[N * N];
	float *c = new float[N * N];

	// data.txt layout: N, then matrix A, then matrix B (whitespace-separated).
	for (int i = 0; i < N; ++i)
		for (int j = 0; j < N; ++j) in >> a[i * N + j];

	for (int i = 0; i < N; ++i)
		for (int j = 0; j < N; ++j) in >> b[i * N + j];

	cudaError_t cudaStatus = matrixMultiplyWithCuda(a, b, c, N);

	if (cudaStatus == cudaSuccess) {
		// Only write results that were actually computed.
		std::ofstream out("output.txt");
		for (int i = 0; i < N; ++i) {
			for (int j = 0; j < N; ++j) out << c[i * N + j]<<" ";
			out << std::endl;
		}
	} else {
		printf("Something wrong\n");
	}
	// cudaThreadExit() is deprecated; cudaDeviceReset() replaces it.
	cudaDeviceReset();

	// host free 
	delete[] a;
	delete[] b;
	delete[] c;
	return cudaStatus == cudaSuccess ? 0 : 1;
}
// Host wrapper: allocates device buffers for the N x N operands, copies the
// inputs over, launches the multiply kernel, and copies the product back into
// c. Returns the first CUDA error encountered (cudaSuccess on success).
cudaError_t matrixMultiplyWithCuda(float *a, float *b, float *c, size_t N) {
	float *dev_a = 0;
	float *dev_b = 0;
	float *dev_c = 0;
	// Launch configuration is declared before any goto so no jump skips a
	// dim3 initialization (jumping over one is ill-formed C++).
	// Ceil-division covers N not divisible by 32 without a wasted extra block.
	dim3 threadPerBlock(32, 32);
	dim3 numBlocks((N + threadPerBlock.x - 1) / threadPerBlock.x,
	               (N + threadPerBlock.y - 1) / threadPerBlock.y);
	cudaError_t cudaStatus;

	// sizeof(float), not sizeof(int): the same size today, but the wrong
	// element type would silently break if the data type ever changed.
	// Check every allocation: a failed cudaMalloc leaves a null pointer and
	// makes every later call fail mysteriously.
	cudaStatus = cudaMalloc((void**)&dev_a, N * N * sizeof(float));
	if (cudaStatus != cudaSuccess) goto Error;
	cudaStatus = cudaMalloc((void**)&dev_b, N * N * sizeof(float));
	if (cudaStatus != cudaSuccess) goto Error;
	cudaStatus = cudaMalloc((void**)&dev_c, N * N * sizeof(float));
	if (cudaStatus != cudaSuccess) goto Error;

	cudaStatus = cudaMemcpy(dev_a, a, N * N * sizeof(float), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) goto Error;
	cudaStatus = cudaMemcpy(dev_b, b, N * N * sizeof(float), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		printf("Something wrong\n");
		goto Error;
	}
	// kernel invocation
	MatrixMultiply<<<numBlocks, threadPerBlock>>>(dev_a, dev_b, dev_c, (int)N);
	// A kernel launch returns nothing; the original tested a stale cudaStatus
	// here. Launch/config errors must be fetched via cudaGetLastError().
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		printf( "Calculate wrong\n");
		goto Error;
	}
	// This blocking copy also synchronizes with the kernel before reading dev_c.
	cudaStatus = cudaMemcpy(c, dev_c, N * N * sizeof(float), cudaMemcpyDeviceToHost);
Error:
	cudaFree(dev_a);
	cudaFree(dev_b);
	cudaFree(dev_c);
	return cudaStatus;
}