1. 程式人生 > >cublas 實戰

cublas 實戰

導言

搞了一段時間的CUDA程式,雖然也不是寫不出複雜程式,但是寫個矩陣乘法都要搞個半天,還要擔心各種效能瓶頸。不過,還好有cublas,實現了非常多的數學運算,這下子可以好好利用一番。以後再也不用擔心自己寫的CUDA效率不高了。

入門

cuBLAS 的官方文件在這裡,不懂的函式可以查。

cuBLAS 和 MATLAB 非常像:儲存陣列時,預設是列優先(column-major,同一個 column 的元素在記憶體中連續存放)儲存,而且官方文件的索引是從 1 開始的。從 C 轉過來,非常不習慣,不過你可以自己定義索引巨集(例如下面的 IDX2F)來處理。

用 cublas之前你必須還是得會基本的CUDA,比如如何管理GPU視訊記憶體。這裡假設你都會了。

例項一

#include <stdio.h> 
#include <stdlib.h> #include <math.h> #include <cuda_runtime.h> #include "cublas_v2.h" #define M 6 #define N 5 #define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1)) static __inline__ void modify(cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float
beta) { cublasSscal(handle, n - p + 1, &alpha, &m[IDX2F(p, q, ldm)], ldm); cublasSscal(handle, ldm - p + 1, &beta, &m[IDX2F(p, q, ldm)], 1); } int main(void) { cudaError_t cudaStat; cublasStatus_t stat; cublasHandle_t handle; int i, j; float* devPtrA; float
* a = 0; a = (float *)malloc(M * N * sizeof(*a)); if (!a) { printf("host memory allocation failed"); return EXIT_FAILURE; } for (j = 1; j <= N; j++) { for (i = 1; i <= M; i++) { a[IDX2F(i, j, M)] = (float)((i - 1) * M + j); } } cudaStat = cudaMalloc((void**)&devPtrA, M*N*sizeof(*a)); if (cudaStat != cudaSuccess) { printf("device memory allocation failed"); return EXIT_FAILURE; } stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf("CUBLAS initialization failed\n"); return EXIT_FAILURE; } stat = cublasSetMatrix(M, N, sizeof(*a), a, M, devPtrA, M); if (stat != CUBLAS_STATUS_SUCCESS) { printf("data download failed"); cudaFree(devPtrA); cublasDestroy(handle); return EXIT_FAILURE; } modify(handle, devPtrA, M, N, 2, 3, 16.0f, 12.0f); stat = cublasGetMatrix(M, N, sizeof(*a), devPtrA, M, a, M); if (stat != CUBLAS_STATUS_SUCCESS) { printf("data upload failed"); cudaFree(devPtrA); cublasDestroy(handle); return EXIT_FAILURE; } cudaFree(devPtrA); cublasDestroy(handle); for (j = 1; j <= N; j++) { for (i = 1; i <= M; i++) { printf("%7.0f", a[IDX2F(i, j, M)]); } printf("\n"); } free(a); return EXIT_SUCCESS; }

這是官方示例的程式碼,核心是看

/*
 * Scales part of the device matrix m (column-major, leading dimension ldm,
 * n columns), using 1-based indices:
 *   - row p, columns q..n, is multiplied by alpha (stride ldm between elements);
 *   - column q, rows p..ldm, is multiplied by beta (stride 1 between elements).
 * Element (p, q) is covered by both calls and ends up scaled by alpha * beta.
 */
static __inline__ 
void modify(cublasHandle_t handle, float *m, int ldm, int n, 
            int p, int q, float alpha, float beta) {
    /* The row segment starts at column q, so it holds n - q + 1 elements.
     * Using n - p + 1 here walks past the last column whenever p < q. */
    cublasSscal(handle, n - q + 1, &alpha, &m[IDX2F(p, q, ldm)], ldm);
    cublasSscal(handle, ldm - p + 1, &beta, &m[IDX2F(p, q, ldm)], 1);
}

函式cublasSscal,看名字,cublas是字首,中間一個S表示資料是float,後面scal顯然就是乘以一個標量了。官方解釋的是:

This function scales the vector x by the scalar α and overwrites it with the result. Hence, the performed operation is $x[j] = \alpha \times x[j]$ for $i = 1, \dots, n$ and $j = 1 + (i - 1) \cdot \text{incx}$. Notice that the last two equations reflect 1-based indexing used for compatibility with Fortran.

函式原型是

cublasStatus_t cublasSscal(cublasHandle_t handle, int n, const float *alpha, float *x, int incx)

官網上有個0-index的示例,自己看看。

實戰二:

接下來我們自己寫個小程式,功能是
Z=αAX+βY
A是矩陣,X, Y,Z是列向量。
首先,這個功能對應的函式就是
cublasSgemv
根據原型:

cublasStatus_t cublasSgemv(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float *alpha, const float *A, int lda, const float *x, int incx, const float *beta, float *y, int incy)

我們的矩陣假設就是C風格的,從0開始,而且行優先(row-major,同一個 row 的元素在記憶體中連續存放)儲存。所以,在運算之前必須先進行矩陣的轉置,把資料排成 cuBLAS 預期的 column-major 佈局。注意:這裡所說的所有矩陣都是以一維儲存的。

// Transposes a rows x cols row-major matrix in place: on return, mat holds
// the cols x rows transpose in row-major order (equivalently, the original
// matrix in column-major order).
// A NULL matrix or a non-positive dimension is treated as "nothing to do".
// The routine needs a temporary copy of the matrix; if that allocation fails
// it reports the error and terminates, because the function has no way to
// signal failure to its caller.
void trans_mat(float* mat, int rows, int cols ) {
    if (NULL == mat || rows <= 0 || cols <= 0)
        return;

    // Compute sizes in size_t so rows*cols cannot overflow int.
    const size_t count = (size_t)rows * (size_t)cols;
    float* tmp = (float*)malloc(count * sizeof(float));
    if (NULL == tmp) {
        fprintf(stderr, "trans_mat: host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }
    memcpy(tmp, mat, count * sizeof(float));

    // Walk the output sequentially; read the source column by column.
    size_t out = 0;
    for (int c = 0; c < cols; ++c) {
        for (int r = 0; r < rows; ++r) {
            mat[out++] = tmp[(size_t)r * cols + c];
        }
    }
    free(tmp);
}

這個矩陣轉置其實是以犧牲空間來換取時間,也可以嘗試使用傳送門的方法來用時間換空間。

所以,接下來就是申請一個矩陣A,向量X,Y,Z。然後呼叫 cublasSgemv 就好了,比較簡單。
下面把程式碼貼出來吧,沒有整理,有點亂。。。

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <memory.h>

#include "cublas_v2.h"
#include "cuda_runtime.h"

// Maps 0-based (row i, column j) to the offset inside a column-major array
// whose leading dimension is ld.
#define IDX2C(i, j, ld) ((j)*(ld) +(i))
// Reports a non-success status together with the source location.
// The argument is evaluated exactly once (a failing call is therefore not
// re-issued while formatting the message), and the do/while(0) wrapper makes
// the macro behave as a single statement inside if/else.
// The status is compared as an int because the file passes both cuBLAS and
// CUDA runtime return codes through this macro.
#define CHECK_CUBLAS_ERROR(state) \
    do { \
        const int check_status_ = (int)(state); \
        if ((int)CUBLAS_STATUS_SUCCESS != check_status_) \
            printf("ERROR state %d in file %s at line %d.\n", check_status_, __FILE__, __LINE__); \
    } while (0)

void matrixAdd(float* dst, float* A, float* x, float* y, int rows, int cols) {
    float alpha = 1.0f;
    float beta = 0.0f;
    float *dev_mat1 = 0, *dev_x = 0, *dev_y = 0;
    CHECK_CUBLAS_ERROR(cudaMalloc((void**)&dev_mat1, (rows*cols)*sizeof(float)));
    CHECK_CUBLAS_ERROR(cudaMalloc((void**)&dev_x, (cols)*sizeof(float)));
    CHECK_CUBLAS_ERROR(cudaMalloc((void**)&dev_y, (rows)*sizeof(float)));
    CHECK_CUBLAS_ERROR(cublasSetMatrix(rows, cols, sizeof(float), A, rows, dev_mat1, rows));
    CHECK_CUBLAS_ERROR(cublasSetVector(cols, sizeof(float), x, 1, dev_x, 1));

    cublasHandle_t matAddHandle;
    cublasCreate(&matAddHandle);

    CHECK_CUBLAS_ERROR(cublasSgemv_v2(matAddHandle, CUBLAS_OP_N, rows, cols, 
        &alpha, dev_mat1, rows, dev_x, 1, &beta, dev_y, 1));

    CHECK_CUBLAS_ERROR(cublasGetVector(rows, sizeof(float), dev_y, 1, dst, 1));
    CHECK_CUBLAS_ERROR( cublasDestroy(matAddHandle));
    CHECK_CUBLAS_ERROR(cudaFree(dev_y));
    CHECK_CUBLAS_ERROR(cudaFree(dev_x));
    CHECK_CUBLAS_ERROR(cudaFree(dev_mat1));
}

// Fills the first rows*cols entries of dst with pseudo-random floats taken
// from rand() and rescaled into [min_val, max_val].
// A NULL destination terminates the program with exit code -1.
void gen_init_matrix(float* dst, int rows, int cols, float min_val = 0, float max_val = 1) {
    if (dst == NULL) {
        exit(-1);
    }

    const int total = rows * cols;
    const float span = max_val - min_val;
    int idx = 0;
    while (idx < total) {
        // rand() / RAND_MAX lies in [0, 1]; stretch and shift it into range.
        const float unit = (1.0f * (rand( )) / RAND_MAX);
        dst[idx] = unit * span + min_val;
        ++idx;
    }
}

// Transposes a rows x cols row-major matrix in place: on return, mat holds
// the cols x rows transpose in row-major order (equivalently, the original
// matrix in column-major order).
// A NULL matrix or a non-positive dimension is treated as "nothing to do".
// The routine needs a temporary copy of the matrix; if that allocation fails
// it reports the error and terminates, because the function has no way to
// signal failure to its caller.
void trans_mat(float* mat, int rows, int cols ) {
    if (NULL == mat || rows <= 0 || cols <= 0)
        return;

    // Compute sizes in size_t so rows*cols cannot overflow int.
    const size_t count = (size_t)rows * (size_t)cols;
    float* tmp = (float*)malloc(count * sizeof(float));
    if (NULL == tmp) {
        fprintf(stderr, "trans_mat: host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }
    memcpy(tmp, mat, count * sizeof(float));

    // Walk the output sequentially; read the source column by column.
    size_t out = 0;
    for (int c = 0; c < cols; ++c) {
        for (int r = 0; r < rows; ++r) {
            mat[out++] = tmp[(size_t)r * cols + c];
        }
    }
    free(tmp);
}

// CPU reference for the matrix-vector product: for every row i of the
// row-major rows x cols matrix A, prints sum_j A[i][j] * x[j] * alpha,
// formatted as "%2.6f" followed by a tab.
// y and beta are accepted but not used: the "+ beta * y[i]" term is
// deliberately left out, so only the alpha * A * x part is computed.
// No scratch buffer is allocated; each row result is printed as soon as it
// is available.
void cpu_test(int rows, int cols, float* A, float* x, float* y, float alpha, float beta) {
    (void)y;
    (void)beta;
    for (int i = 0; i < rows; ++i) {
        float sum = 0.0f;
        for (int j = 0; j < cols; ++j) {
            sum += A[i*cols + j] * x[j] * alpha;
        }
        printf("%2.6f\t", sum);
    }
}

// Demo driver: creates a random 10 x 9 matrix and two random vectors, prints
// the inputs, prints the CPU reference product, transposes the matrix in
// place, prints it again, runs the cuBLAS path and prints its result.
void test( ) {
    const int rows = 10;
    const int cols = 9;

    // Host buffers: the matrix (rows x cols), the input vector (cols) and a
    // vector of length rows that later receives the GPU result.
    float* matA = (float*)malloc(rows*cols*(sizeof(float)));
    float* vecX = (float*)malloc(cols*sizeof(float));
    float* vecY = (float*)malloc(rows*sizeof(float));

    // The fill order fixes which rand() draws end up in which buffer.
    gen_init_matrix(matA, rows, cols, 0, 1);
    gen_init_matrix(vecX, cols, 1, 0, 1);
    gen_init_matrix(vecY, rows, 1, 0, 1);

    // Dump the matrix: one line per row of `cols` values.
    for (int k = 0; k < rows*cols; ++k) {
        printf("%f\t", matA[k]);
        if ((k + 1) % cols == 0)
            printf("\n");
    }
    printf("\n");

    // Dump the input vector on a single line.
    int c = 0;
    while (c < cols) {
        printf("%f  ", vecX[c]);
        ++c;
    }
    printf("\n");

    printf("CPU result!\n");
    cpu_test(rows, cols, matA, vecX, vecY, 1.0f, 0.0f);
    printf("\n");

    // Transpose in place, then dump the buffer as `cols` lines of `rows` values.
    trans_mat(matA, rows, cols);
    printf("trans\n");
    for (int k = 0; k < cols*rows; ++k) {
        printf("%f\t", matA[k]);
        if ((k + 1) % rows == 0)
            printf("\n");
    }
    printf("\n");

    // GPU path: vecY is passed both as the destination and as the y argument.
    matrixAdd(vecY, matA, vecX, vecY, rows, cols);

    int r = 0;
    while (r < rows) {
        printf("%f  ", vecY[r]);
        ++r;
    }
    printf("\n");

    free(matA);
    free(vecX);
    free(vecY);
}


// Entry point: seeds rand() with a fixed value so every run generates the
// same input data, then runs the demo.
int main( ) {
    const unsigned int seed = 20161003;
    srand(seed);

    test( );

    return 0;
}