Using cublasGemmBatchedEx

Question

I am trying to use cublasGemmBatchedEx to perform matrix multiplication. Here is my code.

#include <iostream>
#include <cublas_v2.h>

#define M 4
#define N 4
#define K 4


//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
void print_matrix(float **A, int rows, int cols, int batch_size) {
    for (int i = 0; i < batch_size; i++){
        for (int j = 0; j < rows; j++){
            for(int k = 0; k < cols; k++){
                std::cout << A[i][k * rows + j] << " ";
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
    
}

int main(int argc, char* argv[])
{
    // Linear dimension of matrices
    int batch_size = 2;

    float *h_A[batch_size], *h_B[batch_size], *h_C[batch_size];
    for (int i = 0; i < batch_size; i++){
        h_A[i] = (float*)malloc(M * K * sizeof(float));
        h_B[i] = (float*)malloc(K * N * sizeof(float));
        h_C[i] = (float*)malloc(M * N * sizeof(float));
    }

    for (int i = 0; i < batch_size; i++){
        for (int j = 0; j < M * K; j++)
            h_A[i][j] = j%4;
        for (int j = 0; j < K * N; j++)
            h_B[i][j] = j%4 + 4;
        for (int j = 0; j < M * N; j++)
            h_C[i][j] = 0;
    }

    std::cout << "A =" << std::endl;
    print_matrix(h_A, M, K, batch_size);
    std::cout << "B =" << std::endl;
    print_matrix(h_B, K, N, batch_size);

    float *d_A[batch_size], *d_B[batch_size], *d_C[batch_size];

    for (int i = 0; i < batch_size; i++){
        cudaMalloc(&d_A[i], sizeof(float)* M * K);
        cudaMalloc(&d_B[i], sizeof(float)* K * N);
        cudaMalloc(&d_C[i], sizeof(float)* M * N);
    }

    cudaMemcpy(d_A, h_A, sizeof(float)* M * K * batch_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, sizeof(float)* K * N * batch_size, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    // Set up the matrix dimensions and batch size
    int lda = M;
    int ldb = K;
    int ldc = M;

    // Set the alpha and beta parameters for the gemm operation
    float alpha = 1.0f;
    float beta = 0.0f;

    cublasStatus_t status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, 
                            &alpha,
                            (const void**)d_A, CUDA_R_32F, lda, 
                            (const void**)d_B, CUDA_R_32F, ldb, 
                            &beta,
                            (void**)d_C, CUDA_R_32F, ldc, 
                            batch_size,
                            CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);

    cudaMemcpy(h_C,d_C,sizeof(float) * M * N * batch_size, cudaMemcpyDeviceToHost);

    if (status == CUBLAS_STATUS_SUCCESS) {
        std::cout << "C =" << std::endl;
        print_matrix(h_C, M, N, batch_size);
    } else {
        std::cout << status << std::endl;
    }
    
    // Destroy the handle
    cublasDestroy(handle);


    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFreeHost(h_A);
    cudaFreeHost(h_B);
    cudaFreeHost(h_C);

}

This is the result when I ran this code.

A =

0 0 0 0

1 1 1 1

2 2 2 2

3 3 3 3

0 0 0 0

1 1 1 1

2 2 2 2

3 3 3 3

B =

4 4 4 4

5 5 5 5

6 6 6 6

7 7 7 7

4 4 4 4

5 5 5 5

6 6 6 6

7 7 7 7

C =

0 0 0 0

The problem is I don't get a expected result. It comes full of zeros. Is there any problem with my code?

You should check the status for all CUDA runtime API and CUBLAS calls. See [What is the canonical way to check for errors using the CUDA runtime API?](https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) for an elegant way of doing this. — paleonix, Dec 22 '22 at 12:48
Probably not the cause of this behavior, but you should not call `cudaFreeHost` on pointers that were not allocated through `cudaMallocHost`. In this case you need to loop over `h_A` etc. and use `free` on each pointer that you used `malloc` on before. — paleonix, Dec 22 '22 at 12:52

Robert Crovella · Answer 1 · 2022-12-22T17:53:21.723

The matrix arguments that you pass to this cublas function need to be an array of device pointers, where each device pointer is properly allocated, properly copied, and has a proper population of its allocation.

There are at least several problems with your attempt to do this. The central problem is around these lines:

cudaMemcpy(d_A, h_A, sizeof(float)* M * K * batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, sizeof(float)* K * N * batch_size, cudaMemcpyHostToDevice);

You cannot copy an array of independent allocations that way, and furthermore you are copying pointer data using sizes and types as if it were matrix contents (float * vs. float).

The following example has various issues fixed:

$ cat t2167.cu
#include <iostream>
#include <cublas_v2.h>

#define M 4
#define N 4
#define K 4


//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
void print_matrix(float **A, int rows, int cols, int batch_size) {
    for (int i = 0; i < batch_size; i++){
        for (int j = 0; j < rows; j++){
            for(int k = 0; k < cols; k++){
                std::cout << A[i][k * rows + j] << " ";
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }

}

int main(int argc, char* argv[])
{
    // Linear dimension of matrices
    int batch_size = 2;

    float *h_A[batch_size], *h_B[batch_size], *h_C[batch_size];
    for (int i = 0; i < batch_size; i++){
        h_A[i] = (float*)malloc(M * K * sizeof(float));
        h_B[i] = (float*)malloc(K * N * sizeof(float));
        h_C[i] = (float*)malloc(M * N * sizeof(float));
    }

    for (int i = 0; i < batch_size; i++){
        for (int j = 0; j < M * K; j++)
            h_A[i][j] = j%4;
        for (int j = 0; j < K * N; j++)
            h_B[i][j] = j%4 + 4;
        for (int j = 0; j < M * N; j++)
            h_C[i][j] = 0;
    }

    std::cout << "A =" << std::endl;
    print_matrix(h_A, M, K, batch_size);
    std::cout << "B =" << std::endl;
    print_matrix(h_B, K, N, batch_size);

    float *d_A[batch_size], *d_B[batch_size], *d_C[batch_size];

    for (int i = 0; i < batch_size; i++){
        cudaMalloc(&d_A[i], sizeof(float)* M * K);
        cudaMemcpy(d_A[i], h_A[i], sizeof(float)*M*K, cudaMemcpyHostToDevice);
        cudaMalloc(&d_B[i], sizeof(float)* K * N);
        cudaMemcpy(d_B[i], h_B[i], sizeof(float)*N*K, cudaMemcpyHostToDevice);
        cudaMalloc(&d_C[i], sizeof(float)* M * N);
        cudaMemcpy(d_C[i], h_C[i], sizeof(float)*N*M, cudaMemcpyHostToDevice);
    }
    float **d_dA, **d_dB, **d_dC;
    cudaMalloc(&d_dA, sizeof(float *)*batch_size);
    cudaMalloc(&d_dB, sizeof(float *)*batch_size);
    cudaMalloc(&d_dC, sizeof(float *)*batch_size);
    cudaMemcpy(d_dA, d_A, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dB, d_B, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dC, d_C, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);

    // Set up the matrix dimensions and batch size
    int lda = M;
    int ldb = K;
    int ldc = M;

    // Set the alpha and beta parameters for the gemm operation
    float alpha = 1.0f;
    float beta = 0.0f;
    status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
                            &alpha,
                            (const void**)d_dA, CUDA_R_32F, lda,
                            (const void**)d_dB, CUDA_R_32F, ldb,
                            &beta,
                            (void**)d_dC, CUDA_R_32F, ldc,
                            batch_size,
                            CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
    for (int i = 0; i < batch_size; i++)
        cudaMemcpy(h_C[i], d_C[i], sizeof(float)*M*N, cudaMemcpyDeviceToHost);

    if (status == CUBLAS_STATUS_SUCCESS) {
        std::cout << "C =" << std::endl;
        print_matrix(h_C, M, N, batch_size);
    } else {
        std::cout << status << std::endl;
    }

    // Destroy the handle
    cublasDestroy(handle);


    cudaFree(d_dA);
    cudaFree(d_dB);
    cudaFree(d_dC);
    for (int i = 0; i < batch_size; i++){
      free(h_A[i]);
      free(h_B[i]);
      free(h_C[i]);
      cudaFree(d_A[i]);
      cudaFree(d_B[i]);
      cudaFree(d_C[i]);}

}
$ nvcc -o t2167 t2167.cu -lcublas
$ compute-sanitizer ./t2167
========= COMPUTE-SANITIZER
A =
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3

0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3

B =
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7

4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7

C =
0 0 0 0
22 22 22 22
44 44 44 44
66 66 66 66

0 0 0 0
22 22 22 22
44 44 44 44
66 66 66 66

========= ERROR SUMMARY: 0 errors
$

given that cublasGemmBatchedEx() requires that all the matrices across the batch be of identical sizes, and given that we are starting with a clean slate for this exercise, we can use a somewhat simpler realization by concatenating matrices so we can do single allocations for the batch. This won't work for every use case, but may be of interest:

#include <iostream>
#include <cublas_v2.h>

#define M 4
#define N 4
#define K 4


//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
void print_matrix(float *A, int rows, int cols, int batch_size) {
    for (int i = 0; i < batch_size; i++){
        for (int j = 0; j < rows; j++){
            for(int k = 0; k < cols; k++){
                std::cout << A[i*rows*cols+k * rows + j] << " ";
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }

}

int main(int argc, char* argv[])
{
    // Linear dimension of matrices
    int batch_size = 2;

    float *h_A, *h_B, *h_C;
    h_A = (float*)malloc(M * K * sizeof(float) * batch_size);
    h_B = (float*)malloc(K * N * sizeof(float) * batch_size);
    h_C = (float*)malloc(M * N * sizeof(float) * batch_size);

    for (int i = 0; i < batch_size; i++){
        for (int j = 0; j < M * K; j++)
            h_A[i*M*K+j] = j%4;
        for (int j = 0; j < K * N; j++)
            h_B[i*K*N+j] = j%4 + 4;
        for (int j = 0; j < M * N; j++)
            h_C[i*M*N+j] = 0;
    }

    std::cout << "A =" << std::endl;
    print_matrix(h_A, M, K, batch_size);
    std::cout << "B =" << std::endl;
    print_matrix(h_B, K, N, batch_size);

    float *d_A, *d_B, *d_C;

    cudaMalloc(&d_A, sizeof(float)* M * K*batch_size);
    cudaMemcpy(d_A, h_A, sizeof(float)*M*K*batch_size, cudaMemcpyHostToDevice);
    cudaMalloc(&d_B, sizeof(float)* K * N*batch_size);
    cudaMemcpy(d_B, h_B, sizeof(float)*N*K*batch_size, cudaMemcpyHostToDevice);
    cudaMalloc(&d_C, sizeof(float)* M * N*batch_size);
    cudaMemcpy(d_C, h_C, sizeof(float)*N*M*batch_size, cudaMemcpyHostToDevice);
    float *h_dA[batch_size], *h_dB[batch_size], *h_dC[batch_size];
    for (int i = 0; i < batch_size; i++){
      h_dA[i] = d_A+i*M*K;
      h_dB[i] = d_B+i*K*N;
      h_dC[i] = d_C+i*M*N;}
    float **d_dA, **d_dB, **d_dC;
    cudaMalloc(&d_dA, sizeof(float *)*batch_size);
    cudaMalloc(&d_dB, sizeof(float *)*batch_size);
    cudaMalloc(&d_dC, sizeof(float *)*batch_size);
    cudaMemcpy(d_dA, h_dA, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dB, h_dB, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dC, h_dC, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);

    // Set up the matrix dimensions and batch size
    int lda = M;
    int ldb = K;
    int ldc = M;

    // Set the alpha and beta parameters for the gemm operation
    float alpha = 1.0f;
    float beta = 0.0f;
    status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
                            &alpha,
                            (const void**)d_dA, CUDA_R_32F, lda,
                            (const void**)d_dB, CUDA_R_32F, ldb,
                            &beta,
                            (void**)d_dC, CUDA_R_32F, ldc,
                            batch_size,
                            CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
    cudaMemcpy(h_C, d_C, sizeof(float)*M*N*batch_size, cudaMemcpyDeviceToHost);

    if (status == CUBLAS_STATUS_SUCCESS) {
        std::cout << "C =" << std::endl;
        print_matrix(h_C, M, N, batch_size);
    } else {
        std::cout << status << std::endl;
    }

    // Destroy the handle
    cublasDestroy(handle);


    cudaFree(d_dA);
    cudaFree(d_dB);
    cudaFree(d_dC);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
}

Using cublasGemmBatchedEx

1 Answers1