The matrix arguments that you pass to this cublas function need to be an array of device pointers, where each device pointer is properly allocated, properly copied, and has a proper population of its allocation.
There are at least several problems with your attempt to do this. The central problem is around these lines:
cudaMemcpy(d_A, h_A, sizeof(float)* M * K * batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, sizeof(float)* K * N * batch_size, cudaMemcpyHostToDevice);
You cannot copy an array of independent allocations that way, and furthermore you are copying pointer data using sizes and types as if it were matrix contents (float *
vs. float
).
The following example has various issues fixed:
$ cat t2167.cu
#include <iostream>
#include <cublas_v2.h>
#define M 4
#define N 4
#define K 4
//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
void print_matrix(float **A, int rows, int cols, int batch_size) {
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < rows; j++){
for(int k = 0; k < cols; k++){
std::cout << A[i][k * rows + j] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
}
int main(int argc, char* argv[])
{
// Linear dimension of matrices
int batch_size = 2;
float *h_A[batch_size], *h_B[batch_size], *h_C[batch_size];
for (int i = 0; i < batch_size; i++){
h_A[i] = (float*)malloc(M * K * sizeof(float));
h_B[i] = (float*)malloc(K * N * sizeof(float));
h_C[i] = (float*)malloc(M * N * sizeof(float));
}
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < M * K; j++)
h_A[i][j] = j%4;
for (int j = 0; j < K * N; j++)
h_B[i][j] = j%4 + 4;
for (int j = 0; j < M * N; j++)
h_C[i][j] = 0;
}
std::cout << "A =" << std::endl;
print_matrix(h_A, M, K, batch_size);
std::cout << "B =" << std::endl;
print_matrix(h_B, K, N, batch_size);
float *d_A[batch_size], *d_B[batch_size], *d_C[batch_size];
for (int i = 0; i < batch_size; i++){
cudaMalloc(&d_A[i], sizeof(float)* M * K);
cudaMemcpy(d_A[i], h_A[i], sizeof(float)*M*K, cudaMemcpyHostToDevice);
cudaMalloc(&d_B[i], sizeof(float)* K * N);
cudaMemcpy(d_B[i], h_B[i], sizeof(float)*N*K, cudaMemcpyHostToDevice);
cudaMalloc(&d_C[i], sizeof(float)* M * N);
cudaMemcpy(d_C[i], h_C[i], sizeof(float)*N*M, cudaMemcpyHostToDevice);
}
float **d_dA, **d_dB, **d_dC;
cudaMalloc(&d_dA, sizeof(float *)*batch_size);
cudaMalloc(&d_dB, sizeof(float *)*batch_size);
cudaMalloc(&d_dC, sizeof(float *)*batch_size);
cudaMemcpy(d_dA, d_A, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_dB, d_B, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_dC, d_C, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
// Set up the matrix dimensions and batch size
int lda = M;
int ldb = K;
int ldc = M;
// Set the alpha and beta parameters for the gemm operation
float alpha = 1.0f;
float beta = 0.0f;
status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
&alpha,
(const void**)d_dA, CUDA_R_32F, lda,
(const void**)d_dB, CUDA_R_32F, ldb,
&beta,
(void**)d_dC, CUDA_R_32F, ldc,
batch_size,
CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
for (int i = 0; i < batch_size; i++)
cudaMemcpy(h_C[i], d_C[i], sizeof(float)*M*N, cudaMemcpyDeviceToHost);
if (status == CUBLAS_STATUS_SUCCESS) {
std::cout << "C =" << std::endl;
print_matrix(h_C, M, N, batch_size);
} else {
std::cout << status << std::endl;
}
// Destroy the handle
cublasDestroy(handle);
cudaFree(d_dA);
cudaFree(d_dB);
cudaFree(d_dC);
for (int i = 0; i < batch_size; i++){
free(h_A[i]);
free(h_B[i]);
free(h_C[i]);
cudaFree(d_A[i]);
cudaFree(d_B[i]);
cudaFree(d_C[i]);}
}
$ nvcc -o t2167 t2167.cu -lcublas
$ compute-sanitizer ./t2167
========= COMPUTE-SANITIZER
A =
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3
B =
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7
C =
0 0 0 0
22 22 22 22
44 44 44 44
66 66 66 66
0 0 0 0
22 22 22 22
44 44 44 44
66 66 66 66
========= ERROR SUMMARY: 0 errors
$
given that cublasGemmBatchedEx()
requires that all the matrices across the batch be of identical sizes, and given that we are starting with a clean slate for this exercise, we can use a somewhat simpler realization by concatenating matrices so we can do single allocations for the batch. This won't work for every use case, but may be of interest:
#include <iostream>
#include <cublas_v2.h>
#define M 4
#define N 4
#define K 4
//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
void print_matrix(float *A, int rows, int cols, int batch_size) {
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < rows; j++){
for(int k = 0; k < cols; k++){
std::cout << A[i*rows*cols+k * rows + j] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
}
int main(int argc, char* argv[])
{
// Linear dimension of matrices
int batch_size = 2;
float *h_A, *h_B, *h_C;
h_A = (float*)malloc(M * K * sizeof(float) * batch_size);
h_B = (float*)malloc(K * N * sizeof(float) * batch_size);
h_C = (float*)malloc(M * N * sizeof(float) * batch_size);
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < M * K; j++)
h_A[i*M*K+j] = j%4;
for (int j = 0; j < K * N; j++)
h_B[i*K*N+j] = j%4 + 4;
for (int j = 0; j < M * N; j++)
h_C[i*M*N+j] = 0;
}
std::cout << "A =" << std::endl;
print_matrix(h_A, M, K, batch_size);
std::cout << "B =" << std::endl;
print_matrix(h_B, K, N, batch_size);
float *d_A, *d_B, *d_C;
cudaMalloc(&d_A, sizeof(float)* M * K*batch_size);
cudaMemcpy(d_A, h_A, sizeof(float)*M*K*batch_size, cudaMemcpyHostToDevice);
cudaMalloc(&d_B, sizeof(float)* K * N*batch_size);
cudaMemcpy(d_B, h_B, sizeof(float)*N*K*batch_size, cudaMemcpyHostToDevice);
cudaMalloc(&d_C, sizeof(float)* M * N*batch_size);
cudaMemcpy(d_C, h_C, sizeof(float)*N*M*batch_size, cudaMemcpyHostToDevice);
float *h_dA[batch_size], *h_dB[batch_size], *h_dC[batch_size];
for (int i = 0; i < batch_size; i++){
h_dA[i] = d_A+i*M*K;
h_dB[i] = d_B+i*K*N;
h_dC[i] = d_C+i*M*N;}
float **d_dA, **d_dB, **d_dC;
cudaMalloc(&d_dA, sizeof(float *)*batch_size);
cudaMalloc(&d_dB, sizeof(float *)*batch_size);
cudaMalloc(&d_dC, sizeof(float *)*batch_size);
cudaMemcpy(d_dA, h_dA, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_dB, h_dB, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_dC, h_dC, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
// Set up the matrix dimensions and batch size
int lda = M;
int ldb = K;
int ldc = M;
// Set the alpha and beta parameters for the gemm operation
float alpha = 1.0f;
float beta = 0.0f;
status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
&alpha,
(const void**)d_dA, CUDA_R_32F, lda,
(const void**)d_dB, CUDA_R_32F, ldb,
&beta,
(void**)d_dC, CUDA_R_32F, ldc,
batch_size,
CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
cudaMemcpy(h_C, d_C, sizeof(float)*M*N*batch_size, cudaMemcpyDeviceToHost);
if (status == CUBLAS_STATUS_SUCCESS) {
std::cout << "C =" << std::endl;
print_matrix(h_C, M, N, batch_size);
} else {
std::cout << status << std::endl;
}
// Destroy the handle
cublasDestroy(handle);
cudaFree(d_dA);
cudaFree(d_dB);
cudaFree(d_dC);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
}