
I've written a CUDA C program to parallelize matrix multiplication, with the matrices stored as 1D arrays in row-major order. I can't see why it shouldn't be working, whether the issue is with the pointers or with the kernel code. Help will be appreciated, thanks! (I know I have already asked this question before; however, this time I have taken the parallelization a step further and written the code so that each thread calculates a single element of the result matrix instead of a single row. I have also checked that the nvidia_uvm module responsible for CUDA device memory is currently loaded on my OS, so that isn't the issue; the problem must lie in my source code.)

Code:

#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <stdlib.h>

__global__ void multiplyMatricesKernel(float* d_x, float* d_y, float* d_z, int m, int n, int p)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.x + threadIdx.y;

    if(i < p && j < m)
    {
        for(int k = 0; k < n; ++k)
        {
            d_z[j * p + i] += d_x[j * n + k] * d_y[k * p + i];
        }
    }
}

void multiplyMatrices(float* x, float* y, float* z, int m, int n, int p)
{
    dim3 dimOfGrid(ceil(m * p) / 64.0);
    dim3 dimOfBlock(64, 64);

    size_t elements_x = m * n * sizeof(float);
    size_t elements_y = n * p * sizeof(float);
    size_t elements_z = m * p * sizeof(float);

    float* d_x;
    float* d_y;
    float* d_z;

    cudaMalloc((void**) &d_x, elements_x);
    cudaMalloc((void**) &d_y, elements_y);
    cudaMalloc((void**) &d_z, elements_z);

    cudaMemcpy(d_x, x, elements_x, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_y, y, elements_y, cudaMemcpyHostToDevice);

    multiplyMatricesKernel<<<dimOfGrid, dimOfBlock>>>(d_x, d_y, d_z, m, n, p);

    cudaMemcpy(z, d_z, elements_z, cudaMemcpyDeviceToHost);

    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
}

int main()
{
    srand(time(NULL));

    size_t m = rand() % 8 + 1;
    size_t n = rand() % 8 + 1;
    size_t p = rand() % 8 + 1;

    float x[m * n] = {0};
    float y[n * p] = {0};
    float z[m * p] = {0};

    printf("X =\n[");
    for(int i = 0; i < sizeof(x) / sizeof(float); ++i)
    {
        x[i] = rand() % 129 - 64;
        printf("%.1f ", x[i]);
        if((i + 1) % n == 0 && i != (sizeof(x) / sizeof(float) - 1))
        {
            printf("]\n[");
        }
        if(i == (sizeof(x) / sizeof(float) - 1))
        {
            printf("]\n\n");
        }
    }
    
    printf("Y = \n[");
    for(int i = 0; i < sizeof(y) / sizeof(float); ++i)
    {
        y[i] = rand() % 129 - 64;
        printf("%.1f ", y[i]);
        if((i + 1) % p == 0 && i != (sizeof(y) / sizeof(float) - 1))
        {
            printf("]\n[");
        }
        if(i == (sizeof(y) / sizeof(float) - 1))
        {
            printf("]\n\n");
        }
    }

    multiplyMatrices(x, y, z, m, n, p);

    printf("Z = \n[");
    for(int i = 0; i < sizeof(z) / sizeof(float); ++i)
    {   
        printf("%.1f ", z[i]);
        if((i + 1) % p == 0 && i != (sizeof(z) / sizeof(float) - 1))
        {
            printf("]\n[");
        }
        if(i == (sizeof(z) / sizeof(float) - 1))
        {
            printf("]\n\n");
        }
    }
    return 0;
}
catfood
  • My suggestion is that you learn to use [proper CUDA error checking](https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) and also run your code with `cuda-memcheck`. My suggestion would be to do these things *before* asking others for help. If you do that, you'll get an error message which might lead you with a bit of [searching](https://stackoverflow.com/search?q=%5Bcuda%5D+invalid+configuration+argument) to discover the problem in your code. – Robert Crovella Oct 23 '20 at 22:44
  • @RobertCrovella Thank you, will do - indeed, I should've thought of this before asking! Appreciated! – catfood Oct 23 '20 at 22:56
  • CUDA threadblocks are limited to a maximum of 1024 threads per block. This limitation applies to the *product* of the threadblock dimensions. – Robert Crovella Oct 23 '20 at 23:04
  • Ah, I see - 64 * 64 = 4096 > 1024. Didn't ever think of that. Thanks! (will check to see if this solves my issue tomorrow - it is getting very late at night for me! Will mark a self-posted answer reiterating what you said here tomorrow if it works!) Although I do wonder why CUDA didn't throw a runtime error here. – catfood Oct 23 '20 at 23:10
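
For reference, a minimal sketch of the error-checking pattern the first comment links to might look like the following. The checkCuda macro name and where it is placed are illustrative choices, not something from the original post:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Illustrative helper: print the CUDA error string and abort if a call fails.
#define checkCuda(call)                                                     \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if(err_ != cudaSuccess)                                             \
        {                                                                   \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",                    \
                    __FILE__, __LINE__, cudaGetErrorString(err_));          \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while(0)

// Used around the calls in multiplyMatrices, e.g.:
//     checkCuda(cudaMalloc((void**) &d_x, elements_x));
//     multiplyMatricesKernel<<<dimOfGrid, dimOfBlock>>>(d_x, d_y, d_z, m, n, p);
//     checkCuda(cudaGetLastError());       // reports launch configuration errors
//     checkCuda(cudaDeviceSynchronize());  // reports errors raised while the kernel runs

With this in place, the posted 64x64 block (4096 threads) should make cudaGetLastError() report "invalid configuration argument", which is exactly the error message the search link in that comment points to.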
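
And a sketch of a launch configuration that stays under the 1024-threads-per-block limit discussed in the last two comments, assuming the same one-thread-per-output-element mapping as the posted kernel; the 16x16 block size is just an illustrative choice:

// Inside multiplyMatrices: cover the m x p result with 2D blocks of 16 * 16 = 256 threads.
dim3 dimOfBlock(16, 16);
dim3 dimOfGrid((p + dimOfBlock.x - 1) / dimOfBlock.x,   // enough blocks to cover the columns (index i)
               (m + dimOfBlock.y - 1) / dimOfBlock.y);  // enough blocks to cover the rows (index j)

// With a 2D grid, the kernel's row index should come from blockIdx.y rather than blockIdx.x:
//     int i = blockDim.x * blockIdx.x + threadIdx.x;   // column, 0 <= i < p
//     int j = blockDim.y * blockIdx.y + threadIdx.y;   // row,    0 <= j < m
//
// Note: cudaMalloc does not zero memory, so d_z should also be cleared before the kernel
// accumulates into it with +=, e.g. cudaMemset(d_z, 0, elements_z).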
