When i'm running this code on my system, after some seconds my system get stuck and i have to restart system again. So my question is what's i'm doing wrong here? Any suggestion will appreciated.
__global__ void matMul(float* d_M, float* d_N, float* d_P, int width) {
int row = blockIdx.y*width + threadIdx.y;
int col = blockIdx.x*width + threadIdx.x;
if (row < width && col < width) {
float product_val = 0;
for (int k = 0; k < width; k++) {
product_val += d_M[row*width + k] * d_N[k*width + col];
}
d_P[row*width + col] = product_val;
}
}
int main() {
const int n = 9;
float* d_M;
float* d_N;
float* d_P;
cudaMallocManaged(&d_M, SIZE * sizeof(float));
cudaMallocManaged(&d_N, SIZE * sizeof(float));
cudaMallocManaged(&d_P, SIZE * sizeof(float));
for (int i = 0; i < n; ++i) {
d_P[i] = 0;
}
int count = 0;
for (int i = 0; i < n; ++i) {
d_N[i] = ++count;
}
count = 0;
for (int i = 0; i < n; ++i) {
d_M[i] = ++count;
}
matMul <<<1, n>>> (d_M, d_N, d_P, 3);
cudaDeviceSynchronize();
for (int i = 0; i < n; ++i) {
printf("%f\n", d_P[i]);
}
cudaFree(d_N);
cudaFree(d_M);
cudaFree(d_P);
return 0;
}