I'm trying to calculate a matrice multiplication of size N (square matrix) but I'm getting a stack overflow error(I'm new to Cuda ):
if I test the code for N < 300 everything is fine, but if I test it with N> 300 it does not work, and a stack overflow error was displayed but there is enough memory.in my graphics card GF 820M . if N = 300 then 300 * 300 * 4(size of float) = 360000 byte : necessary space in the device to allocate for an array of type float.and here it must allocate for 3 Table to do multiplication .therefore 360000 * 3 = 1080000 bytes and if I control the CudaMalloc nothing is displayed.
I inform you that my main goal is to test for N large enough.How do I solve that? thank you in advance for any help you might be able to provide.
#include <stdio.h>
#include<device_launch_parameters.h>
#include<cuda.h>
#include<time.h>
#include<cuda_runtime.h>
#include <math.h>
__global__ void MatrixMul( float *Md , float *Nd , float *Pd , const int WIDTH )
{ // calculate thread id
unsigned int row = blockIdx.y*blockDim.y+threadIdx.y;
unsigned int col = blockIdx.x*blockDim.x+threadIdx.x;
for (int k = 0 ; k<WIDTH ; k++ )
{ Pd[row*WIDTH + col]+= Md[row * WIDTH + k ] * Nd[ k * WIDTH + col] ; }}
int main ()
{ const int i=64 ;
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
const int WIDTH =300;
cudaError_t cudaStatus;
float array1_h[WIDTH][WIDTH] ,array2_h[WIDTH][WIDTH] ,M_result_array_h[WIDTH][WIDTH];
float *array1_d , *array2_d ,*M_result_array_d ; // device array
// Allocate GPU buffers for 2 vectors (two input, one output)
cudaStatus = cudaMalloc((void **) &array1_d , WIDTH*WIDTH*sizeof (float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!"); }
cudaStatus = cudaMalloc((void **) &array2_d , WIDTH*WIDTH*sizeof (float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!"); }
for ( int i = 0 ; i<WIDTH ; i++ ) {
for (int j = 0 ; j<WIDTH ; j++ )
{ array1_h[i][j] = 1 ; array2_h[i][j] = 2 ; }}
//copy host array to device array; cudaMemcpy ( dest , source , WIDTH , direction )
cudaMemcpy ( array1_d , array1_h , WIDTH*WIDTH*sizeof (float) , cudaMemcpyHostToDevice ) ;
cudaMemcpy ( array2_d , array2_h , WIDTH*WIDTH*sizeof (float) , cudaMemcpyHostToDevice ) ;
//allocating memory for resultent device array
cudaStatus = cudaMalloc((void **) &M_result_array_d , WIDTH*WIDTH*sizeof (float) ) ;
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!"); }
//calling kernal
dim3 dimBlock( i,i, 1 ) ;
dim3 dimGrid ( ((WIDTH-1)/i) +1 , ((WIDTH-1)/i)+1 ,1 ) ;
cudaEventRecord(start, 0);
MatrixMul <<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf ("taille du probleme:%d Time for the kernel: %f \n",WIDTH,time);
//copy back result_array_d to result_array_h
cudaMemcpy(M_result_array_h , M_result_array_d , WIDTH*WIDTH*sizeof(float) , cudaMemcpyDeviceToHost) ;
//printf the result array
for (int i = 0 ; i<WIDTH ; i++ )
{ for (int j = 0 ; j < WIDTH ; j++ )
{ printf ("%f ",M_result_array_h[i][j] ) ; }
printf ("\n") ; }
cudaFree(array1_d);
cudaFree(array2_d);
cudaFree(M_result_array_h);
system("pause") ; }