I am pretty new to CUDA and I'm very struggling with converting a C code to CUDA C, it builds successfully but it keeps crashing. Triple loop function is wrong for sure and I have no idea what should I change.
Function call:
for (z=0;z<=max;z++)
{
correlationsum=coefficient(x, n, dim, z);
printf("result for epsilon %d returns %d\n", z, correlation_sum);
}
Function
long coefficient(int vctr[40000], long numberofpoints, int coefficientrow, int epsilon)
{
long i, j, k, sum, numberofpairs;
long sq_epsilon;
sq_epsilon=epsilon*epsilon;
numberofpairs=0;
for (i=1;i<=numberofpoints-coefficientrow;i++)
{
sum=0;
for (j=i+1;j<=numberofpoints+1-coefficientrow;j++)
{
for (k=0;k<coefficientrow;k++)
{
sum=sum+(vctr[i+k]-vctr[j+k])*(vctr[i+k]-vctr[j+k]);
}
if(sum<sq_epsilon)
{
numberofpairs++;
sum=0;
}
}
}
return (numberofpairs);
}
I have problems limiting the function in GPU part, so it doesn't go out of bounds (e.g. k is less than coefficientrow above). I saw that it is possible to assign block/threadids and use if function. I have tried it but in triple for loop it is kinda... strange.
Here is almost full code.
#define THREADS 1024
__global__ void coefficient(int *vctr, int numberofpoints, int coefficient_row, int epsilon, int *numbofpairs){
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k = blockIdx.z * blockDim.z + threadIdx.z;
int sum;
numbofpairs = 0;
int sq_epsilon = epsilon*epsilon;
if (i <= numberofpoints - coefficient_row)
{
sum = 0;
if (j <= numberofpoints + 1 - coefficient_row)
{
if (k < coefficient_row)
sum = sum + (vctr[i + k] - vctr[j + k])*(vctr[i + k] - vctr[j + k]);
if (sum < sq_epsilon){
numbofpairs++;
sum = 0;
}}}}
int main()
{
int n, dim, max, z;
int *d_n, *d_dim, *d_z, *d_x, *d_numbofpairs;
int x[40000], correlation_sum = 0;
n=10;
max=10;
dim=3;
cudaMalloc((void **)&d_n, sizeof(int));
cudaMalloc((void **)&d_dim, sizeof(int));
cudaMalloc((void **)&d_z, sizeof(int));
cudaMalloc((void **)&d_x, sizeof(int));
cudaMalloc((void **)&d_numbofpairs, sizeof(int));
cudaMemcpy(d_n, &n, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_dim, &dim, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_x, &x, sizeof(int), cudaMemcpyHostToDevice);
for (z = 0; z <= max; z++)
{
cudaMemcpy(d_z, &z, sizeof(int), cudaMemcpyHostToDevice);
coefficient << <1, THREADS >> >(d_x, *d_n, *d_dim, *d_z, d_numbofpairs);
cudaMemcpy(&correlation_sum, d_numbofpairs, sizeof(int), cudaMemcpyDeviceToHost);
printf("result for epsilon %d returns %d\n", z, correlation_sum);
}
cudaFree(d_n);
cudaFree(d_dim);
cudaFree(d_z);
cudaFree(d_x);
cudaFree(d_numbofpairs);
return 0;
}
I would like some help or tips what to change, what is wrong and why it keeps crashing so I could fix it. Thank you!
EDIT: I completed some parts, sorry my bad. As for threads and blocks, I am very confused, GPU shows 1024 threads per block, and I'm not sure whether it's it or not.