I am new to CUDA programming and writing a simple CUDA program to do simple calculations but when I am printing a particular float array back on CPU it is giving me segmentation fault. I am stuck at this please help!
#include <cuda.h>
#include <cuComplex.h>
#include <thrust/complex.h>
#include <cuda_runtime.h>
#include <math.h>
#include "generate.h" // it is used to generate vectors X1,X2,Y1,Y2
#include <bits/stdc++.h>
using namespace std;
__global__ void shdce(float *dx1, float *dx2, float *dy1, float *dy2,float *dax, float *dbx, float *day, float *dby, float *dsr, float *deta1, float *deta2)
{
int ii = threadIdx.x;
dax[ii] = (dx2[ii] - dx1[ii])/2.0;
dbx[ii] = (dx2[ii] + dx1[ii])/2.0;
day[ii] = (dy2[ii] - dy1[ii])/2.0;
dby[ii] = (dy2[ii] - dy1[ii])/2.0;
// element normal vector (float)
dsr[ii] = sqrt(pow(dax[ii],2) + pow(day[ii],2));
deta1[ii] = (dy2[ii]-dy1[ii])/(2.0*dsr[ii]);
deta2[ii] = (dx2[ii]-dx1[ii])/(2.0*dsr[ii]);
}
int main()
{
vector<float> X1, Y1, X2, Y2;
int size1, size2;
float *dx1, *dx2, *dy1, *dy2, *dax, *dbx, *day, *dby, *dsr, *deta1, *deta2;
X1 = generate1(); //X1=[0:10:1]
size1 = X1.size();
Y1 = generate3(size1); //Y1=zeroes (sizeof X1)
X2 = generate2(); //X2=[1:11:1]
size2 = X2.size();
Y2 = generate3(size2); //Y2=zeroes (sizeof X2)
float eta2[size1];
for(int i = 0; i < size1; i++)
{
eta2[i]=0.0;
}
cudaMalloc( (void**)&dx1, size1 * sizeof(float) );
cudaMalloc( (void**)&dx2, size2 * sizeof(float) );
cudaMalloc( (void**)&dy1, size1 * sizeof(float) );
cudaMalloc( (void**)&dy2, size2 * sizeof(float) );
cudaMalloc( (void**)&dax, size1 * sizeof(float) );
cudaMalloc( (void**)&dbx, size1 * sizeof(float) );
cudaMalloc( (void**)&day, size1 * sizeof(float) );
cudaMalloc( (void**)&dby, size1 * sizeof(float) );
cudaMalloc( (void**)&dsr, size1 * sizeof(float) );
cudaMalloc( (void**)&deta1, size1 * sizeof(float) );
cudaMalloc( (void**)&deta2, size1 * sizeof(float) );
cudaMemcpy( dx1, &X1, size1 * sizeof(float), cudaMemcpyHostToDevice );
cudaMemcpy( dx2, &X2, size1 * sizeof(float), cudaMemcpyHostToDevice );
cudaMemcpy( dy1, &Y1, size1 * sizeof(float), cudaMemcpyHostToDevice );
cudaMemcpy( dy2, &Y2, size1 * sizeof(float), cudaMemcpyHostToDevice );
cudaMemcpy( deta2, &eta2, size1 * sizeof(float), cudaMemcpyHostToDevice );
dim3 dimBlock( size1, 1 );
dim3 dimGrid( 1, 1 );
shdce <<< dimGrid, dimBlock >>> dx1,dx2,dy1,dy2,dax,dbx,day,dby,dsr,deta1,deta2);
cudaMemcpy( eta2, deta2, size1 * sizeof(float), cudaMemcpyDeviceToHost);
for(int i=0;i<size1;i++)
{
printf("%f \n",deta2[i]);
}
return 0;
}