This is my minimal not-working test case. I'm running this on the Stampede cluster (where CUDA/the rest of the environment) is automatically set up, so there should be no issues there.
When I run it, the output host b
array never changes (nor is it even copied from the device). My output is 0.0
, 0.0
- it should obviously be 100.0
, 100.0
.
#include <cuda.h>
#include <stdio.h>
struct point {
double x,y;
};
__global__ void MyFunc(point* d_a) {
d_a->x = 100.0;
d_a->y = 100.0;
}
int main(void) {
point * a = (point*)malloc(sizeof(point));
a->x=10.0;
a->y=10.0;
point * d_a;
cudaMalloc((void**)&d_a,sizeof(point));
cudaMemcpy(d_a,a,sizeof(point),cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
MyFunc<<<1,1>>>(d_a);
cudaDeviceSynchronize();
point * b = (point*)malloc(sizeof(point));
cudaMemcpy(b,d_a,sizeof(point),cudaMemcpyDeviceToHost);
printf("%lf %lf\n",b->x,b->y);
cudaFree(d_a);
free(a);
free(b);
return 0;
}
Code compiles fine with CUDA 6.5.12 using:
nvcc -c -O3 -arch=compute_35 -code=sm_35 test.cu
Running it can be done using the ibrun ./test
command on an interactive terminal session (idev -n 1 -N 1
).
No errors/segfaults are reported during run; it just gives the wrong output. Is it something wrong with the code, or am I just running it incorrectly?