I'm trying to code in CUDA C++ with cuRAND. I almost got what I want, except that I get weird outputs when I update the global cuRAND state from the shared memory one. If I remove that update, everything is working as expected (e.g. I get numbers between 0.0 and 1.0). If I include the update (line 22) I see negative numbers, a bunch of 0s and even some extreme numbers, like 2e+31. I also can not spot the difference to the cuRAND manual. Pretty sure it's a dumb oversight - any help is appreciated! Thank you.
Here is my code:
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#include <iostream>
#define ITER 32
__global__ void setup_kernel(curandState* state) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
curand_init(1234, idx, 0, &state[idx]);
}
__global__ void generate_kernel(curandState* curand_state, const unsigned int n, float* result_float) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
curandState localState = curand_state[idx];
if (idx < n) {
float myrandf = curand_uniform(&localState);
result_float[idx] = myrandf;
curand_state[idx] = localState;
}
}
int main() {
curandState* d_state;
cudaMalloc(&d_state, sizeof(curandState));
float* d_result_float, * h_result_float;
cudaMalloc(&d_result_float, ITER * sizeof(float));
h_result_float = (float*)malloc(ITER * sizeof(float));
int BLOCK_SIZE = 1024;
int GRID_SIZE = (ITER + BLOCK_SIZE - 1) / BLOCK_SIZE;
std::cout << "BLOCK_SIZE: " << BLOCK_SIZE << "; GRID_SIZE: " << GRID_SIZE << "\n";
setup_kernel << <GRID_SIZE, BLOCK_SIZE >> > (d_state);
generate_kernel << <GRID_SIZE, BLOCK_SIZE >> > (d_state, ITER, d_result_float);
cudaDeviceSynchronize();
cudaMemcpy(h_result_float, d_result_float, ITER * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < ITER; i++)
std::cout << h_result_float[i] << "\n";
return 0;
}
Output:
BLOCK_SIZE: 1024; GRID_SIZE: 1
0
0.820181
0
0
4.6068e-09
-1.56062e+09
-0.758724
[...]
0
0
4.6068e-09
-3.77124e-23
2.8262e+33
-3.31968e+19