I have a program which (for now) calculates values of two functions in random points on GPU , sends these values back to host, and then visualizes them. This is what I get, some nice semi-random points:
Now, if I modify my kernel code, and add the local array initalization code at the very end,
__global__ void optymalize(curandState * state, float* testPoints)
{
int ind=blockDim.x*blockIdx.x+threadIdx.x;
int step=blockDim.x*gridDim.x;
for(int i=ind*2;i<NOF*TEST_POINTS;i+=step*2)
{
float* x=generateX(state);
testPoints[i]=ZDT_f1(x);
testPoints[i+1]=ZDT_f2(x);
}
//works fine with 'new'
//float* test_array=new float[2];
float test_array[2]={1.0f,2.0f};
}
I get something like this everytime:
Does anyone know the cause of this behavior? All the drawn points are computed BEFORE test_array is initialized, yet they are affected by it. It doesn't happen when I initialize test_array before the 'for' loop.
Host/device code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand_kernel.h"
#include "device_functions.h"
#include <random>
#include <iostream>
#include <time.h>
#include <fstream>
using namespace std;
#define XSIZE 5
#define TEST_POINTS 100
#define NOF 2
#define BLOCK_COUNT 64
#define THR_COUNT 128
#define POINTS_PER_THREAD (NOF*TEST_POINTS+THR_COUNT*BLOCK_COUNT-1)/(THR_COUNT*BLOCK_COUNT)
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=false)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__device__ float g(float* x)
{
float tmp=1;
for(int i=1;i<XSIZE;i++)
tmp*=x[i];
return 1+9*(tmp/(XSIZE-1));
}
__device__ float ZDT_f1(float* x)
{
return x[0];
}
__device__ float ZDT_f2(float* x)
{
float gp=g(x);
return gp*(1-sqrtf(x[0]/gp));
}
__device__ bool oneDominatesTwo(float* x1, float* x2)
{
for(int i=0;i<XSIZE;i++)
if(x1[i]>=x2[i])
return false;
return true;
}
__device__ float* generateX(curandState* globalState)
{
int ind = threadIdx.x;
float x[XSIZE];
for(int i=0;i<XSIZE;i++)
x[i]=curand_uniform(&globalState[ind]);
return x;
}
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
int id = blockDim.x*blockIdx.x+threadIdx.x;
curand_init ( seed, id, 0, &state[id] );
}
__global__ void optymalize(curandState * state, float* testPoints)
{
int ind=blockDim.x*blockIdx.x+threadIdx.x;
int step=blockDim.x*gridDim.x;
for(int i=ind*2;i<NOF*TEST_POINTS;i+=step*2)
{
float* x=generateX(state);
testPoints[i]=ZDT_f1(x);
testPoints[i+1]=ZDT_f2(x);
}
__syncthreads();
//float* test_array=new float[2];
//test_array[0]=1.0f;
//test_array[1]=1.0f;
float test_array[2]={1.0f,1.0f};
}
void saveResultToFile(float* result)
{
ofstream resultFile;
resultFile.open ("result.txt");
for(unsigned int i=0;i<NOF*TEST_POINTS;i+=NOF)
{
resultFile << result[i] << " "<<result[i+1]<<"\n";
}
resultFile.close();
}
int main()
{
float* dev_fPoints;
float* fPoints=new float[NOF*TEST_POINTS];
gpuErrchk(cudaMalloc((void**)&dev_fPoints, NOF * TEST_POINTS * sizeof(float)));
curandState* devStates;
gpuErrchk(cudaMalloc(&devStates,THR_COUNT*sizeof(curandState)));
cudaEvent_t start;
gpuErrchk(cudaEventCreate(&start));
cudaEvent_t stop;
gpuErrchk(cudaEventCreate(&stop));
gpuErrchk(cudaThreadSetLimit(cudaLimitMallocHeapSize, 128*1024*1024));
gpuErrchk(cudaEventRecord(start, NULL));
setup_kernel<<<BLOCK_COUNT, THR_COUNT>>>(devStates,unsigned(time(NULL)));
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaGetLastError());
optymalize<<<BLOCK_COUNT,THR_COUNT>>>(devStates, dev_fPoints);
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaMemcpy(fPoints, dev_fPoints, NOF * TEST_POINTS * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaEventRecord(stop, NULL));
gpuErrchk(cudaEventSynchronize(stop));
float msecTotal = 0.0f;
cudaEventElapsedTime(&msecTotal, start, stop);
cout<<"Kernel execution time: "<<msecTotal<< "ms"<<endl;
saveResultToFile(fPoints);
system("start pythonw plot_data.py result.txt");
cudaFree(dev_fPoints);
cudaFree(devStates);
system("pause");
return 0;
}
Plot script code:
import matplotlib.pyplot as plt;
import sys;
if len(sys.argv)<2:
print("Usage: python PlotScript <filename>");
sys.exit(0);
path=sys.argv[1];
x=[]
y=[]
with open(path,"r") as f:
for line in f:
vals=line.strip().split(" ");
x.append(vals[0]);
y.append(vals[1]);
plt.plot(x,y,'ro')
plt.show();