I'm using VS2019 and have an NVIDIA GeForce GPU. I tried the code from this link: https://towardsdatascience.com/writing-lightning-fast-code-with-cuda-c18677dcdd5f
However, I want to try using cudaMalloc
instead of using managed memory with cudaMallocManaged
I tried the code below:
__global__
void add(int n, float* x, float* y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
y[i] = x[i] + y[i];
}
int main()
{
int N = 1 << 20;
float* x, * y;
cudaMalloc(&x, N * sizeof(float));
cudaMalloc(&y, N * sizeof(float));
cudaMemset(x,1.0, N * sizeof(float)); //want to set x as an array of 1.0s
cudaMemset(y,2.0, N * sizeof(float)); //want to set y as an array of 2.0s
int device = -1;
cudaGetDevice(&device);
int blockSize = 1024;
int numBlocks = (N + blockSize - 1) / blockSize;
auto t1 = std::chrono::high_resolution_clock::now();
add << <numBlocks, blockSize >> > (N, x, y);
cudaDeviceSynchronize();
auto t2 = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i] - 3.0f));
std::cout << "Max error: " << maxError << std::endl;
std::cout << "duration CUDA: "<<duration;
cudaFree(x);
cudaFree(y);
return 0;
}
But I'm getting an unhandled exception error at maxError = fmax(maxError, fabs(y[i] - 3.0f));
, I'm guessing because I didn't use cudaMemset
correctly? How should I modify it?