I am trying to compile and run a CUDA program from the command prompt using nvcc, but the GPU code does not seem to run as expected. The exact same code runs successfully when built in Visual Studio and produces the expected output.
nvcc -arch=sm_60 -std=c++11 -o test.exe test.cu
test.exe
Environment: Windows 10, NVIDIA Quadro K4200, CUDA 10.2
Source Code
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <iostream>
/* SAXPY kernel: performs y[i] += a * x[i] for every i in [0, n).
:inputs: n -> number of elements to process, integer
a -> scalar multiplier applied to each element of x, float
x -> read-only input array, constant pointer to float
y -> array that is read and then updated in place, pointer to float
Each thread handles the single element at its flat 1D global index. */
__global__ void saxpy(int n, float a, const float x[], float y[])
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads whose index lies past the end of the arrays have nothing to do.
    if (idx >= n) {
        return;
    }
    y[idx] += a * x[idx];
}
/* Reports a failed CUDA runtime call and terminates the program.
   status -> value returned by the CUDA runtime call being checked
   what   -> short description of the call, included in the error message
   Does nothing when status is cudaSuccess. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        exit(EXIT_FAILURE);
    }
}

/* Runs saxpy on N elements initialised to 1 with a = 2 and prints y[0]
   (expected: 3). Every CUDA call is checked so that a failure is reported
   instead of silently printing the unmodified host data. */
int main()
{
    const int N = 256;
    const int threadsPerBlock = 256;
    // Ceil-division: enough blocks to cover N even when it is not a
    // multiple of the block size.
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    const size_t bytes = N * sizeof(float);

    // Device pointers
    float *d_x = NULL;
    float *d_y = NULL;
    const float a = 2.0f;

    // Allocate and initialise memory on the host
    std::vector<float> x(N, 1.f);
    std::vector<float> y(N, 1.f);

    // Allocate memory on the GPU
    checkCuda(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
    checkCuda(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

    // Host -> device transfer
    checkCuda(cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(x -> d_x)");
    checkCuda(cudaMemcpy(d_y, y.data(), bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(y -> d_y)");

    // Launch the kernel
    saxpy<<<blocks, threadsPerBlock>>>(N, a, d_x, d_y);
    // A kernel launch returns no status itself. Launch failures (for example
    // a binary that contains no kernel image for the installed GPU's
    // architecture, or an invalid configuration) are only visible through
    // cudaGetLastError().
    checkCuda(cudaGetLastError(), "saxpy launch");
    // Errors raised while the kernel executes surface at the next
    // synchronising call; synchronise here so they are attributed correctly.
    checkCuda(cudaDeviceSynchronize(), "saxpy execution");

    // Device -> host transfer
    checkCuda(cudaMemcpy(y.data(), d_y, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(d_y -> y)");
    std::cout << y[0] << std::endl;

    checkCuda(cudaFree(d_x), "cudaFree(d_x)");
    checkCuda(cudaFree(d_y), "cudaFree(d_y)");
    return 0;
}
Output
1
Expected Output
3
Things I tried
When I first tried to compile with nvcc, I got the same error as discussed in this question:
Cuda compilation error: class template has already been defined
So I applied the solution suggested there ("now: D:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.22.27905\bin\Hostx64\x64"). The program now compiles and runs, but the output is not as expected.