I am currently experimenting with CUDA in C++.
I have a general understanding of how CPUs and GPUs work and how they should compare in performance.
I wrote a test program that adds two large arrays element-wise.
On the CPU, this finishes in approximately 18,866,700 ns (~18.9 ms).
Running the same loop on the GPU with a single thread in a single block takes only about 51,300 ns.
How can this be?
A single thread on the GPU should be slower than one CPU core, not roughly 370x faster (18,866,700 ns / 51,300 ns ≈ 368), shouldn't it?
Here is my code:
#include <chrono>
#include <iostream>
#include <cuda_runtime.h>

#define COUNT 10000000

// CPU version: ~18,866,700 ns runtime
void vector_add(float* out, float* a, float* b, int n)
{
    for (int i = 0; i < n; i++)
        out[i] = a[i] + b[i];
}

// GPU version: ~51,300 ns runtime
__global__ void vector_add_cuda(float* out, float* a, float* b, int n)
{
    for (int i = 0; i < n; i++)
        out[i] = a[i] + b[i];
}

int main()
{
    float* out = new float[COUNT], * a = new float[COUNT], * b = new float[COUNT];
    for (int i = 0; i < COUNT; i++)
    {
        a[i] = 2 * i;
        b[i] = COUNT - i;
    }

    // Allocate device buffers and copy the inputs over before timing.
    float* d_out, * d_a, * d_b;
    cudaMalloc(&d_out, sizeof(float) * COUNT);
    cudaMalloc(&d_a, sizeof(float) * COUNT);
    cudaMalloc(&d_b, sizeof(float) * COUNT);
    cudaMemcpy(d_a, a, sizeof(float) * COUNT, cudaMemcpyDefault);
    cudaMemcpy(d_b, b, sizeof(float) * COUNT, cudaMemcpyDefault);

    auto start = std::chrono::high_resolution_clock::now();
    // Only one thread and one block here!
    vector_add_cuda<<<1, 1>>>(d_out, d_a, d_b, COUNT);
    //vector_add(out, a, b, COUNT);   // uncomment for the CPU measurement
    auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::high_resolution_clock::now() - start).count();
    std::cout << elapsed << " ns\n";

    cudaMemcpy(out, d_out, sizeof(float) * COUNT, cudaMemcpyDefault);
    cudaFree(d_out);
    cudaFree(d_a);
    cudaFree(d_b);
    delete[] out;
    delete[] a;
    delete[] b;
}
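In case it matters: I also spot-check a few elements after the final cudaMemcpy back to the host, and the GPU results look correct. This is a minimal sketch of that check (placed right after the copy back in main; the choice of indices is arbitrary):

// Recompute a few elements on the host and compare with the GPU output.
// It is the same single float addition on both sides, so exact equality
// is safe here.
for (int i : { 0, 1, COUNT / 2, COUNT - 1 })
    if (out[i] != a[i] + b[i])
        std::cout << "mismatch at index " << i << "\n";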
I have no idea why this happens, and I couldn't find anything about it online.
Can someone explain it to me?