This is my CUDA code:
#include <stdio.h>
#include <stdint.h>
#include <cstdlib>
#include <chrono>
#include <cuda.h>
// Busy-work kernel: each thread derives a per-thread seed from `base`,
// repeatedly squares it (relying on int wraparound), and stores the final
// value so the compiler cannot eliminate the loop. Used purely to keep the
// GPU busy for a measurable amount of time.
//
// Expects a 1-D launch; `out` must hold at least gridDim.x * blockDim.x ints,
// since every launched thread writes out[tid] unconditionally (no bounds check).
__global__ void test(int base, int* out)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int acc = base * tid;
    #pragma unroll
    for (int iter = 0; iter < 1000 * 1000 * 100; ++iter) {
        acc = acc * acc;
    }
    out[tid] = acc;
}
typedef std::chrono::high_resolution_clock Clock;

// Minimal error check: every CUDA runtime call returns cudaError_t, and
// launch/config errors are otherwise silently lost. Only used inside main(),
// hence the `return 1`.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// Launches the busy-loop kernel 10 times with argv[1] threads (default 10)
// and prints the wall-clock time of each launch+sync in milliseconds.
int main(int argc, char *argv[])
{
    int nth = 10;
    if (argc > 1) {
        nth = atoi(argv[1]);
    }
    if (nth <= 0) nth = 1;      // guard against non-numeric/negative argv[1]
    const int NTHREADS = 128;   // threads per block
    printf("nth: %d\n", nth);

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    // Ceil-division: the original `nth / NTHREADS` truncated, so e.g.
    // nth = 200 launched only one 128-thread block and dropped the tail.
    int blocks  = (nth + NTHREADS - 1) / NTHREADS;
    int threads = nth > NTHREADS ? NTHREADS : nth;
    // The kernel writes out[tid] for every launched thread without a bounds
    // check, so the buffer must cover blocks*threads, not just nth.
    int total   = blocks * threads;

    int data = rand();
    int* d_out = NULL;
    CUDA_CHECK(cudaMalloc(&d_out, total * sizeof(int)));

    // cudaLaunchKernel takes an array of pointers to the arguments; the
    // pointed-to values (data, d_out) are read at launch time.
    void* kernelArgs[] = {&data, &d_out};

    for (int i = 0; i < 10; ++i) {
        auto start = Clock::now();
        CUDA_CHECK(cudaLaunchKernel((const void*) test, blocks, threads,
                                    kernelArgs, 0, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
        // duration_cast + count(): passing a chrono duration object straight
        // through printf's varargs (as the original did) is undefined behavior.
        long ms = (long) std::chrono::duration_cast<std::chrono::milliseconds>(
                      Clock::now() - start).count();
        printf("use :%ldms\n", ms);
    }

    CUDA_CHECK(cudaFree(d_out));
    CUDA_CHECK(cudaStreamDestroy(stream));
    cudaDeviceReset();
    printf("host Hello World from CPU!\n");
    return 0;
}
I compiled my code and ran it on a 2080 Ti. I found the elapsed time is around 214 ms even when the thread count is 3 times the number of GPU cores (a 2080 Ti has 4352 cores).
root@d114:~# ./cutest 1
nth: 1
use :255ms
use :214ms
use :214ms
use :214ms
use :214ms
use :214ms
use :214ms
use :214ms
use :214ms
use :214ms
root@d114:~# ./cutest 13056
nth: 13056
use :272ms
use :223ms
use :214ms
use :214ms
use :214ms
use :214ms
use :214ms
use :214ms
use :214ms
use :214ms
root@d114:~# ./cutest 21760
nth: 21760
use :472ms
use :424ms
use :424ms
use :424ms
use :424ms
use :424ms
use :424ms
use :424ms
use :424ms
use :428ms
So my question is: why does the elapsed time stay the same as the number of threads increases to 3 times the number of GPU cores?
Does this mean the NVIDIA GPU's computing power is 3 times the number of its cores?