I have some doubts about understanding the CUDA thread processing in the SM. The following propositions are inferred from what I have been reading: My GPU is: GTX650Ti.
- The thread count in a block must always be a multiple of the warp size. So, each SM can process blocks of 32 threads (warpSize).
- The maximum thread count my SM can compute at same time is 2048 (maxThreadsPerMultiProcessor).
- Since 2048 threads can be computed at the same time in each SM and the warpSize is 32, 64 blocks can be computed at the same time.
- Since my GPU has 4 SMs, 64x4=256 blocks of threads can be executed at the same time.
- Therefore, the kernel launch may have the following launch parameters: <<<256, 32>>> and each kernel launch will invoke 8192 threads.
Is that right?
So if I have a vector of 10M elements to process in my kernel it means that I have to segment it in 1221 jobs (kernel launches) of 8192 elements each?
This question arose because I am comparing the time performance of a sequential program and my CUDA program. But all I can see is that the CPU overtakes the GPU. I also tried with the maximum launch parameters such as <<<65535, 1024>>>. The results are very similar.
So, what am I doing or configuring wrong?
This is the code I'm using:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/device_vector.h>
#include <math.h>
#include <time.h>
#include <iostream>  // cout/cerr/cin/endl were used without including this
#include <vector>    // std::vector was used without including this
#include "C:\cdev.h"
using namespace thrust;
using namespace std;
// Total element count: 16 Mi doubles (128 MB per host array, 3 arrays).
#define N (1024 * 16384)
// Device-enumeration helper from the project-local "cdev.h"; main() reads
// devices.ID[0].maxThreadsPerBlock, so ID[0] presumably describes the
// first CUDA device -- TODO confirm against cdev.h.
cdev devices;
// Element-wise Euclidean norm: c[i] = sqrt(a[i]^2 + b[i]^2).
//
// Expects a 1-D grid with at least N total threads (one thread per
// element); surplus tail threads are filtered out by the bounds check,
// so any ceil-divided launch configuration is safe.
__global__ void eucliDist(double *c, const double *a, const double *b)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N)
    {
        // x*x instead of pow(x, 2): same correctly-rounded product,
        // without the cost of a general power-function call.
        double x = a[i];
        double y = b[i];
        c[i] = sqrt(x * x + y * y);
    }
}
int main()
{
    clock_t start, end;
    double elapsed;

    // Static storage class on purpose: three 128 MB arrays as plain
    // locals would blow the stack.
    static double A[N];
    static double B[N];
    for (int i = 0; i < N; i++)
    {
        A[i] = double(i);
        B[i] = double(i);
    }
    static double C[N];

    // --- Sequential baseline of F(x,y) = sqrt(x^2 + y^2) ---
    start = clock();
    for (int i = 0; i < N; i++)
        C[i] = sqrt(pow(A[i], 2) + pow(B[i], 2));
    end = clock();
    elapsed = double(end - start) / CLOCKS_PER_SEC;
    cout << "Elapsed time for sequential processing is: " << elapsed << " seconds." << endl;

    // --- CUDA launch configuration ---
    // One thread per element; integer ceil-division (instead of
    // floating-point ceil()) covers the partial last block. The kernel
    // bounds-checks i < N, so over-provisioning is safe.
    unsigned int threadNum = devices.ID[0].maxThreadsPerBlock;
    unsigned int blockNum = (N + threadNum - 1) / threadNum;
    cudaError_t cudaStatus;

    // Host-side input/output staging for Thrust.
    vector<double> vectorA(N);
    vector<double> vectorB(N);
    for (int i = 0; i < N; i++)
    {
        vectorA[i] = double(i);
        vectorB[i] = double(i);
    }
    vector<double> vectorC(N);

    // Thrust reports allocation/copy failures by THROWING
    // (thrust::system_error derives from std::runtime_error); polling
    // cudaGetLastError() after each Thrust call -- as the original code
    // did -- never observes those failures. Catch the exception instead.
    try
    {
        start = clock();

        // Device allocation + host-to-device transfer (timed separately
        // below: for a memory-bound kernel the PCIe transfers usually
        // dominate, which is why the GPU total can lose to the CPU even
        // when the kernel itself is fast).
        clock_t t0 = clock();
        device_vector<double> thrustA = vectorA;
        device_vector<double> thrustB = vectorB;
        device_vector<double> thrustC(N);
        clock_t t1 = clock();

        // Kernel launch. Launches are asynchronous: cudaGetLastError()
        // catches bad launch configurations, and cudaDeviceSynchronize()
        // surfaces in-kernel execution errors and makes the kernel
        // timing meaningful.
        eucliDist<<<blockNum, threadNum>>>(raw_pointer_cast(thrustC.data()),
                                          raw_pointer_cast(thrustA.data()),
                                          raw_pointer_cast(thrustB.data()));
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess)
        {
            cerr << "Kernel launch failed (Thrust): " << cudaGetErrorString(cudaStatus) << " (euclidDist)" << endl;
            cin.get();
            return 1;
        }
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess)
        {
            cerr << "Kernel execution failed (Thrust): " << cudaGetErrorString(cudaStatus) << " (euclidDist)" << endl;
            cin.get();
            return 1;
        }
        clock_t t2 = clock();

        // Device-to-host transfer of the result (thrust::copy blocks
        // until the copy completes).
        thrust::copy(thrustC.begin(), thrustC.end(), vectorC.begin());

        end = clock();
        elapsed = double(end - start) / CLOCKS_PER_SEC;
        cout << "Elapsed time parallel processing is (Thrust): " << elapsed << " seconds." << endl;
        cout << "  - alloc + host-to-device copy: " << double(t1 - t0) / CLOCKS_PER_SEC << " seconds." << endl;
        cout << "  - kernel execution:            " << double(t2 - t1) / CLOCKS_PER_SEC << " seconds." << endl;
        cout << "  - device-to-host copy:         " << double(end - t2) / CLOCKS_PER_SEC << " seconds." << endl;
    }
    catch (const std::exception &ex)
    {
        cerr << "Thrust/CUDA error: " << ex.what() << endl;
        cin.get();
        return 1;
    }

    cin.get();
    return 0;
}
Suggestions will be appreciated.