I was searching the internet for an official way to get the maximum number of threads per block for the current NVIDIA GPU, and I found nothing.
I read that the number must be 2^N, so I wrote some code that finds it, but I am not sure it is 100% correct, and maybe there is a better way.
In my case it gives me the output 1024 (for my NVIDIA GTX 660M).
This is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <Windows.h>
// Easy print macros:
#define pp std::cout <<
#define ss << " " <<
#define ee << std::endl;
#define ww while(1){Sleep(100);} return 0;
namespace nameMyCudaSystem {
int threadsPerBlock = 0;
__global__ void ThreadPerBlockCounter(){/*Do nothing*/}
int InitializeCuda()
{
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) return -1;
threadsPerBlock = 1;
do
{
if (threadsPerBlock > 1)
{
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess)
{
return -2;
break;
}
}
threadsPerBlock *= 2;
ThreadPerBlockCounter << <1, threadsPerBlock >> >();
} while (cudaGetLastError() == cudaSuccess);
threadsPerBlock /= 2;
if (threadsPerBlock == 0) return -3;
return 0;
}
}
int main()
{
    if (nameMyCudaSystem::InitializeCuda() != 0) return -1; // some error inside InitializeCuda
    pp nameMyCudaSystem::threadsPerBlock ee; // print the maximum number of threads per block
    ww; // just a loop to keep the console window open...
    return 0;
}