I am developing an application with CUDA in Ubuntu 16.04 using the cuda toolkit 8.0.
My problem concerns the number of threads per block that a GTX 960M (compute capability 5.0) can support. I am trying to use the maximum number of threads in each block, so I call cudaGetDeviceProperties() to obtain that value (1024 threads, as stated in the documentation). However, when I launch my kernel with more than 512 threads per block, the API returns error code 0x7 ("warning: Cuda API error detected: cudaLaunch returned (0x7)"), which means "Launch out of resources".
A small code example that reproduces my problem:
#include <random>
#include <curand.h>
#include <curand_kernel.h>
// Smaller of two values. Every argument and the whole expansion are
// parenthesized, and there is no trailing semicolon, so the macro can be used
// inside larger expressions. Note: each argument is evaluated twice, so do not
// pass expressions with side effects.
#define min(a,b) (((a) < (b)) ? (a) : (b))
/*
 * Draws nSamples*sampleFraction random sample indices in [0, nSamples - 1].
 *
 * Each drawn index is stored in sampleIDs[i] and the matching entry of
 * inbagCounts is incremented atomically.
 *
 * Parameters:
 *   nSamples       - number of distinct sample indices; also the length of
 *                    inbagCounts.
 *   sampleFraction - multiplier: nSamples*sampleFraction draws are made, and
 *                    sampleIDs must hold that many elements.
 *   seed           - cuRAND seed shared by all threads; each thread uses its
 *                    global thread index as the sequence number.
 *   sampleIDs      - output, one drawn index per draw.
 *   inbagCounts    - in/out, how many times each index was drawn. The kernel
 *                    only increments, so the caller must zero it beforehand.
 *
 * Launch layout: 1D grid of 1D blocks. The loop strides by the total number of
 * launched threads, so any grid/block size is valid. No shared memory is used.
 */
__global__ void bootstrap_V1(int nSamples, int sampleFraction, int seed, unsigned int* sampleIDs, unsigned int* inbagCounts){
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    // 64-bit arithmetic so neither the stride nor the draw count can overflow int.
    const long long stride = (long long)gridDim.x * blockDim.x;
    const long long total = (long long)nSamples * sampleFraction;

    // Threads with nothing to draw leave before paying for the generator setup.
    // This also covers nSamples <= 0.
    if (tid >= total)
        return;

    curandState state;
    curand_init(seed, tid, 0, &state);

    /* Mapping a uniform draw to an integer index:
       1- curand_uniform yields a float in (0.0, 1.0] (1.0 is included).
       2- Scale by the range width (largest - smallest + 0.999999); the
          smallest index is 0, so no offset is needed.
       3- Truncate to an integer.
       4- Clamp: a draw of exactly 1.0, or float rounding of the scaled value,
          can land on nSamples, which would index one past the end of
          inbagCounts. */
    const int lastIndex = nSamples - 1;
    const float range = (float)lastIndex + 0.999999f;

    for (long long i = tid; i < total; i += stride) {
        int pick = (int)truncf(curand_uniform(&state) * range);
        if (pick > lastIndex)
            pick = lastIndex;
        sampleIDs[i] = pick;
        atomicAdd(&inbagCounts[pick], 1u);
    }
}
// Prints a diagnostic for a failed CUDA runtime call; returns true on success.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    printf("%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Runs bootstrap_V1 once on device 0 with one block per multiprocessor and
 * copies the drawn sample IDs and in-bag counts back to the host.
 *
 * Returns 0 on success and 1 if any allocation, CUDA call, or the launch fails.
 */
int main(void) {
    const int nSamples = 100;
    const int sampleFraction = 1;
    const size_t idBytes = (size_t)nSamples * sampleFraction * sizeof(unsigned int);
    const size_t countBytes = (size_t)nSamples * sizeof(unsigned int);

    // Draw the seed directly as an int instead of narrowing a size_t.
    std::random_device rd;
    std::mt19937_64 gen(rd());
    std::uniform_int_distribution<int> dist;
    const int seed = dist(gen);

    cudaDeviceProp prop;
    if (!cudaOk(cudaGetDeviceProperties(&prop, 0), "cudaGetDeviceProperties"))
        return 1;
    const int blocks = prop.multiProcessorCount;

    // prop.maxThreadsPerBlock is only the device-wide ceiling. A kernel's own
    // resource usage (registers in particular) can cap the block size below
    // it, and exceeding that cap makes the launch fail with "out of
    // resources". Ask the runtime for the limit of this specific kernel.
    cudaFuncAttributes attr;
    if (!cudaOk(cudaFuncGetAttributes(&attr, bootstrap_V1), "cudaFuncGetAttributes"))
        return 1;

    int threadsPerBlock = attr.maxThreadsPerBlock;
    if (prop.maxThreadsPerBlock < threadsPerBlock)
        threadsPerBlock = prop.maxThreadsPerBlock;
    if (nSamples < threadsPerBlock)
        threadsPerBlock = nSamples;
    if (threadsPerBlock < 1)
        threadsPerBlock = 1;   // a zero-thread block is an invalid launch configuration

    unsigned int *sampleIDs = (unsigned int *)malloc(idBytes);
    unsigned int *inbagCounts = (unsigned int *)malloc(countBytes);
    unsigned int *dev_sampleIDs = nullptr;
    unsigned int *dev_inbagCounts = nullptr;

    bool ok = (sampleIDs != nullptr) && (inbagCounts != nullptr);
    if (!ok)
        printf("host allocation failed\n");

    // Each step runs only if every previous one succeeded, so the first
    // failure is the one reported.
    ok = ok && cudaOk(cudaMalloc((void **)&dev_sampleIDs, idBytes), "cudaMalloc(dev_sampleIDs)");
    ok = ok && cudaOk(cudaMalloc((void **)&dev_inbagCounts, countBytes), "cudaMalloc(dev_inbagCounts)");
    ok = ok && cudaOk(cudaMemset(dev_sampleIDs, 0, idBytes), "cudaMemset(dev_sampleIDs)");
    ok = ok && cudaOk(cudaMemset(dev_inbagCounts, 0, countBytes), "cudaMemset(dev_inbagCounts)");

    if (ok) {
        bootstrap_V1<<<blocks, threadsPerBlock>>>(nSamples, sampleFraction, seed, dev_sampleIDs, dev_inbagCounts);
        // A launch does not return a status; configuration errors surface here.
        ok = cudaOk(cudaGetLastError(), "bootstrap_V1 launch");
    }

    // The blocking copies also wait for the kernel and report execution errors.
    ok = ok && cudaOk(cudaMemcpy(sampleIDs, dev_sampleIDs, idBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(sampleIDs)");
    ok = ok && cudaOk(cudaMemcpy(inbagCounts, dev_inbagCounts, countBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(inbagCounts)");

    // Releasing null pointers is harmless, so cleanup is unconditional.
    free(sampleIDs);
    free(inbagCounts);
    cudaFree(dev_inbagCounts);
    cudaFree(dev_sampleIDs);

    return ok ? 0 : 1;
}
and here are the compilation lines I am using:
/usr/local/cuda-8.0/bin/nvcc -G -g -O0 -std=c++11 -gencode arch=compute_50,code=sm_50 -odir "." -M -o "main.d" "../main.cu"
/usr/local/cuda-8.0/bin/nvcc -G -g -O0 -std=c++11 --compile --relocatable-device-code=false -gencode arch=compute_50,code=compute_50 -gencode arch=compute_50,code=sm_50 -x cu -o "main.o" "../main.cu"
/usr/local/cuda-8.0/bin/nvcc --cudart static --relocatable-device-code=false -gencode arch=compute_50,code=compute_50 -gencode arch=compute_50,code=sm_50 -link -o "prueba" ./main.o
Can someone explain why this is happening to me? Thank you very much.
As requested, here is the verbose ptxas output:
ptxas info : 77696 bytes gmem, 72 bytes cmem[3]
ptxas info : Function properties for cudaDeviceGetAttribute
16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Compiling entry function '_Z12bootstrap_V1iiiPjS_' for 'sm_50'
ptxas info : Function properties for _Z12bootstrap_V1iiiPjS_
6560 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 128 registers, 6560 bytes cumulative stack size, 352 bytes cmem[0], 8 bytes cmem[2]
ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for _ZN4dim3C2Ejjj
16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
ptxas info : Function properties for cudaMalloc
16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for cudaGetDevice
8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for _ZN4dim3C1Ejjj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
40 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for _ZN61_INTERNAL_39_tmpxft_000013a3_00000000_7_main_cpp1_ii_055b743a9atomicAddEPjj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for cudaFuncGetAttributes
16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads