The maximum number of threads per block, and therefore the valid range of threadIdx, is architecture-dependent; you cannot simply set it to whatever you like.
The following code works for arraySize up to 1024 on my system,
but at arraySize == 1025 the block size exceeds the device limit,
the kernel launch fails,
and the contents of the destination array are undefined:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
using namespace std;
// Copies one int per thread from src to dst (both device pointers).
//
// Launch layout: 1D grid of 1D blocks. Each thread handles the element at its
// global index, so a single-block launch behaves exactly as if threadIdx.x
// alone were used (blockIdx.x is 0 there), while multi-block launches now
// cover distinct elements instead of every block rewriting the same ones.
//
// Precondition: the kernel takes no element count and therefore cannot
// bounds-check; gridDim.x * blockDim.x must equal the number of elements in
// src and dst, and blockDim.x must not exceed the device's
// maximum-threads-per-block limit or the launch itself fails.
__global__ void gpucopy( int* src, int* dst )
{
    // Global 1D thread index across all blocks of the grid.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    dst[i] = src[i];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk( cudaError_t status, const char* what )
{
    if (status == cudaSuccess)
        return true;
    cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
    return false;
}

// Fills a host array, round-trips it through the device using the gpucopy
// kernel (one block of arraySize threads) and verifies the result.
// Returns 0 when every CUDA call succeeded, 1 otherwise.
int main()
{
    const int arraySize = 500; // >= 1025 will fail on my system!
    const size_t bytes = arraySize * sizeof(int);

    int* data1 = new int[arraySize];
    int* data2 = new int[arraySize];
    // Initialize data1; data2 is filled by the copy back from the device
    for (int i = 0; i < arraySize; i++)
        data1[i] = 2*i;

    int* dev_data1 = NULL;
    int* dev_data2 = NULL;
    // Allocate both dev_data1 and dev_data2
    bool ok = cudaOk(cudaMalloc(&dev_data1, bytes), "cudaMalloc(dev_data1)");
    ok = ok && cudaOk(cudaMalloc(&dev_data2, bytes), "cudaMalloc(dev_data2)");

    // copy data1 to device
    ok = ok && cudaOk(cudaMemcpy(dev_data1, data1, bytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy(host -> device)");

    if (ok)
    {
        // copy dev_data1 to dev_data2 with gpu
        gpucopy<<<1, arraySize>>>( dev_data1, dev_data2 );
        // A kernel launch returns no status of its own. An invalid
        // configuration (e.g. more threads per block than the device allows)
        // is only visible through cudaGetLastError(); without this check
        // dev_data2 would silently stay unwritten and the comparison below
        // would run against undefined values.
        ok = cudaOk(cudaGetLastError(), "gpucopy launch");
    }

    // copy dev_data2 to data2 (this blocking copy also surfaces errors raised
    // while the kernel was executing)
    ok = ok && cudaOk(cudaMemcpy(data2, dev_data2, bytes, cudaMemcpyDeviceToHost),
                      "cudaMemcpy(device -> host)");

    if (ok)
    {
        for (int i = 0; i < arraySize; i++)
            if (data2[i] != data1[i])
                cout << "Error: data is different - data2[" << i << "] is " << data2[i] << endl;
    }

    // Release everything on every path; cudaFree(NULL) performs no operation.
    cudaFree(dev_data1);
    cudaFree(dev_data2);
    delete[] data1;
    delete[] data2;

    return ok ? 0 : 1;
}
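You can find this limit either in the documentation or by running the deviceQuery sample program that ships with the SDK (programmatically, it is the maxThreadsPerBlock field returned by cudaGetDeviceProperties):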
C:\ProgramData\NVIDIA Corporation\NVIDIA GPU Computing SDK 4.1\C\bin\win64\Release\deviceQuery.exe Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Found 2 CUDA Capable device(s)
Device 0: "Tesla C2050"
CUDA Driver Version / Runtime Version 4.2 / 4.1
CUDA Capability Major/Minor version number: 2.0
Total amount of global memory: 2688 MBytes (2818572288 bytes)
(14) Multiprocessors x (32) CUDA Cores/MP: 448 CUDA Cores
GPU Clock Speed: 1.15 GHz
Memory Clock rate: 1500.00 Mhz
Memory Bus Width: 384-bit
L2 Cache Size: 786432 bytes
Max Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536,65535), 3D=(2048,2048,2048)
Max Layered Texture Size (dim) x layers 1D=(16384) x 2048, 2D=(16384,16384) x 2048
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per block: 1024 <-----