1

In the CUDA C Programming Guide there is a sample that shows a 2D array:

// Kernel: element-wise addition of two N x N matrices, C = A + B.
// Expects a 2D launch (one thread per element); the guard makes the
// extra threads of a partial tile harmless.
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ix >= N || iy >= N)
        return;
    C[ix][iy] = A[ix][iy] + B[ix][iy];
}

int main()
{
...
// Kernel invocation
// 16x16 = 256 threads per block: a multiple of the 32-thread warp size.
dim3 threadsPerBlock(16, 16);
// NOTE(review): integer division truncates — this assumes N is an exact
// multiple of 16; otherwise the tail elements get no block at all
// (ceil-division would fix that, and the kernel's i < N / j < N guard
// already tolerates the extra threads).
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
...
}

I use a 2D array in the form below and it works correctly:

// NOTE(review): dim3 is a struct, not an array type — declaring an array
// of dim3 objects and passing its name as the execution configuration is
// incorrect (see the answer below); the grid should be built as
// dim3 grid(COLUMNS, ROWS);
dim3 grid[COLUMNS][ROWS];
kernel_Matrix<<<grid,1>>>(dev_strA, dev_strB, dev_Matrix);

// Kernel: element-wise addition over a row-major flattened 2D array.
// Launched with a (COLUMNS x ROWS) grid of single-thread blocks, so each
// block handles exactly one element.
__global__ void add(int *a, int *b, int *c)
{
    const int col = blockIdx.x;
    const int row = blockIdx.y;
    const int idx = row * COLUMNS + col;
    c[idx] = a[idx] + b[idx];
}

Is there a way to implement a 2D array with the [ ][ ] definition? I tested this way but it does not work.

mahdimb
  • 139
  • 1
  • 4
  • 11
  • Maybe I misunderstood your question. What are you trying to do? Pass 2D array to your kernel function or define 2D grid? – stuhlo Apr 06 '13 at 19:54
  • @stuhlo: i want pass 2d array to kernel and access it in __global__ void function with [ ] [ ] definition. – mahdimb Apr 06 '13 at 20:11
  • 1
    You need to fix your `dim3 grid` definition as stuhlo indicated. Then you may want to look at [this question](http://stackoverflow.com/questions/15799086/cuda-how-to-copy-a-3d-array-from-host-to-device) for some ideas. Usually flattening a 2D array is easiest, but if you have a fixed size array, you can use the approach in the first example I gave in my answer to that question. – Robert Crovella Apr 06 '13 at 20:35
  • I added host code that allocates memory on device, copies data from host to device memory, launches kernel and finally copies data from device to host memory. Don't forget to provide CUDA calls with error checking. – stuhlo Apr 07 '13 at 11:03

1 Answer

7

dim3 is not an array but a structure defined in a CUDA header file (vector_types.h). This structure is used to specify the dimensions of the grid in the execution configuration of global functions, i.e. in <<< >>>. It doesn't keep the 'real' blocks; it just configures the number of blocks that will be executed.

The only two ways (to my knowledge) to initialize this structure are:
1. dim3 grid(x, y, z);
2. dim3 grid = {x, y, z};

EDIT: Host code with dim3 initialization and with passing the arrays to kernel function in a way you will be able to access its elements via [][]:

// Minimal error-checking wrapper: every CUDA runtime call returns a
// cudaError_t, and errors are "sticky" — an unchecked failure here makes
// later calls fail mysteriously.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

float A[N][N];
float B[N][N];
float C[N][N];

// Device-side handles declared as pointer-to-array-of-N floats so the
// kernel can index them with the natural A[i][j] syntax.
float (*d_A)[N];
float (*d_B)[N];
float (*d_C)[N];

// Initialize the host inputs: A[i][j] = i, B[i][j] = j.
for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
        A[i][j] = i;
        B[i][j] = j;
    }
}

// Allocation — one contiguous N*N buffer per matrix.
CUDA_CHECK(cudaMalloc((void**)&d_A, (N*N)*sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_B, (N*N)*sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&d_C, (N*N)*sizeof(float)));

// Copy only the inputs host -> device. C is output-only, so copying its
// uninitialized contents to the device would be wasted bandwidth.
CUDA_CHECK(cudaMemcpy(d_A, A, (N*N)*sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_B, B, (N*N)*sizeof(float), cudaMemcpyHostToDevice));

// Kernel invocation. Ceil-division sizes the grid so the tail is covered
// even when N is not a multiple of the block dimensions; the kernel's
// bounds guard makes the extra threads harmless.
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
               (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C);
CUDA_CHECK(cudaGetLastError());      // catches invalid launch configurations
CUDA_CHECK(cudaDeviceSynchronize()); // surfaces asynchronous execution errors

// Copy only the result device -> host. A and B are never modified by the
// kernel, so copying them back would just overwrite identical host data.
CUDA_CHECK(cudaMemcpy(C, d_C, (N*N)*sizeof(float), cudaMemcpyDeviceToHost));

// Release device memory.
CUDA_CHECK(cudaFree(d_A));
CUDA_CHECK(cudaFree(d_B));
CUDA_CHECK(cudaFree(d_C));
stuhlo
  • 1,479
  • 9
  • 17