Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D

Question

I'm new in CUDA, I appreciate your help and hope you can help me.

I need to store multiple elements of a 2D array into a vector, and then work with the vector, but my code does not work well, when I debug, I find a mistake in allocating the 2D array in the device with cudaMallocPitch and copying to that array with cudaMemcpy2D. This is my code:

#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cmath>

#define maxThreads 96

__global__ void extract(int mSize, float* dev_vector, float* dev_matrix, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    while(idx<N)
    {
        dev_vector[idx] = *(dev_matrix+(mSize*idx+N));
        idx += blockDim.x * gridDim.x;
    }
}

int main()
{
    //CPU variables
    int mSize = 5;
    float* matrix;
    int N = 4; // Vector size
    int i,j;
    float* vector;
    int blocks, threads;

    float* dev_matrix;
    float* dev_vector;

    blocks = 1+((N-1)/maxThreads);
    threads = 1+((N-1)/blocks);

    unsigned long int pitch;
    unsigned long int memsize_vector = N*sizeof(float);
    unsigned long int memsize_matrix = mSize*sizeof(float);


    matrix = new float[memsize_matrix*memsize_matrix];
    vector = new float[memsize_vector];

    //Create 2D array
    for(i=0; i<mSize; i++)
        for(j=0; j<mSize; j++)
        {
            matrix[i+mSize*j] = ((i+1)+(j+1));
        }

    printf("\n");
    for (i=0; i<mSize; i++){
        for(j=0; j<mSize; j++){
            printf("% 1.5f ", matrix[i+mSize*j]);
        }
        printf("\n");
    }
    printf("\n");


    cudaMallocPitch((void **)&dev_matrix, &pitch, memsize_matrix, mSize);
    cudaMalloc((void **)&dev_vector, memsize_vector);

    cudaMemcpy2D(dev_matrix, pitch, matrix, memsize_matrix, memsize_matrix, mSize,
                     cudaMemcpyHostToDevice);

    extract<<<blocks,threads>>>(mSize, dev_vector, dev_matrix, N);
    cudaDeviceSynchronize();

    cudaMemcpy(vector, dev_vector, memsize_vector, cudaMemcpyDeviceToHost);

    printf("Vector values are:\n");
    for(i=0; i<N; i++)
        printf(" % 1.5f ", vector[i]);
    printf("\n");

    cudaFree(dev_matrix);
    cudaFree(dev_vector);

}

What do you mean with ` find a mistake in allocating the 2D array`. Did you try check for CUDA API call errors? — pQB, Feb 21 '13 at 08:36

talonmies · Accepted Answer · 2013-02-21T06:45:17.563

There are lots of problems in this code, including but not limited to using array sizes in bytes and word sizes interchangeably in several places in code, using incorrect types (note that size_t exists for a very good reason) , potential truncation and type casting problems, and more.

But the core problem is the addressing of pitched memory inside the kernel, to which you are never even passing the pitch value. Reading the documentation for cudaMallocPitch will give you the correct method for addressing pitched memory inside a kernel. Your kernel might then look like this:

__global__ void extract(size_t mpitch, float* dev_vector, float* dev_matrix, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;

    while(idx<N)
    {          
        dev_vector[idx] = *(float *)( ((char*)dev_matrix + idx * mpitch) + N );
        idx += stride;
    }
}

[disclaimer: never compiled or tested, use at own risk].

You will have to fix then all the problems in the host code to reflect whatever kernel changes you make.

score 1 · Answer 2 · answered Feb 21 '13 at 21:00

Thanks to all, Alex I had not seen that, and fix it, thanks.

talonmies, thank you, my code works, with your suggestions. thanks a lot, finally this my kernel:

__global__ void sumreduct(size_t pitch, float* dev_vector, float* dev_matrix, int  columns, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;

while(idx<N)
{
    dev_vector[idx] = *(float *)( ((char*)dev_matrix + idx * pitch) + columns);
    idx += stride;
} 
}

About "size_t", I was using "Unsigned int" because Nsight show me the next warning:

Type 'size_t' could not be resolved

Thanks

score 0 · Answer 3 · answered Feb 20 '13 at 23:38

Did you really mean to declare a source matrix of length [memsizeMatrix*memsizeMatrix] ?

This will allocate 400 floats, or 1600 bytes. This means your source-pitch is off, and the Memcpy2D call is failing.

I'm assuming you meant to say

matrix = new float[mSize*mSize];

Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D

3 Answers3

Linked

Related