CUDA unkown error when copying from device to host

Question

I wrote some CUDA code, and everything seems great until I try to get the results from the code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdlib> 
#include <ctime> 
#include <iostream>

#define maskSize 3 

__constant__ float masks[32*maskSize*maskSize];

__global__ void myConv(float *res, const float* mats, int mSize)
{
    extern __shared__ float curr[];
    int rSize=maskSize+mSize-1;
    int idxmod=(threadIdx.x+maskSize-1) % (mSize+2*maskSize-2); //these two map any value not within (mSize-1,mSize-1) to the boarders for padding.
    int idymod=(threadIdx.y+maskSize-1) % (mSize+2*maskSize-2);
    if (threadIdx.x < mSize && threadIdx.y < mSize) //put the value of mats in the middle of the curr matrix
        curr[(threadIdx.x+ maskSize-1)*(mSize+2*(maskSize-1)) + threadIdx.y + maskSize-1]=mats[mSize*(blockIdx.y*mSize + threadIdx.x) + threadIdx.y];
    else //zero padding
        if (threadIdx.x < mSize)
            curr[threadIdx.x*(mSize+2*(maskSize-1)) +idymod] =0;
        else 
            curr[idxmod*(mSize+2*(maskSize-1)) +threadIdx.y] =0;

    __syncthreads();
    float tmp=0;

if (threadIdx.x < mSize+maskSize-1 && threadIdx.y < mSize+maskSize-1)
{
#pragma unroll
    for (int i=0;i<maskSize;i++)
        #pragma unroll
        for (int j=0;j<maskSize;j++)

            tmp+=curr[(threadIdx.x+i)*(mSize+2*(maskSize-1)) + threadIdx.y+j]*masks[blockIdx.x*maskSize*maskSize +maskSize*i +j];
    res[blockIdx.y*rSize*rSize + threadIdx.x*rSize + threadIdx.y]=tmp;
}
}

int main()
{
    int MatSize=5;
    int bSize=2000;
    int maskNum=10;
    int resSize=MatSize+maskSize-1;
    float* ms;
    ms=(float *)malloc(maskSize*maskSize*maskNum*sizeof(float));
    float* resPtr=(float *)malloc((MatSize+maskSize-1)*(MatSize+maskSize-1)*bSize*maskNum*sizeof(float));
    for (int i=0; i<maskSize;i++)
        for (int j=0; j<maskSize; j++)
            for (int k=0; k<maskNum; k++)
                ms[k*maskSize*maskSize + j*maskSize + i]=(float)(rand() % 1000)/100;
    float* inp=(float *)malloc(MatSize*MatSize*bSize*sizeof(float));
    for (int i=0; i<MatSize; i++)
        for (int j=0; j<MatSize; j++)
            for (int k=0;k<bSize;k++)
                inp[k*MatSize*MatSize + j*MatSize + i]=(float)(rand() % 500)/100;
    float *cudams, *cudaresPtr,*cudainp;
    cudaMalloc((void **) &cudams,maskSize*maskSize*maskNum*sizeof(float));
    cudaMalloc((void **) &cudaresPtr,(MatSize+maskSize-1)*(MatSize+maskSize-1)*bSize*maskNum*sizeof(float));
    cudaMalloc((void **) &cudainp,MatSize*MatSize*bSize*sizeof(float));

    cudaMemcpy((void *)cudams,(void *)ms,maskSize*maskSize*maskNum*sizeof(float),cudaMemcpyHostToDevice);

    cudaMemcpy((void *)cudainp,(void *)inp,MatSize*MatSize*bSize*sizeof(float),cudaMemcpyHostToDevice);

    cudaMemcpyToSymbol(masks,(void *)cudams,maskSize*maskSize*maskNum*sizeof(float),0,cudaMemcpyDeviceToDevice);
    dim3 threadSize(MatSize+2*(maskSize-1),MatSize+2*(maskSize-1));
    dim3 blockSize(1, 1); //for testing purposes. should be dim3 blockSize(maskNum,bSize);
    myConv<<<blockSize, threadSize, (MatSize+2*(maskSize-1))*(MatSize+2*(maskSize-1))>>>(cudaresPtr,cudainp,MatSize);
    cudaMemcpy((void *)resPtr,(const void *)cudaresPtr,(MatSize+maskSize-1)*(MatSize+maskSize-1)*bSize*maskNum*sizeof(float),cudaMemcpyDeviceToHost);
    //The problem is here - They copying won't work!

    free(inp);
    free(ms);
    free(resPtr);
    return 0;
}

I put printf in various places, used error checking as recommended here, printed error string... Can't find anything that would cause an error copying the contents of the pointer back to the host.

Edit: memcheck result: no errors if I understand correctly:

O:\CudaTst>cuda-memcheck CUDA_TST ========= CUDA-MEMCHECK

Time spent: 0.144000 secondsError: Failed to read the strings for error record ========= ERROR SUMMARY: 0 errors

Re-ran with -l (leak) - 0 leaks.

try running your code with cuda-memcheck. You may have an out-of-bounds access in your kernel. — Robert Crovella, Aug 06 '13 at 04:47
done. No errors reported, I've edited my question accordingly — user1999728, Aug 06 '13 at 07:45
Could you edit the shortest possible, complete example that someone else could compile and run into your question? In all likelihood this is an error in the kernel (see [this question](http://stackoverflow.com/q/17507033/681865) for example). I recommend following the error checking described in [this question and answer set](http://stackoverflow.com/q/14038589/681865) *exactly*, it will give you much more precise information about the source of the error. — talonmies, Aug 06 '13 at 08:20
Done. I tried the error checking you mentioned earlier, but the only thing that gave me an error was the attempt to copy the contents back to the host after the kernel was done. the function used by the macro printed some text, but even if i commented out the if (abort) line, it still exited before I could see what was printed. — user1999728, Aug 06 '13 at 08:48
@user1999728: The key thing your code misses is thorough error checking after the kernel launch. Look at the second code snippet in my answer. The `cudaPeekAtLastError(); cudaDeviceSynchronize()` pattern isolates a kernel arguments error from a kernel execution error from a subsequent API error. — talonmies, Aug 06 '13 at 08:52
Ok so when I run your code with cuda-memcheck I get a huge stream of kernel out-of-bounds shared memory write errors. Your kernel is broken and the errors you are seeing are only being reported by and not caused by the cudaMemcpy calls. — talonmies, Aug 06 '13 at 09:12
I saw the second code snippet a few days ago, but whenever I looked for it, I only looked at the first part of the answer and though I'm in the wrong place. (It happens to me surprisingly often). Thank you so much! I actually still got "unkown error", so if you hadn't said "kernel arguments" i never would have figured it out :) — user1999728, Aug 06 '13 at 09:13
Oh, and about the out of bounds writing - I didn't get errors with my memcheck... which Is odd, because I was pretty sure I had some indexing problems before you pointed my attention to the fact that i forgot *sizeof(float)... maybe I ran the memcheck wrong =/ — user1999728, Aug 06 '13 at 09:15

score 1 · Accepted Answer · answered Aug 06 '13 at 09:20

It would appear that you are (at least) launching your kernel with insufficient dynamically allocated shared memory for it to run without a buffer overflow inside the kernel.

The amount of shared memory per block is specific in bytes, so I suspect you want something like:

size_t shmsz = sizeof(float)*size_t((MatSize+2*(maskSize-1))*
                                    (MatSize+2*(maskSize-1));
myConv<<<blockSize, threadSize, shmz)>>>(cudaresPtr,cudainp,MatSize);

Beyond that, I leave the debugging to you.

CUDA unkown error when copying from device to host

1 Answers1

Linked