-3

I have the following kernel:

   // Copies one byte per thread from `image` to `out`, staging the value through
   // a per-block shared-memory tile.
   // NOTE(review): `n` and `m` are never read, so there is no bounds check; every
   // launched thread reads image[offset] and writes out[offset].
   __global__ void filter(unsigned char *image, unsigned char *out, int n, int m)
    {
        // Global 2D coordinates of this thread within the whole grid.
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        // Flat index. The row stride is the total number of threads launched in x
        // (blockDim.x * gridDim.x), not an image dimension.
        int offset = x + y * blockDim.x * gridDim.x;
        // Declared but not used anywhere in this version of the kernel.
        int sumx, sumy, sumz, k, l;

        // 16x16 tile: the indexing below stays inside the declared extents only
        // when blockDim.x <= 16 and blockDim.y <= 16.
        __shared__ float shared[16][16];

        // unsigned char -> float on the way in, float -> unsigned char on the way
        // out. Each thread reads back only the element it wrote itself.
        shared[threadIdx.x][threadIdx.y] = image[offset];
            out[offset] = shared[threadIdx.x][threadIdx.y]; 

    }

which I launch with `filter<<<dimGrid, dimBlock>>>(dev_image, dev_out, n, m);`.

The strange thing is that even if I comment out the call to the kernel and recompile, the image remains the same. Any idea why this is happening? Isn't the memory on the GPU freed?

// Loads "maskros512.ppm", runs the `filter` kernel on it on the GPU, and draws
// both the original image and the kernel output with glDrawPixels.
// NOTE(review): no CUDA return code is checked anywhere in this function, and
// neither `out` (malloc'd here) nor `image` is freed before returning --
// confirm who owns the buffer returned by readppm.
void Draw()
{
    unsigned char *image, *out;
    int n, m;
    unsigned char *dev_image, *dev_out;
    // readppm fills n and m; they are later passed to glDrawPixels as width
    // and height respectively.
    image = readppm("maskros512.ppm", &n, &m);
    // Every buffer is n*m*3 bytes: the image is drawn below as
    // GL_RGB / GL_UNSIGNED_BYTE, i.e. 3 bytes per pixel.
    out = (unsigned char*) malloc(n*m*3);
    printf("%d %d\n",n,m );
    cudaMalloc( (void**)&dev_image, n*m*3);
    cudaMalloc( (void**)&dev_out, n*m*3);
    // Upload the input image to the device.
    cudaMemcpy( dev_image, image, n*m*3, cudaMemcpyHostToDevice);
    // Fixed launch shape: a 32x32 grid of blocks, each 1 thread in x by 256
    // threads in y, regardless of n and m.
    dim3 threads( 1, 256 );
    dim3 blocks( 32, 32 );
    filter<<<blocks, threads>>>(dev_image, dev_out, n, m);
    // Download whatever dev_out holds. If the kernel did not write a byte (or
    // the launch is removed), that byte of dev_out was never initialised here.
    cudaMemcpy( out, dev_out, n*m*3, cudaMemcpyDeviceToHost );
    cudaFree(dev_image);
    cudaFree(dev_out);
    glClearColor( 0.0, 0.0, 0.0, 1.0 );
    glClear( GL_COLOR_BUFFER_BIT );
    // Draw the untouched input starting at raster position (-1, -1) ...
    glRasterPos2f(-1, -1);
    glDrawPixels( n, m, GL_RGB, GL_UNSIGNED_BYTE, image );
    // ... and the kernel output starting at raster position (0, -1).
    glRasterPos2i(0, -1);
    glDrawPixels( n, m, GL_RGB, GL_UNSIGNED_BYTE, out );
    glFlush();
}
Avraam Mavridis
  • 8,698
  • 19
  • 79
  • 133
  • Your kernel does nothing at all. (Note that nowhere in your kernel is anything being written to `out`, or any global memory location.) It's very likely that the compiler is optimizing it down to an empty function. Whatever is going on with your image is a result of some aspect of your code that you haven't shown. In fact, SO expects: "Questions concerning problems with code you've written must describe the specific problem — and include valid code to reproduce it — in the question itself. See SSCCE.org for guidance. " Voting to close - you haven't provided a SSCCE.org code. – Robert Crovella Dec 06 '13 at 19:03
  • Even though you've now added a line of code to your kernel which writes to `out`, it's still not possible to explain what is happening, without showing a complete code. – Robert Crovella Dec 06 '13 at 20:01
  • @RobertCrovella I added my Draw function that calls the filter, the strange thing is that even if I comment the 'filter' line the image doesn't change. – Avraam Mavridis Dec 06 '13 at 20:53

1 Answer

1

If you just comment out the filter line, there is nothing populating dev_out. So if you then copy dev_out to out you're going to get garbage, which may be whatever was in dev_out last.

These lines are not right:

// Each block is 1 thread wide in x and 256 threads tall in y.
dim3 threads( 1, 256 );
// Fixed 32 x 32 grid of blocks, independent of the image size.
dim3 blocks( 32, 32 );

You are launching thread blocks that are 1 thread in x by 256 threads in y. This doesn't make sense for your kernel. Your kernel expects one thread launched per pixel, and it expects a sufficient thread array in both x and y to cover the image space in pixels. Furthermore, your shared memory allocation is expecting a 16x16 block of threads. Try this:

// 16 x 16 threads per block, matching the kernel's 16-wide shared-memory tile.
dim3 threads(16,16);
// Ceiling division: enough blocks to cover n pixels in x and m pixels in y,
// even when n or m is not a multiple of the block size.
dim3 blocks((n+threads.x-1)/threads.x, (m+threads.y-1)/threads.y);

In addition, your image appears to be made of 3-byte pixels. But you are only launching one thread per pixel. So you will need to copy 3 bytes per pixel, not one. Something like this:

// Byte offsets of the colour channels inside one packed 3-byte RGB pixel.
#define RED 0
#define GRN 1
#define BLU 2

// Copies a packed 3-byte-per-pixel image from `image` to `out`, one thread per
// pixel, staging each pixel through a per-block shared-memory tile.
//
// Launch layout: 2D grid of 2D blocks with blockDim.x <= 16 and
// blockDim.y <= 16 (the tile size). The grid must cover n pixels in x and
// m pixels in y, e.g. threads(16,16) with
// blocks((n+15)/16, (m+15)/16). Surplus threads are masked off below.
// Shared memory: 16*16*3 bytes, statically allocated.
//
//   image  device pointer to n*m*3 input bytes (only read)
//   out    device pointer to n*m*3 output bytes (written)
//   n      image width in pixels
//   m      image height in pixels
__global__ void filter(unsigned char *image, unsigned char *out, int n, int m)
{
    // Pixel coordinates handled by this thread.
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    __shared__ unsigned char shared[16][16*3];

    // The grid is rounded up to whole blocks, so threads that fall past the
    // right or bottom edge of the image must not touch memory. A guard block is
    // used (rather than an early return) so a block-wide barrier can be added
    // after it later without some threads skipping it.
    if (x < n && y < m) {
        // Row stride is the image width n, not the launched thread count
        // (blockDim.x * gridDim.x): the two differ whenever n is not a multiple
        // of blockDim.x. Pixel index -> byte index is a multiply by 3.
        int base = (x + y * n) * 3;
        // This thread's 3-byte slot in the tile.
        unsigned char *slot = &shared[threadIdx.x][threadIdx.y * 3];

        slot[RED] = image[base + RED]; // pick up red
        slot[GRN] = image[base + GRN]; // pick up green
        slot[BLU] = image[base + BLU]; // pick up blue

        // Each thread reads back only what it wrote itself, so no
        // __syncthreads() is needed for this plain copy.
        out[base + RED] = slot[RED];
        out[base + GRN] = slot[GRN];
        out[base + BLU] = slot[BLU];
    }
}

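Finally, you should add proper CUDA error checking: check the return value of every CUDA runtime call (`cudaMalloc`, `cudaMemcpy`, ...) and call `cudaGetLastError()` right after the kernel launch, so that an invalid launch configuration or a failed copy is reported instead of silently leaving `out` unchanged.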

Community
  • 1
  • 1
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257