
I'm trying to create a negative image using CUDA, with the same functionality as my CPU implementation.

This is the main function:

int main(int argc, char** argv)
{

    IplImage* image_input = cvLoadImage("test.jpg", CV_LOAD_IMAGE_UNCHANGED);
    IplImage* image_output = cvCreateImage(cvGetSize(image_input),
                    IPL_DEPTH_8U,image_input->nChannels);

    unsigned char *h_out = (unsigned char*)image_output->imageData;
    unsigned char *h_in =  (unsigned char*)image_input->imageData;

    int width     = image_input->width;
    int height    = image_input->height;
    int widthStep = image_input->widthStep;
    int channels  = image_input->nChannels;

    negatif_parallel(h_in, h_out,  width, height, widthStep, channels);

    cvShowImage("Original", image_input);
    cvShowImage("CPU", image_output);

    waitKey(0);
    cvReleaseImage(&image_input);
    cvReleaseImage(&image_output);

}

and this is the CUDA code:

__global__ void kernel(unsigned char* d_in, unsigned char* d_out, int width, int height, int widthStep, int channels) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    int s;

    if (x < width && y < height) {
        int i = y;
        int j = x;
        for (int k = 0; k < channels; k++) {
            s = d_in[i*widthStep + j*channels + k];
            s = 255 - d_in[i*widthStep + j*channels + k];
            d_out[i*widthStep + j*channels + k] = s;
        }
    }
}

extern "C" void negatif_parallel( unsigned char* h_in, unsigned char* h_out,  int width, int height, int widthStep,int channels){

unsigned char* d_in;
unsigned char* d_out;
cudaMalloc((void**) &d_in, width*height);
cudaMalloc((void**) &d_out, width*height);

cudaMemcpy(d_in, h_in, width*height*sizeof( unsigned char), cudaMemcpyHostToDevice);
dim3 block (16,16);
dim3 grid (width/16, height/16);
kernel<<<grid,block>>>(d_in, d_out, width, height, widthStep, channels);

cudaMemcpy(h_out, d_out, width*height*sizeof( unsigned char), cudaMemcpyDeviceToHost);
cudaFree(d_in);
cudaFree(d_out);

}

With the CPU implementation the negative image comes out correctly, but with CUDA it does not; only a blank white image appears. What's wrong with my code? T_T
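(For reference, the CPU version is not shown in the question; assuming it is the usual per-channel inversion over the IplImage buffer, it would look roughly like the following hypothetical sketch, which is not the poster's actual code.)

// Hypothetical CPU reference: invert every channel of every pixel,
// honoring the row stride (widthStep) of the IplImage buffer.
void negatif_cpu(unsigned char* in, unsigned char* out, int width, int height, int widthStep, int channels)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            for (int k = 0; k < channels; k++) {
                int idx = y * widthStep + x * channels + k;
                out[idx] = 255 - in[idx];
            }
        }
    }
}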

bagusbekam
  • Possible problem in this: `cudaMemcpy(d_in, h_in, width*height*sizeof(unsigned char), cudaMemcpyHostToDevice);` or `cudaMemcpy(h_out, d_out, 3*width*height*sizeof(unsigned char), cudaMemcpyDeviceToHost);` You are copying back 3 times more than the input. Are you sure you want to do this? – Sagar Masuti Nov 12 '13 at 05:41
  • My apologies, I meant `cudaMemcpy(d_in, h_in, width*height*sizeof(unsigned char), cudaMemcpyHostToDevice)` and `cudaMemcpy(h_out, d_out, width*height*sizeof(unsigned char), cudaMemcpyDeviceToHost);`, but the result is still the same (blank image). – bagusbekam Nov 12 '13 at 05:45
  • Edit the same in question. Could you add proper error check mentioned [here](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) ?. Also try to run with `cuda-memcheck`. Or give the complete reproducer for people to help you. – Sagar Masuti Nov 12 '13 at 05:50
  • try some [cuda error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) in your "cuda class". As pointed out by Sagar Masuti, I think your second cudaMemcpy will throw an error, since you are trying to transfer 3 times as much data as you have allocated for `d_out` in the `cudaMalloc` operation. – Robert Crovella Nov 12 '13 at 05:50
  • Thanks, I revised the `cudaMalloc` and `cudaMemcpy` calls, but the error is still the same. T_T – bagusbekam Nov 12 '13 at 07:50
  • Can you edit the question to include your new code? Did you try running with `cuda-memcheck`? Don't post your findings as an answer until you have fully solved your problem. – Sagar Masuti Nov 12 '13 at 08:42
  • You still don't show any proper cuda error checking in your code. – Robert Crovella Nov 12 '13 at 14:10

1 Answer


You were pretty close. You just needed to account for the number of channels in your memory allocations and transfers. Here's a working version of your code. I added some error checking as well; see [this question](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) for more information about the error checking. Note that you don't have to use two buffers on the GPU in this case; you can use a single buffer and do the conversion in-place (a sketch of that variant follows at the end of this answer).

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;
using namespace std;

void negatif_parallel( unsigned char* h_in, unsigned char* h_out,  int width, int height, int widthStep,int channels);

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

int main(int argc, char** argv)
{
    IplImage* image_input = cvLoadImage("test.jpg", CV_LOAD_IMAGE_UNCHANGED);
    IplImage* image_output = cvCreateImage(cvGetSize(image_input), IPL_DEPTH_8U,image_input->nChannels);

    unsigned char *h_out = (unsigned char*)image_output->imageData;
    unsigned char *h_in =  (unsigned char*)image_input->imageData;

    int width     = image_input->width;
    int height    = image_input->height;
    int widthStep = image_input->widthStep;
    int channels  = image_input->nChannels;

    negatif_parallel(h_in, h_out,  width, height, widthStep, channels);

    cvShowImage("Original", image_input);
    cvShowImage("CPU", image_output);

    waitKey(0);

    cvReleaseImage(&image_input);
    cvReleaseImage(&image_output);
}

__global__ void kernel (unsigned char *d_in,unsigned char* d_out, int width, int height, int widthStep, int channels) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    int s;

    if (x < width && y < height) {
        int i = y;
        int j = x;
        for(int k=0; k< channels; k++) {
            s = d_in[i*widthStep + j*channels + k];
            s = 255-d_in[i*widthStep + j*channels + k];
            d_out[i*widthStep + j*channels + k]=s;
        }

    }
}

void negatif_parallel( unsigned char* h_in, unsigned char* h_out,  int width, int height, int widthStep,int channels)
{
    unsigned char* d_in;
    unsigned char* d_out;
    cudaMalloc((void**) &d_in, width*height*channels);
    cudaMalloc((void**) &d_out, width*height*channels);

    gpuErrchk(cudaMemcpy(d_in, h_in, width*height*channels, cudaMemcpyHostToDevice));
    dim3 block (16,16);
    dim3 grid ((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);  // round up so edge pixels are covered when the dimensions are not multiples of 16
    kernel<<<grid, block>>>(d_in, d_out, width, height, widthStep, channels);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() ); // Not strictly required because the next call, cudaMemcpy, is blocking

    gpuErrchk(cudaMemcpy(h_out, d_out, width * height * channels, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaFree(d_in));
    gpuErrchk(cudaFree(d_out));
}
Roger Dahl
  • My god, I did not add the number of channels to the memory allocation. Thank you also for showing the CUDA error checking, as I am still a beginner at CUDA. Thanks very much, god bless you all. It's solved :)) – bagusbekam Nov 13 '13 at 03:03
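As a minimal sketch of the single-buffer, in-place variant mentioned at the start of the answer (assuming 8-bit images and reusing the gpuErrchk macro defined above; the names kernel_inplace and negatif_inplace are illustrative, not part of the original code):

// In-place variant (sketch): one device buffer, the kernel overwrites its input.
__global__ void kernel_inplace(unsigned char* d_img, int width, int height, int widthStep, int channels)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        for (int k = 0; k < channels; k++) {
            int idx = y*widthStep + x*channels + k;
            d_img[idx] = 255 - d_img[idx];  // overwrite the pixel in place
        }
    }
}

void negatif_inplace(unsigned char* h_img, int width, int height, int widthStep, int channels)
{
    unsigned char* d_img;
    size_t bytes = (size_t)widthStep * height;  // widthStep already accounts for channels (and any row padding)

    gpuErrchk(cudaMalloc((void**) &d_img, bytes));
    gpuErrchk(cudaMemcpy(d_img, h_img, bytes, cudaMemcpyHostToDevice));

    dim3 block(16, 16);
    dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
    kernel_inplace<<<grid, block>>>(d_img, width, height, widthStep, channels);
    gpuErrchk(cudaPeekAtLastError());

    gpuErrchk(cudaMemcpy(h_img, d_img, bytes, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaFree(d_img));
}

Calling negatif_inplace(h_in, ...) would overwrite the host buffer with the negated image, so the original would need to be copied first (for example with cvCloneImage) if it is still wanted for display.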