0

I am trying to copy data from host to device in my GPU greyscale filter program. However, something is wrong: when I try to do so, nothing happens. I probably have some mistakes in my code, but the compiler doesn't show any errors. I need to copy the host buffer bufferRGB into d_bufferRGB on the GPU, process it there, and get the result back from d_new_bufferRGB in order to save it with the function save_bmp().

EDIT 1: I implemented CUDA error checking in main(). It reports an invalid argument on this line: cudaMemcpy(d_bufferRGB, bufferRGB, size, cudaMemcpyHostToDevice)

HERE is the code >>>

     #include <stdio.h>
        #include <stdlib.h>
        #include <Windows.h>
        #include <cuda_runtime.h>
        #include <cuda.h>
        #include "device_launch_parameters.h"
        #include <iostream>


        // Wraps a CUDA runtime call and reports a failure together with the call site.
        #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
        // Prints the CUDA error string with file and line to stderr when `code` is
        // not cudaSuccess, then terminates the process with `code` as the exit
        // status unless `abort` is false.
        inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
        {
            // Nothing to report for a successful call.
            if (code == cudaSuccess)
                return;

            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

            if (abort)
                exit(code);
        }
        // Image dimensions in pixels; main() passes their addresses to load_bmp(),
        // and save_bmp() reads them directly.
        int width, heigth;
        // Byte count derived from the BMP file header (bfSize - bfOffBits); set in load_bmp().
        long size;
        // Device-side copy of `size`, allocated and filled in main().
        long *d_size;
        // Host pixel buffers: the loaded image and the destination for the filtered result.
        RGBTRIPLE *bufferRGB, *new_bufferRGB;
        // Device pixel buffers: kernel input and kernel output.
        RGBTRIPLE *d_bufferRGB, *d_new_bufferRGB;


        // Reads a BMP file into a newly malloc'd pixel array.
        //
        //   bufferRGB - out: receives the allocated pixel array
        //   width     - out: receives biWidth from the info header
        //   heigth    - out: receives biHeight from the info header
        //   file_name - path of the file to open
        //
        // Also sets the global `size` and prints it to std::cout.
        // NOTE(review): none of the fopen/fread/malloc results are checked.
        void load_bmp(RGBTRIPLE **bufferRGB, int *width, int *heigth, const char *file_name)
        {
            BITMAPFILEHEADER bmp_file_header;
            BITMAPINFOHEADER bmp_info_header;
            FILE *file;

            file = fopen(file_name, "rb");

            // The two headers are read back to back from the start of the file.
            fread(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, file);

            fread(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, file);

            *width = bmp_info_header.biWidth;
            *heigth = bmp_info_header.biHeight;
            // Global `size`: file size minus the offset at which pixel data starts.
            size = (bmp_file_header.bfSize - bmp_file_header.bfOffBits);
            std::cout << "velkost nacitanych pixelov je " << size <<'\n';

            int x, y;
            // Reserves 4 bytes per pixel.
            // NOTE(review): presumably meant as sizeof(RGBTRIPLE) - confirm the real struct size.
            *bufferRGB = (RGBTRIPLE *)malloc(*width* *heigth * 4);

            // Skip whatever lies between the end of the two headers and the pixel data.
            fseek(file, bmp_file_header.bfOffBits - sizeof(bmp_file_header) - sizeof(bmp_info_header), SEEK_CUR);

            for (y = 0; y < *heigth; y++)
            {
                for (x = 0; x < *width; x++)
                {
                    // Three bytes per pixel are consumed in the order blue, green, red.
                    (*bufferRGB)[(y * *width + x)].rgbtBlue = fgetc(file);
                    (*bufferRGB)[(y * *width + x)].rgbtGreen = fgetc(file);
                    (*bufferRGB)[(y * *width + x)].rgbtRed = fgetc(file);
                }
                // Discard the bytes that pad each 3*width-byte row up to a multiple of 4.
                for (x = 0; x < (4 - (3 * *width) % 4) % 4; x++)
                    fgetc(file);
            }
            fclose(file);
        }

        // Writes `bufferRGB` to `new_name` as a BMP, reusing both headers of `old_name`.
        //
        //   bufferRGB - pixel array indexed as y * width + x
        //   new_name  - path of the file to create
        //   old_name  - path of an existing BMP whose two headers are copied verbatim
        //
        // Image dimensions come from the globals `width` and `heigth`.
        // NOTE(review): none of the fopen/fread/fwrite/malloc results are checked.
        void save_bmp(RGBTRIPLE *bufferRGB, const char *new_name, const char *old_name)
        {
            BITMAPFILEHEADER bmp_file_header;
            BITMAPINFOHEADER bmp_info_header;
            FILE *file;

            // Fetch the headers from the source image.
            file = fopen(old_name, "rb");

            fread(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, file);

            fread(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, file);
            fclose(file);

            file = fopen(new_name, "wb");

            fwrite(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, file);
            fwrite(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, file);
            // Move to the pixel-data offset recorded in the file header.
            fseek(file, bmp_file_header.bfOffBits - sizeof(bmp_file_header) - sizeof(bmp_info_header), SEEK_CUR);

            // Number of bytes needed to pad a 3*width-byte row up to a multiple of 4.
            int alligment_x = (4 - (3 * width) % 4) % 4;
            // Staging buffer holding every row including its padding.
            unsigned char *to_save = (unsigned char *)malloc((width * 3 + alligment_x)*heigth);
            unsigned int index = 0;
            int x, y;

            for (y = 0; y < heigth; y++)
            {
                for (x = 0; x < width; x++)
                {
                    // Each pixel is emitted as blue, green, red.
                    to_save[index++] = bufferRGB[(y * width + x)].rgbtBlue;
                    to_save[index++] = bufferRGB[(y * width + x)].rgbtGreen;
                    to_save[index++] = bufferRGB[(y * width + x)].rgbtRed;
                }
                // Zero-fill the row padding.
                for (x = 0; x < alligment_x; x++)
                    to_save[index++] = 0;
            }
            // NOTE(review): sizeof(&to_save) is the size of a pointer, not the byte count of to_save.
            std::cout << "velkost na ulozenie je " << sizeof(&to_save) << '\n';
            fwrite(to_save, (width * 3 + alligment_x)*heigth, 1, file);

            fclose(file);
            free(to_save);
        }


        // Grayscale filter kernel. Each thread derives one index from the x
        // components of its block and thread coordinates and, when that index is
        // below *d_size, writes the integer mean of the input pixel's red, green
        // and blue channels into all three channels of the output pixel.
        __global__ void CUDA_filter_grayscale(const RGBTRIPLE *d_bufferRGB, RGBTRIPLE *d_new_bufferRGB, long *d_size)
        {
            const int pixel = blockIdx.x * blockDim.x + threadIdx.x;

            // Indices that are not below *d_size are left untouched.
            if (!(pixel < *d_size))
                return;

            const RGBTRIPLE *in = &d_bufferRGB[pixel];
            const BYTE mean = (in->rgbtRed + in->rgbtGreen + in->rgbtBlue) / 3;

            RGBTRIPLE *out = &d_new_bufferRGB[pixel];
            out->rgbtRed = mean;
            out->rgbtGreen = mean;
            out->rgbtBlue = mean;
        }

        // Program entry: allocates device buffers, loads test.bmp, runs the
        // grayscale kernel and saves the result as filter_grayscale_GPU.bmp.
        int main()
    {

            // NOTE(review): load_bmp() has not run yet at this point, so width and
            // heigth have not been set when these allocation sizes are computed.
            gpuErrchk(cudaMalloc(&d_new_bufferRGB, width*heigth * 4));
            gpuErrchk(cudaMalloc(&d_bufferRGB, width*heigth * 4));
            gpuErrchk(cudaMalloc(&d_size, sizeof(size)));

            load_bmp(&bufferRGB, &width, &heigth, "test.bmp"); // this is where the *buffer_RGB pointer is created and filled

            // Send the byte count and the loaded pixels to the device.
            gpuErrchk(cudaMemcpy(d_size, &size, sizeof(size), cudaMemcpyHostToDevice));
            gpuErrchk(cudaMemcpy(d_bufferRGB, bufferRGB, size, cudaMemcpyHostToDevice));

            CUDA_filter_grayscale << <32, 512 >> > (d_bufferRGB, d_new_bufferRGB, d_size); // fixed launch of 32 blocks x 512 threads; launch size is not a concern for now

            // NOTE(review): new_bufferRGB is never allocated anywhere in this
            // program before it is used as the copy destination here.
            gpuErrchk(cudaMemcpy(new_bufferRGB, d_new_bufferRGB, size, cudaMemcpyDeviceToHost));

            save_bmp(new_bufferRGB, "filter_grayscale_GPU.bmp", "test.bmp");
    } 

It has been killing my brain for several days; please help me with this.

  • 1
    any time you are having trouble with a CUDA code, it's good practice to use [proper CUDA error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) and run your code with `cuda-memcheck`, *before* asking others for help. Even if you don't understand the error output, it will be useful to others trying to help you. – Robert Crovella May 10 '17 at 14:48
  • Why are you using so many C functions (`fread`, `malloc`, `fwrite`)? – Ben Steffan May 10 '17 at 14:49
  • To Robert : Ok, I will implement error checking and edit question. To Ben: Those are used for loading and storing information about pixels from .bmp file. This way I managed it -> Seems like I'm not so good in programming – Jaroslav Petrík May 10 '17 at 14:53
  • Try giving a [mcve] – Passer By May 10 '17 at 15:00
  • 1
    at least one problem is here: `cudaMemcpy(&d_size, &size, sizeof(size), cudaMemcpyHostToDevice);` The ampersand (`&`) on `d_size` does not belong there. It should be removed. If you had implemented proper CUDA error checking, you would have been alerted to this already. – Robert Crovella May 10 '17 at 15:01
  • @JaroslavPetrík: I am not sure if you have the same problem but check this nice question about copying RGB image in CUDA. http://stackoverflow.com/questions/31673359/understanding-the-copy-done-by-memcpy – skm May 10 '17 at 15:05
  • @skm: I'm afraid this question will not help because it is copying memory only on host(CPU memory) and I need to transfer part of memory between CPU and GPU. Also it was copying from .png to .jpg data type. – Jaroslav Petrík May 10 '17 at 15:38
  • The problem is in variables in these lines of code according to error check, but I don't know which variable is problem. cudaMemcpy(d_bufferRGB, &bufferRGB, size, cudaMemcpyHostToDevice); cudaMemcpy(new_bufferRGB, d_new_bufferRGB, size, cudaMemcpyDeviceToHost); I undesrtand that the first variable is destination, the second is source and the third is size to copy. What is my mistake ? – Jaroslav Petrík May 10 '17 at 15:49
  • What error do you get? Are you sure `size` is not greater than `width*height*4`? Is `sizeof(RGBTRIPLE)` meant to be `4`? (Referring to the first two lines in the `main` function) And I am curious: Why do you declare the variables global _and_ parse them to the functions? – BlameTheBits May 10 '17 at 16:41
  • The line of code you are showing in your comment: `cudaMemcpy(d_bufferRGB, &bufferRGB, size, cudaMemcpyHostToDevice); ` does not match the line of code you have posted in your question: `cudaMemcpy(d_bufferRGB, bufferRGB, size, cudaMemcpyHostToDevice)`. The one in your posting is of the correct form. The one in your comment is not. If you're working with code that you haven't posted, I'm not sure how you expect help from others. – Robert Crovella May 10 '17 at 17:16
  • @Shadow The error says : "Invalid argument". size is int value read from BMP header and it is the same value as width*height*3; I made *4 bcs sizeof(RGBTRIPLE) is 4. This type is defined by microsoft in some of its libraries. In my opinion it should be 3 bcs it stores valued red, green and blue (8 bits each). The variables are global because i have implemented much more filters on host CPU (clasical c++ code) and now I am trying to do the same filters on GPU device. OFF TOPIC: how do you mark variables with grey ? I am new to this website. – Jaroslav Petrík May 10 '17 at 17:29
  • @Robert Crovella I made a typo in comment. Sorry. But I am so unlucky that either one option does not work for me, still with the same error. Compiler acts as everything is ok but cudaerrorcheck says that I have invalid argument in function cudaMemcpy. The result is that i am not able to save bitmap and i am afraid that the code dont even load the bufferRGB into gpu's d_bufferRGB. – Jaroslav Petrík May 10 '17 at 17:39
  • one problem you have now is that your `width` and `heigth` parameters are being used in `main` uninitialized. They are initialized by `load_bmp`, but you are not calling that till later. Call `load_bmp` as the very first line of code in `main`. The next issue is that your bitmap `size` calculation is not correct. – Robert Crovella May 10 '17 at 20:08
  • another problem is that you are not allocating for `new_bufferRGB` anywhere. You should provide a host buffer allocation before attempting to use it in the final `cudaMemcpy` – Robert Crovella May 10 '17 at 20:14
  • @ Robert Crovella MAN I ABSOLUTELY ADMIRE YOU. You made it. Thanks a milion times. Now I only need to make dynamic allocation of size of kernel but i think i can make it. Please make your comment an answer so I can mark it as solution and give you points you deserve. BTW : why is size calculation wrong ? as far as I can see it works and when I was debbuging filters for CPU, the actual size was the same as width*heigth*3 (which I think is size of pixel data). – Jaroslav Petrík May 10 '17 at 20:55
  • I was getting an incorrect size. I was also (obviously) using a different bitmap file than you, and that may have been the issue. I was using a BM8 (photoshop) bitmap file. Anyway, if the `size` is correct for you, great. – Robert Crovella May 10 '17 at 21:12
  • Yeah, this code is wrote for 24 color depth, which is BMP ver 3 by Microsoft. BMP of other version have different variables in headers. – Jaroslav Petrík May 10 '17 at 21:15
  • I was never able to get my code to run perfectly, probably because I was using the wrong kind of BMP file. My suggestion would be that you post your updated/fixed code as an answer. I think that would be most useful for future readers. If you post a reasonably complete answer I would upvote it. – Robert Crovella May 11 '17 at 16:18
  • Ok. I'm gonna put it there. – Jaroslav Petrík May 12 '17 at 17:55
  • One last question @RobertCrovella U dont have to answer me. When i tried to somehow make a frame around a picture in cuda kernel, it made som strange lines around each kernel. Don't you know how to find out if some pixel is for example 5 r less pixels far from the edge of original picture, not 5 pixels far from kernel edge ? – Jaroslav Petrík May 12 '17 at 18:29
  • I wouldn't know without seeing the code. Probably you should ask a new question if you need help with it. – Robert Crovella May 12 '17 at 19:01

1 Answers1

1

So, with significant help from @Robert Crovella, I have finished my code. I also added some extra features, such as dynamic kernel sizing, as a free gift for internet users. The code is fully functional for BMP ver. 3 from Microsoft (you can create one in Paint). I tried to upload an example image, but uploads are limited to 2 MB, which is not enough for true color depth. When compiling, a null-pointer error is reported, but the program is still built and stored in the project's Debug folder. When you run it with an image in that folder, it works without problems.

The problems with the code above were: 1. new_bufferRGB was uninitialised (never allocated); 2. load_bmp() was called only after the variables it sets had already been used; 3. mistakes in the cudaMemcpy calls.

SO, HERE IS THE CODE >>>

#include <stdio.h>
#include <stdlib.h>
#include <Windows.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "device_launch_parameters.h"
#include <iostream>


// Image dimensions in pixels; main() passes their addresses to load_bmp(),
// and save_bmp() reads them directly.
int width, heigth;
// Byte count derived from the BMP file header (bfSize - bfOffBits); set in load_bmp().
long size;
// Device-side copy of `size`, allocated and filled in main().
long *d_size;
// Host pixel buffers: the loaded image and the destination for the filtered result.
RGBTRIPLE *bufferRGB, *new_bufferRGB;
// Device pixel buffers: kernel input and kernel output.
RGBTRIPLE *d_bufferRGB, *d_new_bufferRGB;

// Wraps a CUDA runtime call, e.g. gpuErrchk(cudaMalloc(&p, n));
// The do/while(0) form makes the macro expand to exactly one statement, so it
// is safe inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA runtime call.
//
//   code  - status returned by the CUDA call
//   file  - source file of the call site (supplied by gpuErrchk)
//   line  - source line of the call site (supplied by gpuErrchk)
//   abort - when true (the default) the process exits with `code` as its
//           status after the message is printed; pass false to only log
//
// Does nothing when `code` is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

    // Honour the abort flag: carrying on after e.g. a failed cudaMalloc would
    // only hand invalid pointers to the calls that follow.
    if (abort)
        exit(code);
}

// Loads an uncompressed, bottom-up, 24-bit BMP file into a pixel array.
//
// Parameters:
//   bufferRGB - out: receives a calloc'd array of (*width * *heigth) pixels,
//               stored row by row in the order the rows appear in the file.
//               The caller owns it and releases it with free(). NULL on failure.
//   width     - out: image width in pixels (0 on failure).
//   heigth    - out: image height in pixels (0 on failure).
//   file_name - path of the BMP file to read.
//
// Side effects: sets the global `size` to the number of bytes the pixel data
// occupies in the file (every row padded to a multiple of 4 bytes), or 0 on
// failure, and prints it to std::cout on success. Failures are reported on
// stderr; the function never dereferences a NULL FILE*.
void load_bmp(RGBTRIPLE **bufferRGB, int *width, int *heigth, const char *file_name)
{
    BITMAPFILEHEADER bmp_file_header;
    BITMAPINFOHEADER bmp_info_header;

    // Start from a well-defined "nothing loaded" state so callers can detect failure.
    *bufferRGB = NULL;
    *width = 0;
    *heigth = 0;
    size = 0;

    FILE *file = fopen(file_name, "rb");
    if (file == NULL)
    {
        fprintf(stderr, "load_bmp: cannot open '%s'\n", file_name);
        return;
    }

    if (fread(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, file) != 1 ||
        fread(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, file) != 1)
    {
        fprintf(stderr, "load_bmp: '%s' is too short to hold the BMP headers\n", file_name);
        fclose(file);
        return;
    }

    const long w = bmp_info_header.biWidth;
    const long h = bmp_info_header.biHeight;

    // The unpacking below assumes 3 bytes per pixel and no compression.
    // 0x4D42 is the "BM" signature; compression 0 is BI_RGB (uncompressed).
    // A non-positive height (top-down bitmap) is not supported either.
    if (bmp_file_header.bfType != 0x4D42 ||
        bmp_info_header.biBitCount != 24 ||
        bmp_info_header.biCompression != 0 ||
        w <= 0 || h <= 0)
    {
        fprintf(stderr, "load_bmp: '%s' is not an uncompressed bottom-up 24-bit BMP\n", file_name);
        fclose(file);
        return;
    }

    // Callers compute width * heigth * 4 in int arithmetic, so refuse images
    // for which that product would not fit in a 32-bit signed int.
    const size_t max_pixels = 0x7FFFFFFFu / 4;
    if ((size_t)w > max_pixels / (size_t)h)
    {
        fprintf(stderr, "load_bmp: '%s' is too large (%ld x %ld)\n", file_name, w, h);
        fclose(file);
        return;
    }

    const size_t pixel_count = (size_t)w * (size_t)h;
    const size_t row_bytes = (size_t)w * 3;                  // payload bytes per row
    const size_t stride = (row_bytes + 3) & ~(size_t)3;      // row length padded to 4 bytes

    // Keep the 4-bytes-per-pixel capacity of the buffer: callers may copy
    // `size` bytes (padded rows) out of it, and stride * h never exceeds
    // 4 * w * h. calloc keeps the bytes behind the last pixel defined.
    RGBTRIPLE *pixels = (RGBTRIPLE *)calloc(pixel_count, 4);
    unsigned char *row = (unsigned char *)malloc(stride);
    if (pixels == NULL || row == NULL)
    {
        fprintf(stderr, "load_bmp: out of memory while loading '%s'\n", file_name);
        free(row);
        free(pixels);
        fclose(file);
        return;
    }

    // Pixel data starts bfOffBits bytes from the beginning of the file.
    bool ok = fseek(file, (long)bmp_file_header.bfOffBits, SEEK_SET) == 0;

    for (long y = 0; ok && y < h; y++)
    {
        // Read one padded row; only the payload part is mandatory, so a file
        // that omits the padding of its final row is still accepted.
        if (fread(row, 1, stride, file) < row_bytes)
        {
            ok = false;
            break;
        }

        RGBTRIPLE *dst = pixels + (size_t)y * (size_t)w;
        for (long x = 0; x < w; x++)
        {
            // Channels are stored in the file as blue, green, red.
            dst[x].rgbtBlue = row[3 * x];
            dst[x].rgbtGreen = row[3 * x + 1];
            dst[x].rgbtRed = row[3 * x + 2];
        }
    }

    free(row);
    fclose(file);

    if (!ok)
    {
        fprintf(stderr, "load_bmp: '%s' ends before all pixel data was read\n", file_name);
        free(pixels);
        return;
    }

    *bufferRGB = pixels;
    *width = (int)w;
    *heigth = (int)h;
    // Derived from the dimensions rather than from bfSize, which some writers
    // leave at zero or inflate with trailing bytes.
    size = (long)(stride * (size_t)h);
    std::cout << "size of loaded pixels is " << size << '\n';
}

// Writes `bufferRGB` to `new_name` as a 24-bit BMP, reusing the file header
// and info header of `old_name` verbatim.
//
// Parameters:
//   bufferRGB - pixel array indexed as y * width + x, where width and heigth
//               are the global image dimensions
//   new_name  - path of the file to create (overwritten if it exists)
//   old_name  - path of an existing BMP that supplies both headers
//
// Nothing is written unless the headers could be read and the staging buffer
// could be allocated. Every failure is reported on stderr.
void save_bmp(RGBTRIPLE *bufferRGB, const char *new_name, const char *old_name)
{
    BITMAPFILEHEADER bmp_file_header;
    BITMAPINFOHEADER bmp_info_header;

    if (bufferRGB == NULL || width <= 0 || heigth <= 0)
    {
        fprintf(stderr, "save_bmp: no image data to save\n");
        return;
    }

    // Fetch the headers from the source image.
    FILE *file = fopen(old_name, "rb");
    if (file == NULL)
    {
        fprintf(stderr, "save_bmp: cannot open '%s'\n", old_name);
        return;
    }
    const bool headers_ok =
        fread(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, file) == 1 &&
        fread(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, file) == 1;
    fclose(file);

    // A pixel-data offset inside the headers would make the pixel write below
    // overwrite the headers that were just written.
    if (!headers_ok ||
        bmp_file_header.bfOffBits < sizeof(bmp_file_header) + sizeof(bmp_info_header))
    {
        fprintf(stderr, "save_bmp: '%s' has no usable BMP headers\n", old_name);
        return;
    }

    // Each row holds 3 bytes per pixel and is zero-padded to a multiple of 4 bytes.
    const size_t row_bytes = (size_t)width * 3;
    const size_t padding = (4 - row_bytes % 4) % 4;
    const size_t stride = row_bytes + padding;
    const size_t total = stride * (size_t)heigth;

    unsigned char *to_save = (unsigned char *)malloc(total);
    if (to_save == NULL)
    {
        fprintf(stderr, "save_bmp: out of memory\n");
        return;
    }

    size_t index = 0;
    for (int y = 0; y < heigth; y++)
    {
        const RGBTRIPLE *src = bufferRGB + (size_t)y * (size_t)width;
        for (int x = 0; x < width; x++)
        {
            // Channels are stored in the file as blue, green, red.
            to_save[index++] = src[x].rgbtBlue;
            to_save[index++] = src[x].rgbtGreen;
            to_save[index++] = src[x].rgbtRed;
        }
        for (size_t p = 0; p < padding; p++)
            to_save[index++] = 0;
    }

    file = fopen(new_name, "wb");
    if (file == NULL)
    {
        fprintf(stderr, "save_bmp: cannot create '%s'\n", new_name);
        free(to_save);
        return;
    }

    // Headers first, then the pixel data at the offset the file header records.
    bool ok = fwrite(&bmp_file_header, sizeof(BITMAPFILEHEADER), 1, file) == 1 &&
              fwrite(&bmp_info_header, sizeof(BITMAPINFOHEADER), 1, file) == 1 &&
              fseek(file, (long)bmp_file_header.bfOffBits, SEEK_SET) == 0 &&
              fwrite(to_save, 1, total, file) == total;

    // fclose flushes buffered data, so its result is part of the write's success.
    if (fclose(file) != 0)
        ok = false;
    if (!ok)
        fprintf(stderr, "save_bmp: failed while writing '%s'\n", new_name);

    free(to_save);
}


// Grayscale filter kernel.
//
//   d_bufferRGB     - input pixels
//   d_new_bufferRGB - output pixels
//   d_size          - pointer to the exclusive upper bound on the index a
//                     thread may process
//
// Each thread derives one index from the x components of its block and thread
// coordinates. If that index is below *d_size, the thread stores the integer
// mean of the input pixel's three channels in every channel of the output pixel.
__global__ void CUDA_filter_grayscale(const RGBTRIPLE *d_bufferRGB, RGBTRIPLE *d_new_bufferRGB, long *d_size)
{
    const int pixel = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads beyond the bound have nothing to do.
    if (pixel >= *d_size)
        return;

    const RGBTRIPLE *source = &d_bufferRGB[pixel];
    const BYTE gray = (source->rgbtRed + source->rgbtGreen + source->rgbtBlue) / 3;

    RGBTRIPLE *target = &d_new_bufferRGB[pixel];
    target->rgbtRed = gray;
    target->rgbtGreen = gray;
    target->rgbtBlue = gray;
}

// Loads test.bmp, converts it to grayscale on the GPU and writes the result
// to filter_grayscale_GPU.bmp. Returns 0 on success and 1 when the image
// could not be loaded or host memory ran out.
int main()
{
    // Load first: width, heigth and bufferRGB are only valid after this call.
    load_bmp(&bufferRGB, &width, &heigth, "test.bmp");
    if (bufferRGB == NULL || width <= 0 || heigth <= 0)
    {
        fprintf(stderr, "main: no image data loaded from test.bmp\n");
        return 1;
    }

    // The kernel indexes pixels, so it needs the pixel count as its bound,
    // not a byte count. Guard the arithmetic against overflow first.
    const long long pixels64 = (long long)width * heigth;
    if (pixels64 > 0x7FFFFFFF / 4)
    {
        fprintf(stderr, "main: image is too large (%d x %d)\n", width, heigth);
        free(bufferRGB);
        return 1;
    }
    long pixel_count = (long)pixels64;

    // Both host buffers and both device buffers hold exactly this many bytes,
    // so every copy below uses the same size as its allocation.
    const size_t image_bytes = (size_t)pixel_count * sizeof(RGBTRIPLE);

    // Host buffer that receives the processed image.
    new_bufferRGB = (RGBTRIPLE *)malloc(image_bytes);
    if (new_bufferRGB == NULL)
    {
        fprintf(stderr, "main: out of host memory\n");
        free(bufferRGB);
        return 1;
    }

    // Device allocations.
    gpuErrchk(cudaMalloc(&d_new_bufferRGB, image_bytes));
    gpuErrchk(cudaMalloc(&d_bufferRGB, image_bytes));
    gpuErrchk(cudaMalloc(&d_size, sizeof(pixel_count)));

    // Upload the pixel count and the image.
    gpuErrchk(cudaMemcpy(d_size, &pixel_count, sizeof(pixel_count), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_bufferRGB, bufferRGB, image_bytes, cudaMemcpyHostToDevice));

    // One thread per pixel; round the block count up so the tail is covered.
    const int numbThreads = 256;
    const int numbBlocks = (int)((pixel_count + numbThreads - 1) / numbThreads);

    CUDA_filter_grayscale<<<numbBlocks, numbThreads>>>(d_bufferRGB, d_new_bufferRGB, d_size);
    // A kernel launch returns no status itself; a bad launch configuration is
    // only visible through cudaGetLastError().
    gpuErrchk(cudaGetLastError());

    // Blocking copy: it waits for the kernel to finish before copying, and it
    // also reports errors raised while the kernel was running.
    gpuErrchk(cudaMemcpy(new_bufferRGB, d_new_bufferRGB, image_bytes, cudaMemcpyDeviceToHost));

    // Save the result.
    save_bmp(new_bufferRGB, "filter_grayscale_GPU.bmp", "test.bmp");

    // Release device and host memory.
    gpuErrchk(cudaFree(d_size));
    gpuErrchk(cudaFree(d_bufferRGB));
    gpuErrchk(cudaFree(d_new_bufferRGB));
    free(new_bufferRGB);
    free(bufferRGB);

    return 0;
}