0

I wrote this piece of code to make a median Blur in CUDA but I am running into an issue, where the channel of image is blurred but it creates stripes which look unusual for blurring.

#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>

using namespace std;
using namespace cv;

#define BLOCK_SIZE      16
#define TILE_SIZE       14
#define FILTER_WIDTH    3
#define FILTER_HEIGHT   3


__device__ void sort(unsigned char* filterVector)
{
    for (int i = 0; i < FILTER_WIDTH*FILTER_HEIGHT; i++) {
        for (int j = i + 1; j < FILTER_WIDTH*FILTER_HEIGHT; j++) {
            if (filterVector[i] > filterVector[j]) {
                unsigned char tmp = filterVector[i];
                filterVector[i] = filterVector[j];
                filterVector[j] = tmp;
            }
        }
    }
}

__global__ void medianFilter(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height)
{

    int x_o = TILE_SIZE * blockIdx.x + threadIdx.x;
    int y_o = TILE_SIZE * blockIdx.y + threadIdx.y;

    int x_i = x_o - (FILTER_HEIGHT / 2);
    int y_i = y_o - (FILTER_WIDTH / 2);

    __shared__ unsigned char sBuffer[BLOCK_SIZE][BLOCK_SIZE];

    if ((x_i >= 0) && (x_i < width) && (y_i >= 0) && (y_i < height)) {
        sBuffer[threadIdx.y][threadIdx.x] = srcImage[y_i * width + x_i];
    } else {
        sBuffer[threadIdx.y][threadIdx.x] = 0;
    }

    __syncthreads();

    unsigned char filterVector[FILTER_WIDTH*FILTER_HEIGHT];

    // int size_vec = sizeof(filterVector) / sizeof(filterVector[0]);

    // printf("%d \n", size_vec);

    if (threadIdx.x < TILE_SIZE && threadIdx.y < TILE_SIZE) {
        for (int r = 0; r < FILTER_HEIGHT; r++) {
            for (int c = 0; c < FILTER_HEIGHT; c++) {
                filterVector[r*FILTER_HEIGHT+c] = sBuffer[threadIdx.y + r][threadIdx.x + c];
            }
        }
    }

    sort(filterVector);

    if (x_o < width && y_o < height) {
        dstImage[y_o * width + x_o] =  filterVector[4]; // (FILTER_WIDTH*FILTER_HEIGHT)/2
    }

}

int main(int argc, char **argv)
{

    std::string image_path = "./test.jpg";
    cv::Mat img = imread(image_path, IMREAD_COLOR);
    std::string output_file = "test_gpu.jpg";

    if(img.empty())
    {
        std::cout << "Couldn't read img:" << image_path << std::endl;
    }

    Mat bgr[3];
    split(img, bgr);
    
    cv::Mat dstImg (bgr[1].size(), bgr[1].type());

    const int inputSize = img.cols * img.rows;
    const int outputSize = dstImg.cols * dstImg.rows; 
    unsigned char *d_input, *d_output;

    cudaMalloc<unsigned char>(&d_input, inputSize);
    cudaMalloc<unsigned char>(&d_output, outputSize);

    cudaMemcpy(d_input, bgr[1].ptr(), inputSize, cudaMemcpyHostToDevice);

    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid((dstImg.cols + TILE_SIZE - 1)/TILE_SIZE, (dstImg.rows + TILE_SIZE - 1)/TILE_SIZE);

    medianFilter<<<grid,block>>>(d_input, d_output, dstImg.cols, dstImg.rows);

    cudaMemcpy(dstImg.ptr(), d_output, outputSize, cudaMemcpyDeviceToHost);

    cudaFree(d_input);
    cudaFree(d_output);

    imwrite(output_file, dstImg);
}

This is my original image:

original_image

and here is one blurred channel:

enter image description here

For some reason I get those stripes on the output image, which is just one of the channels for now. Any idea why this is happening?

talonmies
  • 70,661
  • 34
  • 192
  • 269
craaaft
  • 211
  • 1
  • 8
  • I check this code on CPU. Works fine. Seams you have problem on border case. I suggest to look at `sBuffer` size. It must be larger on 2 pixels for blur size == 3. And you need to fill this values. With shared buffer it's hard to implement.. Maybe do not use shared buffer? Or you hit with performance issue? – Gralex Aug 16 '22 at 11:24
  • Thanks for checking. Yes, it is. So the sBuffer 1 pixel around the image (because filter is 3x3) with 0 in it. I already accounted for it. – craaaft Aug 16 '22 at 11:34
  • I just originally implemented it with shared buffer, how would I go about doing it without it without writing all the code from scratch? – craaaft Aug 16 '22 at 11:55
  • Did't notice that you have different `TILE_SIZE` and `BLOCK_SIZE`. I've simulate your code on CPU. Seams everything works and your code is correct. Can't check how it works on GPU. – Gralex Aug 16 '22 at 12:16

1 Answers1

2

Your intention is that even though you are launching a block of dimension (BLOCK_SIZE, BLOCK_SIZE), you only intend (TILE_SIZE, TILE_SIZE) threads in that block to actually compute the values for output pixels.

However you are not properly accounting for that here:

if (x_o < width && y_o < height) {

that should be, instead:

if (x_o < width && y_o < height  && threadIdx.x < TILE_SIZE && threadIdx.y < TILE_SIZE) {

(In fact, everything after the __syncthreads() in your kernel can be conditioned to only execute if threadIdx.x < TILE_SIZE && threadIdx.y < TILE_SIZE if you wish.)

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • That was it! Thanks a ton! Quite easy to get lost with these conditions in CUDA. – craaaft Aug 17 '22 at 06:33
  • Hmm, somehow for big blocks I get a black image. For example #define BLOCK_SIZE 64 #define TILE_SIZE 54 #define FILTER_WIDTH 11 #define FILTER_HEIGHT 11. Any idea? – craaaft Aug 17 '22 at 15:05
  • 2
    CUDA is limited to 1024 threads per block, which is the product of the dimensions. BLOCK_SIZE of 64 results in a request of 64*64 = 4096 threads per block. That is illegal. Please use proper CUDA error checking. See [here](https://stackoverflow.com/questions/16125389/invalid-configuration-argument-error-for-the-call-of-cuda-kernel/16125510#16125510) – Robert Crovella Aug 17 '22 at 15:29