I doubt that either of your kernels are working correctly. You have at least 2 issues:
This is not how to create a proper index:
blockDim.x * blockIdx.x * threadIdx.x
a proper index looks like this:
blockDim.x * blockIdx.x + threadIdx.x
this error is evident for both .x
and .y
indices, in both kernels.
You are confused about CUDA kernel launch syntax, e.g. here: CreateGaussFilter<<<dat.height,dat.width>>>
. The first <<<...>>>
argument is the number of blocks per grid. The second is the number of threads per block. If you pass scalar quantities for both of these arguments (which you do) you will get a 1D grid of 1D threadblocks. 1D here means that in-kernel, your .y
index values will always be zero, so this statement: int col = ( blockDim.y * blockIdx.y * threadIdx.y ) + horizontalImageBound;
will result in every thread in your kernel having a col
value of horizontalImageBound
in your in-kernel printf
statement, %i
is not the correct format parameter for a float
quantity.
You should study any CUDA 2D kernel code for proper usage. Changes are needed in both your host code as well as your kernel code.
A few other notes.
It's nice if you don't strip off the headers your code uses. Some people trying to help you will want to run your code. Make it easy for them (if you want help from them). Just my suggestion, as is this entire post of mine.
You are expected to provide a complete example. See item 1 here. For example, your posted code does not indicate a definition of FilterCreation
anywhere. And I don't have your konik.bmp
, so either indicate how I can get it, or even better, write your code that you post here in such a way that it does not depend on an external file. For example create a dummy image in code, and skip the file load process.
This doesn't have to be that hard. Take what you've posted and create a new project with just that code. Does it compile? If not, keep adding to your posting until it compiles. Then does your posted code reproduce the issue? If not, keep adjusting till it does. In other words, put yourself in the place of those trying to help you. Again, just suggestions.
What follows is a code I attempted to build around what you have shown, while avoiding the issues I mentioned above. I make no claim that it produces the correct output, but should give you an idea how to fix some of the mistakes indicated above.
#include <iostream>
#include <fstream>
struct Info{
int width;
int height;
int offset;
unsigned char * info;
unsigned char * data;
int size;
};
Info readBMP(char* filename)
{
int i;
std::ifstream is(filename, std::ifstream::binary);
is.seekg(0, is.end);
i = is.tellg();
is.seekg(0);
unsigned char *info = new unsigned char[i];
is.read((char *)info,i);
int width = *(int*)&info[18];
int height = *(int*)&info[22];
int offset = *(int*)&info[10];
unsigned char a[offset];
unsigned char *b = new unsigned char[i - offset];
std::copy(info,
info + offset,
a);
std::copy(info + offset,
info + i,
b + 0);
Info dat;
dat.width = width;
dat.height = height;
dat.offset = offset;
dat.size = i;
dat.info = new unsigned char[offset - 1];
dat.data = new unsigned char[i - offset + 1];
for( int j = 0; j < offset ; j++ ){
dat.info[j] = a[j];
}
for( int j = 0; j < i - offset; j++ ){
dat.data[j] = b[j];
}
return dat;
}
__global__ void CreateGaussFilter(unsigned char * src, unsigned char * dst, int kernalHeight, int kernalWidth, double *kernalArray, int rows, int cols){
int verticleImageBound=(kernalHeight-1)/2;
int horizontalImageBound=(kernalWidth-1)/2;
int row = ( blockDim.x * blockIdx.x + threadIdx.x ) + verticleImageBound;
int col = ( blockDim.y * blockIdx.y + threadIdx.y ) + horizontalImageBound;
if ( row >= rows - verticleImageBound || col >= cols - horizontalImageBound ) return;
float value=0;
for(int kRow=0;kRow<kernalHeight;kRow++){
for(int kCol=0;kCol<kernalWidth;kCol++){
float pixel = src[ 3 * ((kRow+row-verticleImageBound ) * cols + (kCol+col-horizontalImageBound)) ] * kernalArray[kRow + kCol * kernalWidth];
value+=pixel;
}
}
printf("value = %f\n",round(value));
dst[3 * ( row * cols + col )] = round(value);
dst[3 * ( row * cols + col ) + 1] = round(value);
dst[3 * ( row * cols + col ) + 2] = round(value);
}
__global__ void greyScale( unsigned char * src , int rows, int cols){
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
if( i >= rows || j >= cols ) {
return;
}
unsigned char r = src[3 * (i * cols + j)];
unsigned char g = src[3 * (i * cols + j) + 1];
unsigned char b = src[3 * (i * cols + j) + 2];
unsigned char linearIntensity = (unsigned char)(0.2126f * r + 0.7512f * g + 0);
src[3 * (i * cols + j)] = linearIntensity;
src[3 * (i * cols + j) + 1] = linearIntensity;
src[3 * (i * cols + j) + 2] = linearIntensity;
}
int main() {
double GKernel[5][5] = {0.1};
//FilterCreation(GKernel);
double * kernel = new double[25];
int i,j,k = 0;
for( int i = 0; i < 5; i++){
for( int j = 0; j < 5; j++){
kernel[k++] = GKernel[i][j];
}
}
double * deviceKernel;
cudaMalloc((void **)&deviceKernel, 25 * sizeof(double));
cudaMemcpy(deviceKernel, kernel, 25 * sizeof(double), cudaMemcpyHostToDevice);
Info dat; // = readBMP("konik.bmp");
dat.width = 766;
dat.height = 511;
dat.size = dat.width*dat.height*3;
dat.offset = 0;
dat.data = new unsigned char[dat.size];
unsigned char * devPtr;
unsigned char * devPtrFilter;
size_t pitch;
unsigned char * test= new unsigned char [dat.size - dat.offset ];
cudaMalloc (( void **)& devPtr , dat.size * sizeof ( unsigned char ));
cudaMalloc (( void **)& devPtrFilter , dat.size * sizeof ( unsigned char ));
cudaMemcpy ( devPtr , dat.data , sizeof ( unsigned char ) * dat.size , cudaMemcpyHostToDevice );
dim3 block(32,32);
dim3 grid((dat.height+31)/32, (dat.width+31)/32);
greyScale<<<grid,block>>>(devPtr,dat.height,dat.width);
CreateGaussFilter<<<grid,block>>>(devPtr,devPtrFilter,5,5,deviceKernel,dat.height,dat.width);
cudaMemcpy ( test, devPtrFilter , sizeof ( unsigned char ) * dat.size ,cudaMemcpyDeviceToHost );
cudaDeviceSynchronize ();
#if 0
std::ofstream fout;
fout.open("output.bmp", std::ios::binary | std::ios::out);
fout.write( reinterpret_cast<char *>(dat.info), dat.offset);
fout.write( reinterpret_cast<char *>(test), dat.size - dat.offset );
fout.close();
#endif
return 0;
}
There were a few other issues with your code.
- your RGB to grayscale is using 0 instead of
b
.
- your gaussian kernel does not write to all output points, so we will fill the output with 0 first.
- I've provided my own 5x5 gaussian kernel coefficients.
With those additional changes:
$ cat t8.cu
#include <iostream>
#include <fstream>
#include <stdio.h>
struct Info{
int width;
int height;
int offset;
unsigned char * info;
unsigned char * data;
int size;
};
Info readBMP(const char* filename)
{
int i;
std::ifstream is(filename, std::ifstream::binary);
is.seekg(0, is.end);
i = is.tellg();
is.seekg(0);
unsigned char *info = new unsigned char[i];
is.read((char *)info,i);
int width = *(int*)&info[18];
int height = *(int*)&info[22];
int offset = *(int*)&info[10];
unsigned char a[offset];
unsigned char *b = new unsigned char[i - offset];
std::copy(info,
info + offset,
a);
std::copy(info + offset,
info + i,
b + 0);
Info dat;
dat.width = width;
dat.height = height;
dat.offset = offset;
dat.size = i;
dat.info = new unsigned char[offset - 1];
dat.data = new unsigned char[i - offset + 1];
for( int j = 0; j < offset ; j++ ){
dat.info[j] = a[j];
}
for( int j = 0; j < i - offset; j++ ){
dat.data[j] = b[j];
}
return dat;
}
__global__ void CreateGaussFilter(unsigned char * src, unsigned char * dst, int kernalHeight, int kernalWidth, double *kernalArray, int rows, int cols){
int verticleImageBound=(kernalHeight-1)/2;
int horizontalImageBound=(kernalWidth-1)/2;
int row = ( blockDim.x * blockIdx.x + threadIdx.x ) + verticleImageBound;
int col = ( blockDim.y * blockIdx.y + threadIdx.y ) + horizontalImageBound;
if ( row >= rows - verticleImageBound || col >= cols - horizontalImageBound ) return;
float value=0;
for(int kRow=0;kRow<kernalHeight;kRow++){
for(int kCol=0;kCol<kernalWidth;kCol++){
float pixel = src[ 3 * ((kRow+row-verticleImageBound ) * cols + (kCol+col-horizontalImageBound)) ] * kernalArray[kRow + kCol * kernalWidth];
value+=pixel;
}
}
// printf("value = %f\n",round(value));
dst[3 * ( row * cols + col )] = round(value);
dst[3 * ( row * cols + col ) + 1] = round(value);
dst[3 * ( row * cols + col ) + 2] = round(value);
}
__global__ void greyScale( unsigned char * src , int rows, int cols){
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
if( i >= rows || j >= cols ) {
return;
}
unsigned char r = src[3 * (i * cols + j)];
unsigned char g = src[3 * (i * cols + j) + 1];
unsigned char b = src[3 * (i * cols + j) + 2];
unsigned char linearIntensity = (unsigned char)(0.21f * r + 0.72f * g + 0.07 * b);
src[3 * (i * cols + j)] = linearIntensity;
src[3 * (i * cols + j) + 1] = linearIntensity;
src[3 * (i * cols + j) + 2] = linearIntensity;
}
int main() {
double GKernel[5][5] = {{1,4,7,4,1},{4,16,26,16,4},{7,26,41,26,7},{4,16,26,16,4},{1,4,7,4,1}};
//FilterCreation(GKernel);
double * kernel = new double[25];
int k = 0;
for( int i = 0; i < 5; i++){
for( int j = 0; j < 5; j++){
kernel[k++] = GKernel[i][j]/273;
}
}
double * deviceKernel;
cudaMalloc((void **)&deviceKernel, 25 * sizeof(double));
cudaMemcpy(deviceKernel, kernel, 25 * sizeof(double), cudaMemcpyHostToDevice);
Info dat = readBMP("input.bmp");
#if 0
dat.width = 766;
dat.height = 511;
dat.size = dat.width*dat.height*3;
dat.offset = 0;
dat.data = new unsigned char[dat.size];
for (int i = 0; i<dat.size; i++) dat.data[i] = (i%dat.width)%255;
#endif
unsigned char * devPtr;
unsigned char * devPtrFilter;
unsigned char * test= new unsigned char [dat.size - dat.offset ];
cudaMalloc (( void **)& devPtr , dat.size * sizeof ( unsigned char ));
cudaMalloc (( void **)& devPtrFilter , dat.size * sizeof ( unsigned char ));
cudaMemset(devPtrFilter, 0, dat.size);
cudaMemcpy ( devPtr , dat.data , sizeof ( unsigned char ) * dat.size , cudaMemcpyHostToDevice );
dim3 block(32,32);
dim3 grid((dat.height+31)/32, (dat.width+31)/32);
greyScale<<<grid,block>>>(devPtr,dat.height,dat.width);
CreateGaussFilter<<<grid,block>>>(devPtr,devPtrFilter,5,5,deviceKernel,dat.height,dat.width);
cudaMemcpy ( test, devPtrFilter , sizeof ( unsigned char ) * (dat.size - dat.offset) ,cudaMemcpyDeviceToHost );
cudaDeviceSynchronize ();
std::ofstream fout;
fout.open("output.bmp", std::ios::binary | std::ios::out);
fout.write( reinterpret_cast<char *>(dat.info), dat.offset);
fout.write( reinterpret_cast<char *>(test), dat.size - dat.offset );
fout.close();
return 0;
}
$ nvcc -o t8 t8.cu
$ cuda-memcheck ./t8
and starting with the BarbaraBlocks1.bmp
file that is available at /usr/local/cuda/samples/3_Imaging/dct8x8/data
on a standard, current CUDA linux install, which looks like this:

It produces output like this:
