I am trying to optimize the computation of the mean of each row in my 512w x 1024h
image, and then subtract the mean from the row from which it was computed. I wrote a piece of code which does it in 1.86 ms
, but I want to reduce the speed. This piece of code works fine, but does not use shared memory, and it utilizes for loops. I want to do away with them.
__global__ void subtractMean (const float *__restrict__ img, float *lineImg, int height, int width) {
// height = 1024, width = 512
int tidy = threadIdx.x + blockDim.x * blockIdx.x;
float sum = 0.0f;
float sumDiv = 0.0f;
if(tidy < height) {
for(int c = 0; c < width; c++) {
sum += img[tidy*width + c];
}
sumDiv = (sum/width)/2;
//__syncthreads();
for(int cc = 0; cc < width; cc++) {
lineImg[tidy*width + cc] = img[tidy*width + cc] - sumDiv;
}
}
__syncthreads();
I called the above kernel using:
subtractMean <<< 2, 512 >>> (originalImage, rowMajorImage, actualImHeight, actualImWidth);
However, the following code I wrote uses shared memory to optimize. But, it does not work as expected. Any thoughts on what the problem might be?
__global__ void subtractMean (const float *__restrict__ img, float *lineImg, int height, int width) {
extern __shared__ float perRow[];
int idx = threadIdx.x; // set idx along x
int stride = width/2;
while(idx < width) {
perRow[idx] = 0;
idx += stride;
}
__syncthreads();
int tidx = threadIdx.x; // set idx along x
int tidy = blockIdx.x; // set idx along y
if(tidy < height) {
while(tidx < width) {
perRow[tidx] = img[tidy*width + tidx];
tidx += stride;
}
}
__syncthreads();
tidx = threadIdx.x; // reset idx along x
tidy = blockIdx.x; // reset idx along y
if(tidy < height) {
float sumAllPixelsInRow = 0.0f;
float sumDiv = 0.0f;
while(tidx < width) {
sumAllPixelsInRow += perRow[tidx];
tidx += stride;
}
sumDiv = (sumAllPixelsInRow/width)/2;
tidx = threadIdx.x; // reset idx along x
while(tidx < width) {
lineImg[tidy*width + tidx] = img[tidy*width + tidx] - sumDiv;
tidx += stride;
}
}
__syncthreads();
}
The shared memory function was called using:
subtractMean <<< 1024, 256, sizeof(float)*512 >>> (originalImage, rowMajorImage, actualImHeight, actualImWidth);