I am trying to implement image processing algorithm like gaussian filtering, bilateral filtering in GPU using glsl.
And I am getting confused with which part is "really" parallel execution. for example, I have a 1280*720 preview as texture. I am not quite sure which part is really running for 1280*720 times and which part is not.
what's the dispatching mechanism of glsl codes?
my gaussian filtering code is like:
#extension GL_OES_EGL_image_external : require
precision mediump float;
varying vec2 vTextureCoord;
uniform samplerExternalOES sTexture;
uniform sampler2D sTextureMask;
void main() {
float r=texture2D(sTexture, vTextureCoord).r;
float g=texture2D(sTexture, vTextureCoord).g;
float b=texture2D(sTexture, vTextureCoord).b;
// a test sample
float test=1.0*0.5;
float width=1280.0;
float height=720.0;
vec4 sum;
//offsets of a 3*3 kernel
vec2 offset0=vec2(-1.0,-1.0); vec2 offset1=vec2(0.0,-1.0); vec2 offset2=vec2(1.0,-1.0);
vec2 offset3=vec2(-1.0,0.0); vec2 offset4=vec2(0.0,0.0); vec2 offset5=vec2(1.0,0.0);
vec2 offset6=vec2(-1.0,1.0); vec2 offset7=vec2(0.0,1.0); vec2 offset8=vec2(1.0,1.0);
//gaussina kernel with sigma==100.0;
float kernelValue0 = 0.999900; float kernelValue1 = 0.999950; float kernelValue2 = 0.999900;
float kernelValue3 = 0.999950; float kernelValue4 =1.000000; float kernelValue5 = 0.999950;
float kernelValue6 = 0.999900; float kernelValue7 = 0.999950; float kernelValue8 = 0.999900;
vec4 cTemp0;vec4 cTemp1;vec4 cTemp2;vec4 cTemp3;vec4 cTemp4;vec4 cTemp5;vec4 cTemp6;vec4 cTemp7;vec4 cTemp8;
//getting 3*3 pixel values around current pixel
vec2 src_coor_2;
src_coor_2=vec2(vTextureCoord[0]+offset0.x/width,vTextureCoord[1]+offset0.y/height);
cTemp0=texture2D(sTexture, src_coor_2);
src_coor_2=vec2(vTextureCoord[0]+offset1.x/width,vTextureCoord[1]+offset1.y/height);
cTemp1=texture2D(sTexture, src_coor_2);
src_coor_2=vec2(vTextureCoord[0]+offset2.x/width,vTextureCoord[1]+offset2.y/height);
cTemp2=texture2D(sTexture, src_coor_2);
src_coor_2=vec2(vTextureCoord[0]+offset3.x/width,vTextureCoord[1]+offset3.y/height);
cTemp3=texture2D(sTexture, src_coor_2);
src_coor_2=vec2(vTextureCoord[0]+offset4.x/width,vTextureCoord[1]+offset4.y/height);
cTemp4=texture2D(sTexture, src_coor_2);
src_coor_2=vec2(vTextureCoord[0]+offset5.x/width,vTextureCoord[1]+offset5.y/height);
cTemp5=texture2D(sTexture, src_coor_2);
src_coor_2=vec2(vTextureCoord[0]+offset6.x/width,vTextureCoord[1]+offset6.y/height);
cTemp6=texture2D(sTexture, src_coor_2);
src_coor_2=vec2(vTextureCoord[0]+offset7.x/width,vTextureCoord[1]+offset7.y/height);
cTemp7=texture2D(sTexture, src_coor_2);
src_coor_2=vec2(vTextureCoord[0]+offset8.x/width,vTextureCoord[1]+offset8.y/height);
cTemp8=texture2D(sTexture, src_coor_2);
//convolution
sum =kernelValue0*cTemp0+kernelValue1*cTemp1+kernelValue2*cTemp2+
kernelValue3*cTemp3+kernelValue4*cTemp4+kernelValue5*cTemp5+
kernelValue6*cTemp6+kernelValue7*cTemp7+kernelValue8*cTemp8;
float factor=kernelValue0+kernelValue1+kernelValue2+kernelValue3+kernelValue4+kernelValue5+kernelValue6+kernelValue7+kernelValue8;
gl_FragColor = sum/factor;
//gl_FragColor=texture2D(sTexture, vTextureCoord);
}
this code is running with lower fps against pure preview on my phone(galaxy nexus).
but if I change the last part of my code to direct output with original pixel value, like
//gl_FragColor = sum/factor;
gl_FragColor=texture2D(sTexture, vTextureCoord);
it would run fast and same fps as pure preview.
the quesion is: things I write for test and useless in the beginning like:
float test=1.0*0.5;
how many time is it executed?
other parts like:
sum =kernelValue0*cTemp0+kernelValue1*cTemp1+kernelValue2*cTemp2+
kernelValue3*cTemp3+kernelValue4*cTemp4+kernelValue5*cTemp5+
kernelValue6*cTemp6+kernelValue7*cTemp7+kernelValue8*cTemp8;
would not run 1280*720 times just when I change
gl_FragColor = sum/factor;
to
gl_FragColor=texture2D(sTexture, vTextureCoord);
?
how is the mechanism to decide which is to run 1280*720 times, which is just useless when parallel though out the pixels? is it done automatically?
what's the architecture, dispatching, how it organize the data to the GPU and other things for a glsl program?
I am wondering what should I do for more complicated operations like bilateral filtering and with kernel size like 9*9 and 9 times per pixel than this 3*3 gaussian kernel.