While building a rendering engine that fundamentally relies on four-times-four (4x4) oversampling, I ran into a performance problem with the downscaling step itself.
#include <stddef.h>
#include <stdint.h>
/*
 * Lookup table from an 8-bit sRGB-encoded channel value (the array index)
 * to its linear-light intensity. Entries increase from 0 at index 0 to
 * 65536 at index 255, so the top entry does not fit in 16 bits.
 * RGBavg16 indexes this table with each channel byte of a pixel.
 */
const int_fast32_t sRGBtolinear[256] = {0, 20, 40, 60, 80, 99, 119, 139, 159, 179, 199, 219, 241, 264, 288, 313, 340, 367, 396, 427, 458, 491, 526, 562, 599, 637, 677, 718, 761, 805, 851, 898, 947, 997, 1048, 1101, 1156, 1212, 1270, 1330, 1391, 1453, 1517, 1583, 1651, 1720, 1791, 1863, 1937, 2013, 2090, 2170, 2250, 2333, 2418, 2504, 2592, 2681, 2773, 2866, 2961, 3058, 3157, 3258, 3360, 3464, 3570, 3678, 3788, 3900, 4014, 4129, 4247, 4366, 4488, 4611, 4736, 4864, 4993, 5124, 5257, 5392, 5530, 5669, 5810, 5953, 6099, 6246, 6395, 6547, 6701, 6856, 7014, 7174, 7336, 7500, 7666, 7834, 8004, 8177, 8352, 8529, 8708, 8889, 9072, 9258, 9446, 9636, 9828, 10022, 10219, 10418, 10619, 10822, 11028, 11236, 11446, 11658, 11873, 12090, 12309, 12531, 12754, 12981, 13209, 13440, 13673, 13909, 14147, 14387, 14629, 14874, 15122, 15372, 15624, 15878, 16135, 16394, 16656, 16920, 17187, 17456, 17727, 18001, 18278, 18556, 18838, 19121, 19408, 19696, 19988, 20281, 20578, 20876, 21178, 21481, 21788, 22096, 22408, 22722, 23038, 23357, 23679, 24003, 24329, 24659, 24991, 25325, 25662, 26002, 26344, 26689, 27036, 27387, 27739, 28095, 28453, 28813, 29177, 29543, 29911, 30283, 30657, 31033, 31413, 31795, 32180, 32567, 32957, 33350, 33746, 34144, 34545, 34949, 35355, 35765, 36177, 36591, 37009, 37429, 37852, 38278, 38707, 39138, 39572, 40009, 40449, 40892, 41337, 41786, 42237, 42691, 43147, 43607, 44069, 44534, 45003, 45474, 45947, 46424, 46904, 47386, 47871, 48360, 48851, 49345, 49842, 50342, 50844, 51350, 51859, 52370, 52884, 53402, 53922, 54445, 54972, 55501, 56033, 56568, 57106, 57647, 58191, 58738, 59288, 59841, 60397, 60956, 61518, 62083, 62651, 63222, 63796, 64373, 64953, 65536};
/*
 * Ascending thresholds used by lineartosRGB to map a linear intensity back
 * to an 8-bit sRGB code: code c is selected when
 * lineartosRGBthr[c] <= value < lineartosRGBthr[c + 1].
 * NOTE(review): each entry appears to be the rounded midpoint between
 * sRGBtolinear[c - 1] and sRGBtolinear[c] (round-to-nearest) -- confirm
 * against the generator of these tables.
 */
const int_fast32_t lineartosRGBthr[256] = {0, 10, 30, 50, 70, 90, 110, 130, 150, 170, 189, 209, 230, 253, 276, 301, 327, 354, 382, 412, 443, 475, 509, 544, 580, 618, 657, 698, 740, 783, 828, 875, 923, 972, 1023, 1075, 1129, 1185, 1242, 1300, 1360, 1422, 1486, 1551, 1617, 1685, 1755, 1827, 1900, 1975, 2052, 2130, 2210, 2292, 2376, 2461, 2548, 2637, 2727, 2820, 2914, 3010, 3108, 3208, 3309, 3412, 3518, 3625, 3734, 3844, 3957, 4072, 4188, 4307, 4427, 4550, 4674, 4800, 4929, 5059, 5191, 5325, 5461, 5600, 5740, 5882, 6026, 6172, 6321, 6471, 6624, 6779, 6935, 7094, 7255, 7418, 7583, 7750, 7920, 8091, 8265, 8440, 8618, 8798, 8981, 9165, 9352, 9541, 9732, 9925, 10121, 10318, 10518, 10721, 10925, 11132, 11341, 11552, 11766, 11981, 12200, 12420, 12643, 12868, 13095, 13325, 13557, 13791, 14028, 14267, 14508, 14752, 14998, 15247, 15498, 15751, 16007, 16265, 16525, 16788, 17054, 17322, 17592, 17864, 18140, 18417, 18697, 18980, 19265, 19552, 19842, 20135, 20430, 20727, 21027, 21330, 21635, 21942, 22252, 22565, 22880, 23198, 23518, 23841, 24166, 24494, 24825, 25158, 25494, 25832, 26173, 26517, 26863, 27212, 27563, 27917, 28274, 28633, 28995, 29360, 29727, 30097, 30470, 30845, 31223, 31604, 31987, 32373, 32762, 33154, 33548, 33945, 34345, 34747, 35152, 35560, 35971, 36384, 36800, 37219, 37641, 38065, 38493, 38923, 39355, 39791, 40229, 40671, 41115, 41562, 42011, 42464, 42919, 43377, 43838, 44302, 44769, 45238, 45711, 46186, 46664, 47145, 47629, 48116, 48605, 49098, 49593, 50092, 50593, 51097, 51604, 52114, 52627, 53143, 53662, 54184, 54709, 55236, 55767, 56300, 56837, 57377, 57919, 58465, 59013, 59564, 60119, 60676, 61237, 61800, 62367, 62936, 63509, 64084, 64663, 65245};
/*
 * Convert a linear intensity to an 8-bit code by binary-searching the
 * ascending lineartosRGBthr table.
 *
 * value: linear intensity to look up.
 * Returns the code built bit by bit from the most significant bit down:
 * a bit is kept whenever the threshold at the candidate index is <= value.
 * Values below lineartosRGBthr[1] yield 0; values at or above
 * lineartosRGBthr[255] yield 255.
 */
uint_least8_t lineartosRGB(int32_t value){
    uint_least8_t code = 0;
    /* Eight fixed probes: steps 128, 64, ..., 1 decide one result bit each. */
    for (unsigned step = 128u; step != 0u; step >>= 1) {
        const unsigned candidate = code + step;
        if (value >= lineartosRGBthr[candidate]) {
            code = (uint_least8_t)candidate;
        }
    }
    return code;
}
/*
 * Average sixteen packed pixels in linear light and re-encode the result.
 *
 * pixel: pointer to 16 consecutive pixels; the red, green and blue channels
 *        are read from bits 16-23, 8-15 and 0-7 respectively. Bits 24-31
 *        are ignored.
 * Returns the averaged colour packed as (red << 16) | (green << 8) | blue;
 * bits 24-31 of the result are always zero.
 */
uint32_t RGBavg16(const uint32_t* pixel){
    int_fast32_t sum_r = 0;
    int_fast32_t sum_g = 0;
    int_fast32_t sum_b = 0;

    const uint32_t *const stop = pixel + 16;
    for (const uint32_t *p = pixel; p != stop; ++p) {
        const uint32_t packed = *p;
        sum_r += sRGBtolinear[(packed >> 16) & 0xFF];
        sum_g += sRGBtolinear[(packed >> 8) & 0xFF];
        sum_b += sRGBtolinear[packed & 0xFF];
    }

    /* Divide each sum by 16 with round-to-nearest (+8 before the shift). */
    const uint32_t r = lineartosRGB((sum_r + 8) >> 4);
    const uint32_t g = lineartosRGB((sum_g + 8) >> 4);
    const uint32_t b = lineartosRGB((sum_b + 8) >> 4);

    return (r << 16) | (g << 8) | b;
}
/*
 * Downscale an oversampled image by a factor of four in each dimension:
 * every output pixel is RGBavg16() of the matching 4x4 source tile.
 *
 * stagesize:                 stagesize[0] is the number of output pixels per
 *                            row, stagesize[1] the number of output rows.
 * pixels:                    destination buffer.
 * pixelsscanlineoffset:      distance, in pixels, between the starts of
 *                            consecutive destination rows.
 * oversampled:               source buffer; it is only read.
 * oversampledscanlineoffset: distance, in pixels, between the starts of
 *                            consecutive source rows.
 *
 * All offsets are computed in ptrdiff_t rather than int so that large
 * stages do not overflow signed int arithmetic, while negative scanline
 * offsets keep working as before.
 */
void fourtimesfouroversampling(int* stagesize, uint32_t* pixels, int pixelsscanlineoffset, uint32_t* oversampled, int oversampledscanlineoffset){
    /* Read the bounds once; they do not change during the downscale. */
    const int out_width = stagesize[0];
    const int out_height = stagesize[1];
    const ptrdiff_t src_stride = oversampledscanlineoffset;
    const ptrdiff_t dst_stride = pixelsscanlineoffset;

    for (int y = 0; y < out_height; y++) {
        /* Row bases are invariant across the whole output row. */
        const uint32_t *const src_band = oversampled + (ptrdiff_t)y * 4 * src_stride;
        uint32_t *const dst_row = pixels + (ptrdiff_t)y * dst_stride;

        for (int x = 0; x < out_width; x++) {
            const uint32_t *const tile = src_band + (ptrdiff_t)x * 4;
            uint32_t block[16];

            /* Gather the 4x4 tile into the contiguous layout RGBavg16 expects. */
            for (int row = 0; row < 4; row++) {
                const uint32_t *const line = tile + row * src_stride;
                for (int col = 0; col < 4; col++) {
                    block[row * 4 + col] = line[col];
                }
            }

            dst_row[x] = RGBavg16(block);
        }
    }
}
Four-times-four oversampling is a way to achieve anti-aliasing by rendering exactly as one would with no anti-aliasing or smoothing methods (bilevel outline rendering, nearest-neighbor graphics, etc.), but on an oversampled stage that is four times larger in each dimension. A box filter then downscales the image by taking the linear average of each 4x4 square of sixteen oversampled pixels to produce one output pixel. The sRGB/linear conversions are required because sRGB values cannot be averaged directly, as they are not on a linear scale.
To test performance, use the following main after the previous code, which draws random oversampled pixels and downscales them:
#include <stdlib.h>
#include <time.h>
/*
 * Output stage dimensions in pixels. These are enumeration constants rather
 * than `const int` objects because C requires an integer constant expression
 * for the size of a file-scope array; a `const int` variable is not one.
 */
enum { width = 640, height = 480 };

/* Output size as {width, height}, in the layout fourtimesfouroversampling reads. */
int stagesize[2] = {width, height};

/* Downscaled output image, one packed pixel per element. */
uint32_t pixels[width*height];

/* Oversampled source image: four times the output size in each dimension. */
uint32_t oversampled[width*4*height*4];
int main(){
for(int i=0; i<width*4*height*4; i++) oversampled[i]=0;
srand(time(NULL));
for(int i=0; i<60; i++){
for(int j=0; j<4096; j++){
oversampled[rand()%(height*4)*(width*4)+rand()%(width*4)] = rand();
}
fourtimesfouroversampling(stagesize, pixels, width, oversampled, width*4);
}
return 0;
}
The run takes about 3.731 seconds on average when compiled with -O3. Because the code cannot downscale 60 frames within one second, it cannot sustain 60 fps rendering, and a 60 fps program using this renderer would not run at full speed. What should be done to make the four-times-four oversampling sustain 60 fps?