I am trying to re-structure my code for MJPEG decoding so that I can implement it in a heterogeneous fashion. I started by finding code segments which I thought could benefit from parallel processing. I got the single threaded CPU code from this link.
The following code works properly. Please note that I omitted some portions of the code to represent it better here.
//Original MCU blocks
MCU_main AllMCUs[nb_MCU_X * nb_MCU_Y];
int32_t MCU[64];
//All RGB blocks
RGB_struct AllRGBs[nb_MCU_X * nb_MCU_Y];
//Counter for the main MCUs
uint32_t x;
x=0;
///////--------Unpack-blocks for 3 color components
//process 8x8 blocks horizontally
for (index_X = 0; index_X < nb_MCU_X; index_X++) {
//process 8x8 blocks vertically
for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
AllMCUs[x].block_id=x;
component_index = component_order[0];
AllMCUs[x].MCU= MCU;
unpack_block(movie, &scan_desc,0, AllMCUs[x].MCU);
for (int i=0; i < 64; i++)
{
AllMCUs[x].MCU_Y[i]= AllMCUs[x].MCU[i];
}
unpack_block(movie, &scan_desc,1, AllMCUs[x].MCU);
for (int i=0; i < 64; i++)
{
AllMCUs[x].MCU_Cb[i]= AllMCUs[x].MCU[i];
}
unpack_block(movie, &scan_desc,2, AllMCUs[x].MCU);
for (int i=0; i < 64; i++)
{
AllMCUs[x].MCU_Cr[i]= AllMCUs[x].MCU[i];
}
x+= 1;
}
}
///////-----------IQZZ, IDCT, Upsampling and Color-conversion---------///////
uint32_t y=0;
x=0;
for (index_X = 0; index_X < nb_MCU_X; index_X++) {
for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
if(AllMCUs[x].block_id == y){
//iqzz, idct and upsampling for Y
iqzz_block();
idct();
upsampler();
//iqzz, idct and upsampling for Cb
iqzz_block();
idct();
upsampler();
//iqzz, idct and upsampling for Cr
iqzz_block();
idct();
upsampler();
AllRGBs[x].block_id= x;
//color-conversion
if(color && (SOF_section.n > 1)){
YCbCr_to_ARGB(YCbCr_MCU, &AllRGBs[x].RGB_MCU, max_ss_h, max_ss_v);
}
else{
to_NB(YCbCr_MCU, RGB_MCU, max_ss_h, max_ss_v);
}
screen_cpyrect(index_Y * MCU_sy * max_ss_h, index_X * MCU_sx * max_ss_v, MCU_sy * max_ss_h, MCU_sx * max_ss_v, &AllRGBs[x].RGB_MCU);
x+=1;
y+=1;
}
}
}
The correct output looks like this:
My structure for RGB_struct is as follows:
typedef struct{
uint32_t block_id;
uint32_t *RGB_MCU
}RGB_struct;
My structure for MCU_main (not immediately relevant though) is this:
typedef struct{
uint32_t block_id;
int32_t *MCU_original;
int32_t MCU_Cb[64], MCU_Cr[64], MCU_Y[64];
}MCU_main;
screen_cpyrect
copies every 8x8 block on the screen according to values of the offset. The code for it is as follows (same as the code from this link):
void screen_cpyrect(uint32_t x, uint32_t y, uint32_t w, uint32_t h, void *ptr)
{
void *dest_ptr;
void *src_ptr;
uint32_t line;
uint32_t w_internal = w, h_internal = h;
int w_length;
SDL_LockSurface(screen);
w_length = screen->pitch / (screen->format->BitsPerPixel / 8 );
#ifdef DEBUG
if ((y) > screen->h) {
printf("[%s] : block can't be copied, "
"not in the screen (too low)\n", __func__);
exit(1);
}
if ((x) > screen->w) {
printf("[%s] : block can't be copied, "
"not in the screen (right border)\n", __func__);
exit(1);
}
#endif
if ((x+w) > screen->w) {
w_internal = screen->w -x;
}
if ((y+h) > screen->h) {
h_internal = screen->h -y;
}
for(line = 0; line < h_internal ; line++)
{
// Positionning src and dest pointers
// _ src : must be placed at the beginning of line "line"
// _ dest : must be placed at the beginning
// of the corresponding block :
//(line offset + current line + position on the current line)
// We assume that RGB is 4 bytes
dest_ptr = (void*)((uint32_t *)(screen->pixels) +
((y+line)*w_length) + x);
src_ptr = (void*)((uint32_t *)ptr + ((line * w)));
memcpy(dest_ptr,src_ptr,w_internal * sizeof(uint32_t));
}
SDL_UnlockSurface(screen);
}
I want the function screen_cpyrect
to remain on the CPU. However, in my working code, it is connected to the rest of the JPEG functions. In order to make my code more parallel, I sought to separate it from the other JPEG functions.
The following code does NOT produce the correct output:
///////--------Unpack-blocks for 3 color components
//process 8x8 blocks horizontally
for (index_X = 0; index_X < nb_MCU_X; index_X++) {
//process 8x8 blocks vertically
for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
//For Y
unpack_block(movie, &scan_desc,0, AllMCUs[x].MCU);
//For Cb
unpack_block(movie, &scan_desc,1, AllMCUs[x].MCU);
//For Cr
unpack_block(movie, &scan_desc,2, AllMCUs[x].MCU);
x+= 1;
}
}
///////-----------IQZZ, IDCT, Upsampling and Color-conversion---------///////
uint32_t y=0;
x=0;
for (index_X = 0; index_X < nb_MCU_X; index_X++) {
for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
if(AllMCUs[x].block_id == y){
//iqzz, idct and upsampling for Y
iqzz_block();
idct();
upsampler();
//iqzz, idct and upsampling for Cb
iqzz_block();
idct();
upsampler();
//iqzz, idct and upsampling for Cr
iqzz_block();
idct();
upsampler();
AllRGBs[x].block_id= x;
//color-conversion
if(color && (SOF_section.n > 1)){
YCbCr_to_ARGB(YCbCr_MCU, &AllRGBs[x].RGB_MCU, max_ss_h, max_ss_v);
}
else{
to_NB(YCbCr_MCU, RGB_MCU, max_ss_h, max_ss_v);
}
x+=1;
y+=1;
}
}
}
//COPYING ALL THE PROCESSED BLOCKS TO THE SCREEN SEPARATELY
x=0;
for (index_X = 0; index_X < nb_MCU_X; index_X++) {
for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
screen_cpyrect(index_Y * MCU_sy * max_ss_h, index_X * MCU_sx * max_ss_v, MCU_sy * max_ss_h, MCU_sx * max_ss_v, &AllRGBs[x].RGB_MCU);
x+=1;
}
}
if (screen_refresh() == 1)
{
end_of_file = 1;
}
I get a completely distorted output. The frames are processed but they are not displayed correctly. The incorrect output looks like this:
The code for color conversion is as follows (copied from this link):
void YCbCr_to_ARGB(uint8_t *YCbCr_MCU[3], uint32_t *RGB_MCU, uint32_t nb_MCU_H, uint32_t nb_MCU_V)
{
uint8_t *MCU_Y, *MCU_Cb, *MCU_Cr;
int R, G, B;
uint32_t ARGB;
uint8_t index, i, j;
MCU_Y = YCbCr_MCU[0];
MCU_Cb = YCbCr_MCU[1];
MCU_Cr = YCbCr_MCU[2];
for (i = 0; i < 8 * nb_MCU_V; i++) {
for (j = 0; j < 8 * nb_MCU_H; j++) {
index = i * (8 * nb_MCU_H) + j;
R = (MCU_Cr[index] - 128) * 1.402f + MCU_Y[index];
B = (MCU_Cb[index] - 128) * 1.7772f + MCU_Y[index];
G = MCU_Y[index] - (MCU_Cb[index] - 128) * 0.34414f -
(MCU_Cr[index] - 128) * 0.71414f;
/* Saturate */
if (R > 255)
R = 255;
if (R < 0)
R = 0;
if (G > 255)
G = 255;
if (G < 0)
G = 0;
if (B > 255)
B = 255;
if (B < 0)
B = 0;
ARGB = ((R & 0xFF) << 16) | ((G & 0xFF) << 8) | (B & 0xFF);
// ARGB = 0xFF << 8;
RGB_MCU[(i * (8 * nb_MCU_H) + j)] = ARGB;
}
}
}
I do not understand why my modified code for decoding frames should not work correctly. I tried to debug it in various ways and I could find no glaring problems in the alternate implementation. All the values of the AllRGBs
array are being stored properly. I printed out the values and they are identical.
RGB before cpy_rect:0x7fff8f868bf8 RGB before cpy_rect:0x7fff8f868c08 RGB before cpy_rect:0x7fff8f868c18 RGB before cpy_rect:0x7fff8f868c28.......
RGB after cpy_rect:0x7fff8f868bf8 RGB after cpy_rect:0x7fff8f868c08 RGB after cpy_rect:0x7fff8f868c18 RGB after cpy_rect:0x7fff8f868c28
Do you have any thoughts on why one version of the code is giving the correct output and not the other?
EDIT:
In order to test my code, I wrote the code to display each 8x8 block as soon as it is copied to the screen. I noticed that in my correct code, the blocks are copied correctly. However, in my incorrect code the 8x8 blocks are not displayed correctly (as you can see in the photos I attached).
Why should this happen even though the RGB values in both my programs are the same? How is it that when I copy all the RGB blocks on the screen in one go I get the incorrect output?
EDIT:
After testing the code in every way I can think of, I am starting to think this issue is arising because of the void pointer.
One of the input parameters of screen_cpyrect
is a void pointer:
extern void screen_cpyrect(uint32_t x, uint32_t y, uint32_t w, uint32_t h, void *ptr)
RGB_MCU
is being used as a function argument in the place of ptr
. However, ptr
is a pointer to everything whereas RGB_MCU
is a pointer to the type uint32_t
as shown in this declaration:
typedef struct{
uint32_t block_id;
uint32_t *RGB_MCU
}RGB_struct;
I read in the comment for this answer that when a void pointer is used incorrectly it can cause the program to give the incorrect output or simply crash.
How can I confirm that the problem I am experiencing is due to the void pointer?