Multithreading have NO improvement on speed - Using pthread in C - Why?

Question

in order to become more confortable with multithreading in the i have programmed a little c programm with "intensive" calculation. It is a picture of the mandelbrot set where each pixel is calculated seperately, then pixels buffered to rows. Each thread is getting an equal share of the total number of rows. thus for example having a picture calculated with a height of 1000 rows should end up in two 500 row packages if the number of threads chosen is two. Thus i have suggested that the speed kind of reduces by the factor of two, but no improvement. WHY??? I dont get it, because everything works and seems logical. If anybody can give me a hint, I would be very grateful. Below you see the main and a function for caluclation of the mandelbrot set called by main.

int main(int argc, char ** argv, char ** envp) {

if(argc != 4)
{
printf("Bitte genau 3 Argumente eingeben.\n");
 return 1;
}
//Structs und Variablen für die Stopuhr
struct timeval start, ende;
long ttlende, ttlstart;

width  = str2num(argv[1]);
height = str2num(argv[2]);

int y;
//char blueGreenRed[3];
//Ist Buffer für ganze Zeile: Breite * 3 wegen den 3 Bytes pro Pixel
//char zeile[width*3];

unsigned char info[BMPHEADER_SIZE] = {
              //size
    'B','M',  0,0,0,0, 0,0, 0,0, 54,0,0,0,
              //width  //height
    40,0,0,0, 0,0,0,0, 0,0,0,0,  1,0, 24,0,
              // datasize
    0,0,0,0,  0,0,0,0
};

// BMP lines must be of lengths divisible by 4
char span[4] = "\0\0\0\0";
int spanBytes = 4 - ((width * 3) % 4);
if (spanBytes == 4) spanBytes = 0;
int psize = ((width * 3) + spanBytes) * height;

*( (int*) &info[2])  = BMPHEADER_SIZE + psize;
*( (int*) &info[18]) = width;
*( (int*) &info[22]) = height;
*( (int*) &info[34]) = psize;

write(1, (char *) info, BMPHEADER_SIZE);
//Stoppuhr starten, d.h. get time stamp

//create chunks
int threads= str2num(argv[3]);
int i;
int reminder = height%threads;
int blocksize = height/threads;
int rounds = height/blocksize;
int begin = 1;


//init structs
threadinfo *tinfoptr = getptr(rounds);
//threadinfo tinfo = *tinfoptr;
for (i=1; i<=rounds; ++i){
        int res = blocksize*i;
        if((i==rounds)){
                res = res+reminder;
        }

        //update parameters of tinfo
        (*(tinfoptr+(i-1))).from = begin;
        (*(tinfoptr+(i-1))).to = res;
        (*(tinfoptr+(i-1))).span = span;
        (*(tinfoptr+(i-1))).spanBytes = spanBytes;
        (*(tinfoptr+(i-1))).width = width;
        (*(tinfoptr+(i-1))).height = res-begin+1;
        (*(tinfoptr+(i-1))).results = NULL;
        (*(tinfoptr+(i-1))).threadno = i;
        (*(tinfoptr+(i-1))).blocksizeperthread = -1;
        //altes ende ist neuer start des nächsten blocks.
        begin = res;
}

fprintf(stderr,"inti abgeschlossen, starte threads\n");

pthread_t myThread[rounds];
for (i=1; i<=rounds; ++i){
    fprintf(stderr,"Rufe Thread %d auf\n",i);
    if (pthread_create(&myThread[i-1], NULL, myDo2, (void*)(tinfoptr+.   (i-1))) ) {
        fprintf(stderr, "Error creating thread\n");
        return 1;
    }
}

gettimeofday(&start, NULL);
for (i=1; i<=rounds; ++i){
    /* wait for the second thread to finish */
    if (pthread_join(myThread[i-1], NULL)) {
        fprintf(stderr, "Error joining thread\n");
        return 2;
    }
}
//Stoppuhr beenden, d.h. get time stamp, NULL per Doku.
gettimeofday(&ende,NULL);

    //if the main thread arrives this position, restulptr containts all rows indexed by the threadnr.
    for (i=1; i<=rounds; i++){
        //noch countereinbauen
        int l_blocksize = (tinfoptr+(i-1))->blocksizeperthread;
        for (y=0; y <= l_blocksize; y++) {
            //Zeilenweise nach stdout schreiben
            write(1, (tinfoptr+(i-1))->results[y], width*3);
            // BMP lines must be of lengths divisible by 4
            write(1, span, spanBytes);
        }
    }


ttlende = ende.tv_sec * 1000000 + ende.tv_usec;
ttlstart = start.tv_sec * 1000000 + start.tv_usec;
fprintf(stderr, "\nDauer: %ld Mikrosekunden\n", (ttlende - ttlstart));

return 0;
}

And here the function called:

void* myDo2(void* tiptr){
threadinfo* mythread = (threadinfo*)tiptr;
//copy infos from struct to this thread
int l_from = mythread->from;
int l_to = mythread->to;
int l_width = mythread->width;
int l_height = mythread->height;
//  char **container = createMatrix(l_width*3,l_height);
char **container = malloc (l_height * sizeof(char*));
for(int i = 0; i<l_height; i++){
    container[i] = malloc(l_width*3*sizeof(char));
}

int x,y;
char iterate=0;
Complex c    = {0,0};
Complex newz = {0,0};
float imageRelation = (float)l_width/(float)height;
char blueGreenRed[3];
    //Ist Buffer für ganze Zeile: Breite * 3 wegen den 3 Bytes pro Pixel
    char zeile[l_width*3];
    int counter = 0;

for (y=l_from; y <= l_to; ++y)
{
    for (x=1; x <= l_width; ++x) {
        Complex z = {0,0};
        float quad=0;

        c.re = zoom * (-1.0 + imageRelation * ( (x-1.0) / (width-1.0)) );
        c.im = zoom * ( 0.5 - (y-1.0) / (height-1.0) );

        // iterate
        for ( iterate=1; iterate < colorLimit && quad < quadLimit; ++iterate ) {
            quad = z.re * z.re + z.im * z.im;

            newz.re = (z.re * z.re) - (z.im * z.im) + c.re;
            newz.im =  z.re * z.im * 2.0            + c.im;

            z = newz;
        }
        toRGB(iterate, blueGreenRed);
        //Kopiere 3 Bytes von bgr nach zeile + (x-1)*3
        //Beachte: Die Variable zeile ist ein character array daher wird (x-1)*3 benutzt um 3 Byte Pakete pro Pixel in die Zeile zu laden.
        memcpy((zeile + (x-1)*3), blueGreenRed, 3);
    }
    memcpy(container[counter], zeile, l_width*3);
    counter++;
}

mythread->blocksizeperthread = counter-1;
mythread->results = container;
        fprintf(stderr, "Ich bin Thread-Nr. %d\n", mythread->threadno);
        fprintf(stderr, "und habe eine Menge Zeilen von %d\n", mythread->blocksizeperthread);
        fprintf(stderr, "und habe berechnet von %d\n", l_from);
        fprintf(stderr, "und habe berechnet bis %d\n", l_to);
return NULL;
}

Thank you very much, yours jbug

"Multithreading have NO improvement on speed - Using pthread in C - Why?" - Why **SHOULD IT**? Can you provide a reference to some guarantee multithreading increases speed? And how do you define "speed"? In terms of CPU cycles, it **always** takes more than single threaded programming. — too honest for this site, May 16 '17 at 17:37
You may be thinking of parallel processing, i.e. spitting processes off to separate cores in a multi-core processor? This will have a measurable increase in efficiency if done correctly, and on an algorithm requiring substantially heavy processing. If so, _[Here is a question and discussion](http://stackoverflow.com/q/19324306/645128)_ about that. In a nutshell, threads share the same time/processing power as controlled by the OS running your application. While splitting processes off to another core allows those processes to truly run in parallel, resulting in gained efficiencies. — ryyker, May 16 '17 at 18:23
One possibility is that you don't have the hardware to support real concurrency. Check the number of cores and the architecture of your OS. — Yang Yang, May 16 '17 at 18:31
@ThingyWotsit - Please expand. I am not sure what _actually, no_ means. — ryyker, May 16 '17 at 18:42

score 8 · Accepted Answer · answered May 16 '17 at 19:14

The answer in short is that the model works but you need to give each thread enough work to do to make it worthwhile to absorb the overhead of starting, stopping, and synchronizing the threads. And you must run on a computer capable of having multiple simultaneous threads running (multi-core machine).

I took the application you provided and modified it to actually compile. If I run this on a linux machine that has many CPU cores available and give the myDo2 work thread enough work to do then I see results similar to the following:

./test width height num_threads
./test 10000 10000 1
Dauer: 17,660,185 Mikrosekunden

./test 10000 10000 2 
Dauer: 7,864,508 Mikrosekunden

./test 10000 10000 8
Dauer: 1,100,126 Mikrosekunden

This means with 8 threads the overall wall clock time has reduced from 17.6 seconds to 1.1 seconds which is an improvement greater than 8 times (probably due to better memory and cache usage).

Yet if I give each thread too little work, then my times don't seem to be improving and actually get worse at some point.

./test 10 10 1
Dauer: 70 Mikrosekunden

./test 10 10 2
Dauer: 60 Mikrosekunden

./test 10 10 4 
Dauer: 205 Mikrosekunden

Here you see that the overhead of starting a thread, then stopping and synchronizing with that thread is greater than the amount of work being done inside of the thread.

So the programming model works but you need to utilize it correctly.

I compiled the code below on RedHat using

gcc -std=gnu99 test.c -o test -l pthread

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <pthread.h>
#include <string.h>

typedef struct _threadinfo
{
    int from;
    int to;
    int width;
    int height;
    int blocksizeperthread;
    char **results;
    int threadno;
} threadinfo;

typedef struct _cplx
{
    float re;
    float im;
} Complex;

void* myDo2( void *tiptr )
{
    threadinfo *mythread = (threadinfo *)tiptr;
    //copy infos from struct to this thread
    int l_from = mythread->from;
    int l_to = mythread->to;
    int l_width = mythread->width;
    int l_height = mythread->height;
    char **container = malloc(l_height * sizeof(char *));
    for (int i = 0; i < l_height; i++)
    {
        container[i] = malloc(l_width * 3 * sizeof(char));
    }

    int x, y;
    char iterate = 0;
    Complex c    = { 0, 0 };
    Complex newz = { 0, 0 };
    float imageRelation = (float)l_width / (float)l_height;
    char blueGreenRed[3];
    //Ist Buffer für ganze Zeile: Breite * 3 wegen den 3 Bytes pro Pixel
    char zeile[l_width * 3];                            //1000*3
    int counter = 0;
    float zoom = 1.0;
    float colorLimit = 10.0;
    float quadLimit = 10.0;

    for (y = l_from; y <= l_to; ++y)                    //1..500
    {
        for (x = 1; x <= l_width; ++x)                  //1..1000
        {
            Complex z = { 0, 0 };
            float quad = 0;

            c.re = zoom * (-1.0 + imageRelation * ((x - 1.0) / (l_width - 1.0)));
            c.im = zoom * (0.5 - (y - 1.0) / (l_height - 1.0));

            // iterate
            for (iterate = 1; iterate < colorLimit && quad < quadLimit; ++iterate)
            {
                quad = z.re * z.re + z.im * z.im;

                newz.re = (z.re * z.re) - (z.im * z.im) + c.re;
                newz.im =  z.re * z.im * 2.0            + c.im;

                z = newz;
            }
            //toRGB(iterate, blueGreenRed);
            //Kopiere 3 Bytes von bgr nach zeile + (x-1)*3
            //Beachte: Die Variable zeile ist ein character array daher wird
            //(x-1)*3 benutzt um 3 Byte Pakete pro Pixel in die Zeile zu laden.
            memcpy((zeile + (x - 1) * 3), blueGreenRed, 3);
        }
        memcpy(container[counter], zeile, l_width * 3);
        counter++;
    }

    mythread->blocksizeperthread = counter - 1;
    mythread->results = container;
    fprintf(stderr, "Ich bin Thread-Nr. %d\n", mythread->threadno);
    fprintf(stderr, "und habe eine Menge Zeilen von %d\n", mythread->blocksizeperthread);
    fprintf(stderr, "und habe berechnet von %d\n", l_from);
    fprintf(stderr, "und habe berechnet bis %d\n", l_to);
    return NULL;
}

int main(int argc, char **argv, char **envp)
{
    if (argc != 4)
    {
        printf("Bitte genau 3 Argumente eingeben.\n");
        return 1;
    }
//Structs und Variablen für die Stopuhr
    struct timeval start, ende;
    long ttlende, ttlstart;
    int width;
    int height;

    width  = atoi(argv[1]);
    height = atoi(argv[2]);

    int y;

// BMP lines must be of lengths divisible by 4
    char span[4] = "\0\0\0\0";
    int spanBytes = 4 - ((width * 3) % 4);
    if (spanBytes == 4) spanBytes = 0;
    int psize = ((width * 3) + spanBytes) * height;

//Stoppuhr starten, d.h. get time stamp

//create chunks
    int threads = atoi(argv[3]);
    int i;
    int reminder = height % threads;
    int blocksize = height / threads;
    int rounds = height / blocksize;
    int begin = 1;


//init structs
    threadinfo *tinfoptr = malloc( sizeof(threadinfo) * rounds );
//threadinfo tinfo = *tinfoptr;
    for (i = 1; i <= rounds; ++i)
    {
        //res = 500 * 1;
        //res = 500*2;
        int res = blocksize * i;
        if ((i == rounds))
        {
            res = res + reminder;
        }

        //update parameters of tinfo
        (*(tinfoptr + (i - 1))).from = begin;
        (*(tinfoptr + (i - 1))).to = res;
        (*(tinfoptr + (i - 1))).width = width;
        (*(tinfoptr + (i - 1))).height = res - begin + 1;
        (*(tinfoptr + (i - 1))).results = NULL;
        (*(tinfoptr + (i - 1))).threadno = i;
        (*(tinfoptr + (i - 1))).blocksizeperthread = -1;
        //altes ende ist neuer start des nächsten blocks.
        begin = res;
    }

    fprintf(stderr, "inti abgeschlossen, starte threads\n");

    pthread_t myThread[rounds];
    for (i = 1; i <= rounds; ++i)
    {
        fprintf(stderr, "Rufe Thread %d auf\n", i);
        if (pthread_create(&myThread[i - 1], NULL, myDo2,
                           (void *)(tinfoptr + (i - 1))))
        {
            fprintf(stderr, "Error creating thread\n");
            return 1;
        }
    }

    gettimeofday(&start, NULL);
    for (i = 1; i <= rounds; ++i)
    {
        /* wait for the second thread to finish */
        if (pthread_join(myThread[i - 1], NULL))
        {
            fprintf(stderr, "Error joining thread\n");
            return 2;
        }
    }
//Stoppuhr beenden, d.h. get time stamp, NULL per Doku.
    gettimeofday(&ende, NULL);

    ttlende = ende.tv_sec * 1000000 + ende.tv_usec;
    ttlstart = start.tv_sec * 1000000 + start.tv_usec;
    fprintf(stderr, "\nDauer: %ld Mikrosekunden\n", (ttlende - ttlstart));

    return 0;
}

thank you very much for your very good and competent anwser :). i have switched the system to a computer where i know for sure that it is a multicore system and it worked. prior to your answer i have tested it within a virtual linux instance. Since this virtual instance runs on a computer having a quadcore i thought that even since the virutal machine just have one core (as i found out with cat proc/stat) the threads will somehow magically translated such that real paralellism is possible. However, thank you very much. — J. Bug, May 17 '17 at 14:47

Multithreading have NO improvement on speed - Using pthread in C - Why?

1 Answers1