Learning multi-threading, don't understand why example results in only small (10%) performance improvement

Question

I am learning multi-threaded programming using C, pthreads, and Ubuntu Linux (20.04). In order to try out pthreads I created the following simple programs:

#include <locale.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <time.h>

// #define _REENTRANT macro not required if compile with -pthread flag on
// Linux (TLPI, Ch. 29)
// Note:  This macro causes the declarations of a few reentrant functions
//        to be exposed.

typedef long long int bignum;

// Function for "worker" threads:
static void *adder(void *arg) {
    // Cast void *arg to correct type:
    bignum num = (bignum)arg;
    bignum res = 0;
    pid_t tid = gettid();  // Linux thread ID - not necessarily same as
                           // pthread ID...
    long ptid = pthread_self();  // Pthread ID is opaque value and shouldn't
                                 // be used this way...
    clock_t start, end;

    // Time program:
    start = clock();

    // Need to use same TID throughout so using (illegal) POSIX TID:
    // printf("Worker thread %d (kernel TID) starting with %'lld...\n", tid, num);
    printf("Worker thread %ld (PTID) starting with %'lld...\n", ptid, num);
    // Work:
    for (bignum i = 0; i <= num; ++i) {
        res += i;
    }

    end = clock();
    double elapsed = ((double)(end - start))/CLOCKS_PER_SEC;
    // Note:  Pthread_self is an opaque value and shouldn't be used like this!
    printf("Worker thread %ld (PTID) ending - ran for %f seconds.\n", ptid,
           elapsed);

    // Cast return type to void *:
    return (void *)res;
}

int main(int argc, char *argv[]) {
    pthread_t tid;
    void *res;
    int status;
    clock_t start, end;

    if (argc != 2) {
        printf("Usage:  %s <Number>\n", argv[0]);

        exit(1);
    }
    // Should do error checking here:
    bignum num = atoll(argv[1]);

    // Use locale to format numbers nicely, e.g., 1000 = 1,000
    setlocale(LC_NUMERIC, "");

    // Time program:
    start = clock();

    printf("Main thread starting...\n");
    status = pthread_create(&tid, NULL, adder, (void *)num);
    if (status != 0) {
        // Get errno and display relevant info:
        fprintf(stderr, "Error:  pthread_create failed with...\n");
    }

    // Join thread(s) to recover resources:
    status = pthread_join(tid, &res);
    if (status != 0) {
        // Get errno and display relevant info:
        fprintf(stderr, "Error:  pthread_join failed with...\n");
    }

    // Cast return value to correct type:
    printf("Thread %ld returned %'lld\n", tid, (bignum)res);

    end = clock();
    double elapsed = ((double)(end - start))/CLOCKS_PER_SEC;
    printf("Main thread ending, program ran for %f seconds.\n", elapsed);

    // Note:  When main thread returns from main(), all threads terminated!
    return 0;
}

I also created this modified multi-threaded version:

#include <locale.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <time.h>

// #define _REENTRANT macro not required if compile with -pthread flag on
// Linux (TLPI, Ch. 29)
// Note:  This macro causes the declarations of a few reentrant functions
//        to be exposed.

typedef long long int bignum;
const int Children = 2;
struct numPart {
    bignum start;
    bignum end;
};

// Function for "worker" threads:
static void *adder(void *arg) {
    // Cast void *arg to correct type:
    // bignum num = (bignum)arg;
    struct numPart *num = (struct numPart*)arg;
    bignum res = 0;
    pid_t tid = gettid();  // Linux thread ID - not necessarily same as
                           // pthread ID...
    long ptid = pthread_self();  // Pthread ID is opaque value and shouldn't
                                 // be used this way...
    clock_t start, end;

    // Time program:
    start = clock();

    // Need to use same TID throughout so using (illegal) POSIX TID:
    // printf("Worker thread %d (kernel TID) starting with range %'lld-%'lld...\n",
    //        tid, num->start, num->end);
    printf("Worker thread %ld (PTID) starting with range %'lld-%'lld...\n",
           ptid, num->start, num->end);
    // Work:
    for (bignum i = num->start; i <= num->end; ++i) {
        res += i;
    }

    end = clock();
    double elapsed = ((double)(end - start))/CLOCKS_PER_SEC;
    // Note:  Pthread_self is an opaque value and shouldn't be used like this!
    printf("Worker thread %ld (PTID) ending - ran for %f seconds.\n",
           ptid, elapsed);

    // Cast return type to void *:
    return (void *)res;
}

int main(int argc, char *argv[]) {
    pthread_t tids[Children];
    // pthread_t tid;
    void *res;
    int status;
    clock_t start, end;

    if (argc != 2) {
        printf("Usage:  %s <Number>\n", argv[0]);

        exit(1);
    }
    // Should do error checking here:
    bignum num = atoll(argv[1]);
    // Although this seems to split the numbers so that the totals are
    // similar, from timing the threads it appears the second one takes
    // substantially longer so giving more numbers to the first thread:
    // bignum part_range = (bignum)(num * .705);
    bignum part_range = (bignum)(num * .50);
    bignum total = 0;

    // Use locale to format numbers nicely, e.g., 1000 = 1,000
    setlocale(LC_NUMERIC, "");

    // Time program:
    start = clock();

    printf("Main thread starting...\n");
    struct numPart part1 = {0, part_range - 1};
    struct numPart part2 = {part_range, num};

    status = pthread_create(&tids[0], NULL, adder, (void *)&part1);
    if (status != 0) {
        // Get errno and display relevant info:
        fprintf(stderr, "Error:  pthread_create failed with...\n");
    }

    status = pthread_create(&tids[1], NULL, adder, (void *)&part2);
    if (status != 0) {
        // Get errno and display relevant info:
        fprintf(stderr, "Error:  pthread_create failed with...\n");
    }

    // Join thread(s) to recover resources:
    for (int i = 0; i < Children; ++i) {
        status = pthread_join(tids[i], &res);
        if (status != 0) {
            // Get errno and display relevant info:
            fprintf(stderr, "Error:  pthread_join failed with...\n");
        }

        // Cast return value to correct type:
        printf("Thread %ld returned %'lld\n", tids[i], (bignum) res);
        total += (bignum) res;
    }

    printf("End result:  %'lld\n", total);

    end = clock();
    double elapsed = ((double)(end - start))/CLOCKS_PER_SEC;
    printf("Main thread ending, program ran for %f seconds.\n", elapsed);

    // Note:  When main thread returns from main(), all threads terminated!
    return 0;
}

When I run the programs, here are the results:

small@ubuntuvm:~/sdev385/lab4$ ./singlethread 4000000000
Main thread starting...
Worker thread 140596613773056 (PTID) starting with 4,000,000,000...
Worker thread 140596613773056 (PTID) ending - ran for 12.999637 seconds.
Thread 140596613773056 returned 8,000,000,002,000,000,000
Main thread ending, program ran for 12.999978 seconds.

small@ubuntuvm:~/sdev385/lab4$ ./multithread 4000000000
Main thread starting...
Worker thread 139697530042112 (PTID) starting with range 0-1,999,999,999...
Worker thread 139697521649408 (PTID) starting with range 2,000,000,000-4,000,000,000...
Worker thread 139697521649408 (PTID) ending - ran for 11.295490 seconds.
Worker thread 139697530042112 (PTID) ending - ran for 11.402623 seconds.
Thread 139697530042112 returned 1,999,999,999,000,000,000
Thread 139697521649408 returned 6,000,000,003,000,000,000
End result: 8,000,000,002,000,000,000
Main thread ending, program ran for 11.402860 seconds.

I used htop to monitor and it appears that singlethread maxes out one CPU and multithread maxes out both CPUs. My expectation was that with two processors and splitting the work fairly evenly that I would get a significant performance improvement (the multithreaded version would finish a lot faster). However, the improvement is quite modest - just over 10%. This leads me to think I must be doing something wrong or perhaps my example program is a bad approach?

I would welcome any feedback on why the performance improvement is so small.

Updates: I am running Ubuntu 20.04 Linux on Windows 10 on VMware Workstation Pro. My Windows 10 host has 4 cores with hyperthreading (so 8 logical cores). I believe VMware maps each Linux CPU to a logical core.

I also created a similar version of the above multithread program which uses 4 threads and got even worse results:

small@ubuntuvm:~/sdev385/lab4$ ./multithread4 4000000000
Main thread starting...
Worker thread 140578578196224 (PTID) starting with range 0-999,999,999...
Worker thread 140578569803520 (PTID) starting with range 1,000,000,000-1,999,999,999...
Worker thread 140578561410816 (PTID) starting with range 2,000,000,000-2,999,999,999...
Worker thread 140578553018112 (PTID) starting with range 3,000,000,000-4,000,000,000...
Worker thread 140578578196224 (PTID) ending - ran for 11.378322 seconds.
Thread 140578578196224 returned 499,999,999,500,000,000
Worker thread 140578569803520 (PTID) ending - ran for 11.627129 seconds.
Thread 140578569803520 returned 1,499,999,999,500,000,000
Worker thread 140578553018112 (PTID) ending - ran for 12.004422 seconds.
Worker thread 140578561410816 (PTID) ending - ran for 12.037362 seconds.
Thread 140578561410816 returned 2,499,999,999,500,000,000
Thread 140578553018112 returned 3,500,000,003,500,000,000
End result:  8,000,000,002,000,000,000
Main thread ending, program ran for 12.038055 seconds.

When I ran this four thread version, I confirmed on Linux that all four processors were maxed. I also check on the Windows host and VMware was taking a little under 50% of the CPU capacity. That's about right - 4 Linux CPUs mapped to 4 logical cores out of 8 means works out to up to half the CPU capacity. And yet, the performance was actually worse than the two thread version. I must be missing the bus, but I don't understand why the performance improvement is so little and gets worse with more threads.

Note: As my guide I am using Kerrisk, M. (2010, October). The Linux Programming Interface. No Starch Press. Available from https://learning.oreilly.com/library/view/the-linux-programming/9781593272203/

1. Are you sure it's running on 2 different cores and not only two different threads on the same core? (if hyperthreading is enabled) — Alexis, Sep 26 '20 at 02:20
@Alexis I believe you're correct - I think the hypervisor is mapping each CPU to a "hyper-thread" as opposed to a real core. However, I also ran a similar multi-threaded version which used 4 threads. Again - it's mapping to hyper-threads but still two different cores. This version actually performed even worse. Does that mean it's not hyper-threading? How do I rule it out? — James S., Sep 26 '20 at 02:22
@ggorlen Sorry - I pasted in the wrong version, the real singlethread program doesn't have that (the OMP pragma) - I just updated. — James S., Sep 26 '20 at 02:30
`clock()` doesn't seem good at measureing time in multi-threaded application. Try using another method like [`gettimeofday()`](https://man7.org/linux/man-pages/man2/gettimeofday.2.html). — MikeCAT, Sep 26 '20 at 02:33

Learning multi-threading, don't understand why example results in only small (10%) performance improvement

0 Answers0