15

I have some timing code which I'm using to measure the runtime of a given snippet of code:

struct time_data {
    // Steady-clock timestamps bracketing a measured code region.
    std::chrono::steady_clock::time_point start, end;

    // Elapsed time between start and end, in steady_clock ticks.
    auto get_duration() const {
        return end - start;
    }

    // Writes the formatted report to the given stream.
    void print_data(std::ostream & out) const {
        out << str();
    }

    // Renders start/end/duration as a locale-formatted (digit-grouped)
    // nanosecond report, right-aligned in a 24-character field.
    std::string str() const {
        static const std::locale loc{ "" };  // user's default locale, for thousands separators
        std::stringstream ss;
        ss.imbue(loc);
        ss << "Start:    " << std::setw(24) << std::chrono::nanoseconds{ start.time_since_epoch() }.count() << "ns\n";
        ss << "End:      " << std::setw(24) << std::chrono::nanoseconds{ end.time_since_epoch() }.count() << "ns\n";
        ss << "Duration: " << std::setw(24) << std::chrono::nanoseconds{ get_duration() }.count() << "ns\n";
        return ss.str();
    }

    // Fix: `static friend` is ill-formed — a friend declaration may not carry
    // a storage-class specifier ([class.friend]); the function is a namespace-
    // scope free function either way, so plain `friend` is the correct form.
    friend std::ostream & operator<<(std::ostream & out, const time_data & data) {
        return out << data.str();
    }
};

// Runs func() exactly once and returns the steady-clock timestamps taken
// immediately before and after the call, packaged as a time_data.
template<typename T>
time_data time_function(T && func) {
    const auto before = std::chrono::steady_clock::now();
    func();
    const auto after = std::chrono::steady_clock::now();
    return time_data{ before, after };
}

Then, to make use of it, I have the following code designed to perform an accumulation on a set of data, but instead of simply adding the numbers together, it first finds the Total Stopping Time from applying the Collatz Conjecture and accumulates that instead.

// Accumulator for std::accumulate-style folds: instead of adding b itself,
// adds the total stopping time of the Collatz sequence starting at b
// (number of halve / 3n+1 steps until the value reaches 1) to the running
// total a. Values of b <= 1 contribute zero steps.
template<typename T>
T accumulation_function(T a, T b) {
    T steps = 0;
    for (T n = b; n > 1; ++steps) {
        n = (n % 2 == 0) ? n / 2 : 3 * n + 1;
    }
    return a + steps;
}

template<typename IT>
auto std_sum(IT begin, IT end) {
    auto sum = (*begin - *begin);
    sum = std::accumulate(begin, end, sum, accumulation_function<decltype(sum)>);
    return sum;
}

// Hand-rolled single-threaded equivalent of std_sum: folds [begin, end)
// with accumulation_function.
// Fix: the original seeded the sum with (*begin - *begin), which dereferences
// begin even for an empty range (undefined behavior); a value-initialized
// accumulator of the iterator's value type is the safe equivalent.
template<typename IT>
auto single_thread_sum(IT begin, IT end) {
    using value_type = typename std::iterator_traits<IT>::value_type;
    value_type sum{};
    for (IT current = begin; current != end; ++current) {
        sum = accumulation_function(sum, *current);
    }
    return sum;
}

// Splits [begin, end) into N contiguous chunks, folds each chunk with
// accumulation_function on its own thread, then sums the per-thread results.
// Fixes:
//  - The original seeded with (*begin - *begin), dereferencing begin even for
//    an empty range (UB); a value-initialized value_type replaces it.
//  - Each worker accumulated through a reference into sums[i]; adjacent array
//    elements share cache lines, so concurrent writes caused false sharing.
//    Each worker now folds into a stack-local accumulator and writes sums[i]
//    exactly once, after its loop finishes.
template<typename IT, uint64_t N>
auto N_thread_smart_sum(IT begin, IT end) {
    using value_type = typename std::iterator_traits<IT>::value_type;
    std::array<value_type, N> sums{};        // value-init: every slot starts at zero
    const auto dist = std::distance(begin, end);
    std::vector<std::thread> threads;
    threads.reserve(N);
    for (uint64_t i = 0; i < N; i++) {
        threads.emplace_back([=, &sums] {
            // Chunk i covers [i*dist/N, (i+1)*dist/N); the division rounds so
            // the chunks exactly tile the range with no gaps or overlap.
            IT first = begin;
            IT last = begin;
            std::advance(first, i * dist / N);
            std::advance(last, (i + 1) * dist / N);
            value_type local{};              // thread-local accumulator: no false sharing
            for (; first != last; ++first)
                local = accumulation_function(local, *first);
            sums[i] = local;                 // single write per thread
        });
    }
    for (std::thread & thread : threads)
        thread.join();
    value_type total{};
    for (const auto & s : sums)
        total += s;
    return total;
}

// Convenience wrappers that pin the thread count N of N_thread_smart_sum at
// distinct, individually nameable functions so each variant can be timed and
// labeled separately in main().

template<typename IT>
auto two_thread_smart_sum(IT begin, IT end) {
    return N_thread_smart_sum<IT, 2>(begin, end);
}

template<typename IT>
auto four_thread_smart_sum(IT begin, IT end) {
    return N_thread_smart_sum<IT, 4>(begin, end);
}

template<typename IT>
auto eight_thread_smart_sum(IT begin, IT end) {
    return N_thread_smart_sum<IT, 8>(begin, end);
}

template<typename IT>
auto sixteen_thread_smart_sum(IT begin, IT end) {
    return N_thread_smart_sum<IT, 16>(begin, end);
}

template<typename IT>
auto thirty_two_thread_smart_sum(IT begin, IT end) {
    return N_thread_smart_sum<IT, 32>(begin, end);
}

template<typename IT>
auto sixty_four_thread_smart_sum(IT begin, IT end) {
    return N_thread_smart_sum<IT, 64>(begin, end);
}

And for reference sake: the boilerplate main code that runs it all:

// Benchmark driver: fills a 1-billion-element vector with 8 threads, then
// times the same Collatz-accumulation over it with each summation variant
// (std::accumulate, single-threaded, and 2/4/8/16/32/64 threads), printing
// the result and timing report after each run.
int main() {
    std::vector<int64_t> raw_data;
    auto fill_data = time_function([&raw_data] {
        constexpr uint64_t SIZE = 1'000'000'000ull;
        raw_data.resize(SIZE);
        std::vector<std::thread> threads;
        for (int i = 0; i < 8; i++) {
            threads.emplace_back([i, SIZE, &raw_data] {
                uint64_t begin = i * SIZE / 8;
                uint64_t end = (i + 1) * SIZE / 8;
                for (uint64_t index = begin; index < end; index++) {
                    // NOTE(review): `begin % (20 + i)` is loop-invariant, so
                    // every element in a chunk gets the same value; presumably
                    // `index % (20 + i)` was intended — confirm before relying
                    // on the data distribution.
                    raw_data[index] = begin % (20 + i);
                }
            });
        }
        for (std::thread & t : threads) 
            t.join();
    });

    // Assigned by each timed lambda before it is printed; never read uninitialized.
    int64_t sum;

    std::cout << std::setw(25) << "Fill Data" << std::endl;
    std::cout << fill_data << std::endl;

    auto std_data = time_function([&] {
        sum = std_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "STD Sum: " << sum << std::endl;
    std::cout << std_data << std::endl;

    auto single_data = time_function([&] {
        sum = single_thread_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "Single Sum: " << sum << std::endl;
    std::cout << single_data << std::endl;

    auto smart_2_data = time_function([&] {
        sum = two_thread_smart_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "Two-Thread-Smart Sum: " << sum << std::endl;
    std::cout << smart_2_data << std::endl;

    auto smart_4_data = time_function([&] {
        sum = four_thread_smart_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "Four-Thread-Smart Sum: " << sum << std::endl;
    std::cout << smart_4_data << std::endl;

    auto smart_8_data = time_function([&] {
        sum = eight_thread_smart_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "Eight-Thread-Smart Sum: " << sum << std::endl;
    std::cout << smart_8_data << std::endl;

    auto smart_16_data = time_function([&] {
        sum = sixteen_thread_smart_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "Sixteen-Thread-Smart Sum: " << sum << std::endl;
    std::cout << smart_16_data << std::endl;

    auto smart_32_data = time_function([&] {
        sum = thirty_two_thread_smart_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "Thirty-Two-Thread-Smart Sum: " << sum << std::endl;
    std::cout << smart_32_data << std::endl;

    auto smart_64_data = time_function([&] {
        sum = sixty_four_thread_smart_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "Sixty-Four-Thread-Smart Sum: " << sum << std::endl;
    std::cout << smart_64_data << std::endl;
    return 0;
}

This is the output of the program:

                Fill Data
Start:          16,295,979,890,252ns
End:            16,300,523,805,484ns
Duration:            4,543,915,232ns

                STD Sum: 7750000000
Start:          16,300,550,212,791ns
End:            16,313,216,607,890ns
Duration:           12,666,395,099ns

             Single Sum: 7750000000
Start:          16,313,216,694,522ns
End:            16,325,774,379,684ns
Duration:           12,557,685,162ns

   Two-Thread-Smart Sum: 7750000000
Start:          16,325,774,466,014ns
End:            16,334,441,032,868ns
Duration:            8,666,566,854ns

  Four-Thread-Smart Sum: 7750000000
Start:          16,334,441,137,913ns
End:            16,342,188,642,721ns
Duration:            7,747,504,808ns

 Eight-Thread-Smart Sum: 7750000000
Start:          16,342,188,770,706ns
End:            16,347,850,908,471ns
Duration:            5,662,137,765ns

Sixteen-Thread-Smart Sum: 7750000000
Start:          16,347,850,961,597ns
End:            16,352,187,838,584ns
Duration:            4,336,876,987ns

Thirty-Two-Thread-Smart Sum: 7750000000
Start:          16,352,187,891,710ns
End:            16,356,111,411,220ns
Duration:            3,923,519,510ns

Sixty-Four-Thread-Smart Sum: 7750000000
Start:          16,356,111,471,288ns
End:            16,359,988,028,812ns
Duration:            3,876,557,524ns

The first few results aren't surprising: my self-written accumulate code works about the same as the std::accumulate function (I've seen both emerge as the "fastest" on consecutive runs, implying their implementation is probably similar). And when I move to Two threads, and Four threads, the code gets faster. This makes sense, because I'm using an Intel 4-core processor.

But the results beyond that point are confusing. My CPU only has 4 cores (8 if you consider the hyperthreading), but even if I excuse that the Hyperthreading is giving marginal performance increases at 8 threads, bumping the number up to 16, 32, and 64 threads all yield additional performance gains. Why is this? How are the additional threads yielding additional performance gains, when I've long since maxed out the number of threads the CPU can physically concurrently run?

Note: this is not the same as this linked question, as I'm dealing with a specific use case and code, whereas the linked question is dealing in generalities.

EDIT:

I made an adjustment to the code so that when threads are created, their priority is set to the highest priority Windows allows. Results, broadly speaking, seem to be the same.

                Fill Data
Start:          18,950,798,175,057ns
End:            18,955,085,850,007ns
Duration:            4,287,674,950ns

                STD Sum: 7750000000
Start:          18,955,086,975,013ns
End:            18,967,581,061,562ns
Duration:           12,494,086,549ns

             Single Sum: 7750000000
Start:          18,967,581,136,724ns
End:            18,980,127,355,698ns
Duration:           12,546,218,974ns

   Two-Thread-Smart Sum: 7750000000
Start:          18,980,127,426,332ns
End:            18,988,619,042,214ns
Duration:            8,491,615,882ns

  Four-Thread-Smart Sum: 7750000000
Start:          18,988,619,135,487ns
End:            18,996,215,684,824ns
Duration:            7,596,549,337ns

 Eight-Thread-Smart Sum: 7750000000
Start:          18,996,215,763,004ns
End:            19,002,055,977,730ns
Duration:            5,840,214,726ns

Sixteen-Thread-Smart Sum: 7750000000
Start:          19,002,056,055,608ns
End:            19,006,282,772,254ns
Duration:            4,226,716,646ns

Thirty-Two-Thread-Smart Sum: 7750000000
Start:          19,006,282,840,774ns
End:            19,010,539,676,914ns
Duration:            4,256,836,140ns

Sixty-Four-Thread-Smart Sum: 7750000000
Start:          19,010,539,758,113ns
End:            19,014,450,311,829ns
Duration:            3,910,553,716ns

DOUBLE EDIT COMBO:

I edited the program to make sure the tsum variable was a local variable inside the lambda, rather than a reference to outside the lambda. The Multithreaded code sped up quite significantly, but it's still exhibiting the same behavior.

                Fill Data
Start:          21,740,931,802,656ns
End:            21,745,429,501,480ns
Duration:            4,497,698,824ns

                STD Sum: 7750000000
Start:          21,745,430,637,655ns
End:            21,758,206,612,632ns
Duration:           12,775,974,977ns

             Single Sum: 7750000000
Start:          21,758,206,681,153ns
End:            21,771,260,233,797ns
Duration:           13,053,552,644ns

   Two-Thread-Smart Sum: 7750000000
Start:          21,771,260,287,828ns
End:            21,777,845,764,595ns
Duration:            6,585,476,767ns

  Four-Thread-Smart Sum: 7750000000
Start:          21,777,845,831,002ns
End:            21,784,011,543,159ns
Duration:            6,165,712,157ns

 Eight-Thread-Smart Sum: 7750000000
Start:          21,784,011,628,584ns
End:            21,788,846,061,014ns
Duration:            4,834,432,430ns

Sixteen-Thread-Smart Sum: 7750000000
Start:          21,788,846,127,422ns
End:            21,791,921,652,246ns
Duration:            3,075,524,824ns

Thirty-Two-Thread-Smart Sum: 7750000000
Start:          21,791,921,747,330ns
End:            21,794,980,832,033ns
Duration:            3,059,084,703ns

Sixty-Four-Thread-Smart Sum: 7750000000
Start:          21,794,980,901,761ns
End:            21,797,937,401,426ns
Duration:            2,956,499,665ns

TRIPLE EDIT WOMBO-COMBO:

I ran the same tests again, but in reverse. Results are quite similar:

                Fill Data
Start:          22,845,472,001,475ns
End:            22,850,746,219,215ns
Duration:            5,274,217,740ns

Sixty-Four-Thread-Smart Sum: 7750000000
Start:          22,850,747,328,223ns
End:            22,853,951,903,952ns
Duration:            3,204,575,729ns

Thirty-Two-Thread-Smart Sum: 7750000000
Start:          22,853,951,981,830ns
End:            22,857,239,237,507ns
Duration:            3,287,255,677ns

Sixteen-Thread-Smart Sum: 7750000000
Start:          22,857,239,307,839ns
End:            22,860,389,285,305ns
Duration:            3,149,977,466ns

 Eight-Thread-Smart Sum: 7750000000
Start:          22,860,389,353,524ns
End:            22,864,828,560,484ns
Duration:            4,439,206,960ns

  Four-Thread-Smart Sum: 7750000000
Start:          22,864,828,633,834ns
End:            22,870,640,982,096ns
Duration:            5,812,348,262ns

   Two-Thread-Smart Sum: 7750000000
Start:          22,870,641,056,956ns
End:            22,877,032,284,384ns
Duration:            6,391,227,428ns

             Single Sum: 7750000000
Start:          22,877,032,367,997ns
End:            22,889,501,695,960ns
Duration:           12,469,327,963ns

                STD Sum: 7750000000
Start:          22,889,501,770,820ns
End:            22,902,014,633,229ns
Duration:           12,512,862,409ns
Community
  • 1
  • 1
Xirema
  • 19,889
  • 4
  • 32
  • 68
  • It certainly has to do with how memory is allocated, and how well filled the memory cache pages are. I did a multi-thread program and I also noticed an important boost of performance when the chunks of memory are smaller. Reading memory from different pages cost a bit of CPU power if I recall. – lucas92 Oct 13 '16 at 19:07
  • because even a single core can handle multithreads in parallel to gain performance .. check http://stackoverflow.com/questions/24513714/how-does-a-single-cpu-handle-multi-threaded-and-multi-process-applications – HazemGomaa Oct 13 '16 at 19:13
  • Some good points made here, as well. http://stackoverflow.com/questions/3126154/multithreading-what-is-the-point-of-more-threads-than-cores – davepmiller Oct 13 '16 at 19:15
  • It is possible that threads block on some operations like fetching data from DRAM or updating TLB. This allows the OS to schedule a different thread to use a CPU core. – ilya1725 Oct 13 '16 at 19:18
  • 1
    A possible explanation: creating more processes makes it more likely the scheduler will give time to them and not side/background processes. By creating excessively more, you're milking the system for more cycles for your purposes. You might want to see the effect of increasing process priority or running on a minimalist system. –  Oct 13 '16 at 19:22
  • @H.G: The question already states that they are well beyond the hyperthreading core count, or at least believe so, so that shouldn't be a factor here. – GManNickG Oct 13 '16 at 19:24
  • @GManNickG I'm not sure what you mean .. I'm saying that one core can still handle more than 2 threads, regardless of the hyperthreading ... – HazemGomaa Oct 13 '16 at 19:34
  • @H.G: Right, but it does so by preempting whatever was running there. Cores can't run arbitrary amounts of work, at some point to run one thing you have to stop running something else. This is how a single-core machine gives the illusion of running thousands of threads, they each take turns on the core. Overall latency is not reduced. – GManNickG Oct 13 '16 at 19:36
  • @GManNickG right ! but how about the core idle time ? I think the overall latency may be reduced if there is any thread latency dealing with the memory for example ... http://stackoverflow.com/questions/3126154/multithreading-what-is-the-point-of-more-threads-than-cores – HazemGomaa Oct 13 '16 at 19:44
  • Stupid question, but have you swapped the test cases once to see if the order makes a difference to the results ? – HopefullyHelpful Oct 13 '16 at 20:25
  • @HopefullyHelpful Not a stupid question! I'll give it a shot & see what happens. – Xirema Oct 13 '16 at 20:27
  • Possible duplicate of [Optimal number of threads per core](http://stackoverflow.com/questions/1718465/optimal-number-of-threads-per-core) – Random Davis Oct 13 '16 at 20:39
  • Your array is 8GB, right? How much physical memory do you have? If you don't have significantly more than 8GB, then you are seeing the effects of some threads being blocked on pagefile I/O and the "extra" threads not being blocked on pagefile I/O, taking advantage of the waits. – doug65536 Oct 13 '16 at 20:55
  • @doug65536 The computer I'm testing on has 16gb of memory. – Xirema Oct 13 '16 at 20:56
  • Is this on Windows? If so, how many "Hard faults/sec" do you see in Resource Monitor, in the Memory pane? – doug65536 Oct 13 '16 at 20:58
  • @doug65536 None for the duration of the program's runtime. – Xirema Oct 13 '16 at 21:03
  • Does the reported duration match what you see on the wall clock? – Mark Ransom Oct 13 '16 at 22:51
  • @MarkRansom Yes. It's worth noting that my timing mechanism is taking timestamps from before the threads are created and after they are joined. So it's quite unlikely that there's any weird shenanigans with the timing only picking up one thread, or such similar behavior. – Xirema Oct 14 '16 at 14:42
  • Assuming there is still an interrogation on the subject, comparing performance counters (by running under `perf` with the relevant configuration on linux) would probably help investigate the issue. – Masklinn Mar 02 '22 at 11:53

2 Answers2

3

All of your tsum writes are to the same array, which will occupy consecutive addresses in memory. This will cause cache contention issues with all those writes. Having more threads spreads those writes out to different cache lines, so the CPU cores spend less time invalidating and reloading cache lines.

Try accumulating to a local tsum variable (not a reference into sums), then writing that to sums[i] when the loop is done.

1201ProgramAlarm
  • 32,384
  • 7
  • 42
  • 56
  • 1
    Your advice caused all the multithreaded code to speed up significantly, but it's still exhibiting the same behavior where 16, 32, and 64 threads all outperform the 2, 4, and 8 threads by significant margins. – Xirema Oct 13 '16 at 20:19
3

I suspect that you are seeing the speed up because the threads for your application represent a larger percentage of the total active threads running on the system, giving you a more slices of time overall. You might want to see if the numbers change for the better or worse when you have additional programs running.

Mikel F
  • 3,567
  • 1
  • 21
  • 33
  • If I have lots of other programs running, all the versions of the code slow down by a small amount, without changing the behavior of 'Threads >>> Cores' being faster. – Xirema Oct 14 '16 at 14:41