
Hardware: Intel Skylake

This is based on "Understanding std::hardware_destructive_interference_size and std::hardware_constructive_interference_size".

I tried creating some benchmarks to see how I could best use this, but the results have been weird.
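For context, the usage pattern the destructive constant is meant for is roughly the following (just a sketch with made-up names, not part of the benchmark; std::hardware_destructive_interference_size needs C++17 and a standard library that actually defines it, otherwise you hard-code 64 or 128):

#include <atomic>
#include <cstdint>
#include <new>

struct padded_counters {
    // give each counter its own "destructive interference" region so that
    // writers running on different cores do not false-share
    alignas(std::hardware_destructive_interference_size) std::atomic<uint64_t> a;
    alignas(std::hardware_destructive_interference_size) std::atomic<uint64_t> b;
};

In the benchmark below, PARTITION plays the role of that alignment/padding value.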

Here is the benchmark code:

#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <x86intrin.h>

#define NLOOPS 5

#ifndef PARTITION_PER_CORE
    #define PARTITION_PER_CORE 1
#endif

#ifndef NTHREADS
    #define NTHREADS 8
#endif

#ifndef PARTITION
    #define PARTITION 64
#endif

#ifndef TEST_SIZE
    #define TEST_SIZE (1000000u)
#endif


#define XOR_ATOMIC(X, Y) __atomic_fetch_xor(X, Y, __ATOMIC_RELAXED)
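// XOR_ATOMIC above discards the fetch_xor result, so on x86 gcc can emit a plain
// `lock xor`. XOR_CAS_LOOP below materializes the old value into a volatile sink;
// since x86 has no instruction that xors and returns the old value atomically,
// gcc has to emit a `lock cmpxchg` retry loop instead (hence the name).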
#define XOR_CAS_LOOP(X, Y)                                                     \
    do {                                                                       \
        volatile uint64_t _sink = __atomic_fetch_xor(X, Y, __ATOMIC_RELAXED);  \
    } while (0)


#define ADD_ATOMIC(X, Y) __atomic_fetch_add(X, Y, __ATOMIC_RELAXED)
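// ADD_ATOMIC above discards the result, so gcc can emit `lock add`. XADD_ATOMIC
// below keeps the old value in a volatile sink, which forces `lock xadd`.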
#define XADD_ATOMIC(X, Y)                                                      \
    do {                                                                       \
        volatile uint64_t _sink = __atomic_fetch_add(X, Y, __ATOMIC_RELAXED);  \
    } while (0)


#define XOR(X, Y) *(X) ^= (Y)
#define ADD(X, Y) *(X) += (Y)

#ifndef OP1
    #define OP1(X, Y) ADD(X, Y)
#endif


#ifndef OP2
    #define OP2(X, Y) ADD(X, Y)
#endif


#define bench_flush_all_pending()    asm volatile("" : : : "memory");
#define bench_do_not_optimize_out(X) asm volatile("" : : "r,m"(X) : "memory")
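// Both are empty asm statements used as compiler barriers: the "memory" clobber
// stops the compiler from caching values in registers or reordering memory
// accesses across the statement, and the "r,m"(X) input keeps X (and stores
// through it) from being optimized away.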

uint64_t *        r;
pthread_barrier_t b;

uint64_t total_cycles;

void
init() {
#if PARTITION_PER_CORE
    assert(sysconf(_SC_NPROCESSORS_ONLN) * PARTITION <= sysconf(_SC_PAGE_SIZE));
#else
    assert(NTHREADS * PARTITION <= sysconf(_SC_PAGE_SIZE));
#endif

    r = (uint64_t *)mmap(NULL,
                         sysconf(_SC_PAGE_SIZE),
                         (PROT_READ | PROT_WRITE),
                         (MAP_ANONYMOUS | MAP_PRIVATE),
                         (-1),
                         0);
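    // mmap returns page-aligned memory, so r is 64- and 128-byte aligned and the
    // PARTITION-spaced destinations fall on aligned cache lines / line pairs.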

    assert(r != MAP_FAILED);


    assert(!pthread_barrier_init(&b, NULL, NTHREADS));

    total_cycles = 0;
}

void *
run1(void * targ) {

    uint64_t * t_region = (uint64_t *)targ;

    const uint64_t y = rand();

    // page in memory / get in cache
    *t_region = 0;

    pthread_barrier_wait(&b);

    uint64_t start_cycles = __rdtsc();
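    // note: __rdtsc() is not a serializing instruction and counts reference
    // cycles at a constant rate, not core clock cycles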
    for (uint32_t i = 0; i < TEST_SIZE; ++i) {
        OP1(t_region, y);
        bench_do_not_optimize_out(t_region);
    }
    bench_flush_all_pending();
    uint64_t end_cycles = __rdtsc();

    __atomic_fetch_add(&total_cycles,
                       (end_cycles - start_cycles),
                       __ATOMIC_RELAXED);
    return NULL;
}

void *
run2(void * targ) {

    uint64_t * t_region = (uint64_t *)targ;

    const uint64_t y = rand();

    // page in memory / get in cache
    *t_region = 0;

    pthread_barrier_wait(&b);

    uint64_t start_cycles = __rdtsc();
    for (uint32_t i = 0; i < TEST_SIZE; ++i) {
        OP2(t_region, y);
        bench_do_not_optimize_out(t_region);
    }
    bench_flush_all_pending();
    uint64_t end_cycles = __rdtsc();

    __atomic_fetch_add(&total_cycles,
                       (end_cycles - start_cycles),
                       __ATOMIC_RELAXED);
    return NULL;
}


void
test() {
    init();

    pthread_t * tids = (pthread_t *)calloc(NTHREADS, sizeof(pthread_t));
    assert(tids != NULL);

    cpu_set_t cset;
    CPU_ZERO(&cset);

    const uint32_t stack_size = (1 << 18);

    uint32_t ncores = sysconf(_SC_NPROCESSORS_ONLN);
    for (uint32_t i = 0; i < NTHREADS; ++i) {
        CPU_SET(i % ncores, &cset);
        pthread_attr_t attr;
        assert(!pthread_attr_init(&attr));
        assert(!pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cset));
        assert(!pthread_attr_setstacksize(&attr, stack_size));

#if PARTITION_PER_CORE
        uint64_t * t_region = r + (i % ncores) * (PARTITION / sizeof(uint64_t));
#else
        uint64_t * t_region = r + (i) * (PARTITION / sizeof(uint64_t));
#endif
        if (i % 2) {
            assert(!pthread_create(tids + i, &attr, run1, (void *)t_region));
        }
        else {
            assert(!pthread_create(tids + i, &attr, run2, (void *)t_region));
        }

        CPU_ZERO(&cset);
        assert(!pthread_attr_destroy(&attr));
    }

    for (uint32_t i = 0; i < NTHREADS; ++i) {
        pthread_join(tids[i], NULL);
    }

    free(tids);
}


int
main(int argc, char ** argv) {

    double results[NLOOPS];
    for(uint32_t i = 0; i < NLOOPS; ++i) {
        test();

        double cycles_per_op = total_cycles;
        cycles_per_op /= NTHREADS * TEST_SIZE;

        results[i] = cycles_per_op;
    }

    char buf[256] = "";
    snprintf(buf, sizeof(buf), "%s", argv[0]);

    uint32_t len       = strlen(buf);
    uint32_t start_op1 = 0, start_op2 = 0;

    for (uint32_t i = 0; i < len; ++i) {
        if (start_op1 == 0 && buf[i] == '-') {
            start_op1 = i + 1;
        }
        else if (buf[i] == '-') {
            start_op2 = i + 1;
            buf[i]    = 0;
        }
    }

    fprintf(stderr,
            "Results: %s\n\t"
            "nthreads           : %d\n\t"
            "partition size     : %d\n\t"
            "partition_per_core : %s\n\t"
            "Op1                : %s\n\t"
            "Op2                : %s\n\t"
            "Cycles Per Op      : %.3lf",
            argv[0],
            NTHREADS,
            PARTITION,
            PARTITION_PER_CORE ? "true" : "false",
            buf + start_op1,
            buf + start_op2,
            results[0]);
    for(uint32_t i = 1; i < NLOOPS; ++i) {
        fprintf(stderr, ", %.3lf", results[i]);
    }
    fprintf(stderr, "\n\n");

    assert(!munmap(r, sysconf(_SC_PAGE_SIZE)));
    assert(!pthread_barrier_destroy(&b));
}

and then I've been using the following script to run it:

#! /bin/bash

CFLAGS="-O3 -march=native -mtune=native"
LDFLAGS="-lpthread"

# NOTE: nthreads * partition must be <= PAGE_SIZE (the test intentionally
#       fails an assertion if the memory would span multiple pages)

# nthreads          : number of threads performing operations

# op1               : odd tids will perform op1 (tid is the order the threads
#                     were created in, starting from 0)

# partition_per_core: boolean; if true, multiple threads pinned to the
#                     same core will share the same destination, if
#                     false each thread will have a unique memory
#                     destination (this can be ignored if nthreads <= ncores)

# op2               : even tids will perform op2 (tid is the order the threads
#                     were created in, starting from 0)

# partition         : space between destinations (i.e. partition
#                     destinations by cache line size or 2 * cache
#                     line size)


for nthreads in 8; do    
    for partition_per_core in 1; do
        for op1 in ADD XOR ADD_ATOMIC XADD_ATOMIC XOR_ATOMIC XOR_CAS_LOOP; do
            for op2 in ADD XOR ADD_ATOMIC XADD_ATOMIC XOR_ATOMIC XOR_CAS_LOOP; do
                for partition in 64 128; do
                    g++ ${CFLAGS} test_atomics.cc -o test-${op1}-${op2} -DPARTITION=${partition} -DNTHREADS=${nthreads} -DPARTITION_PER_CORE=${partition_per_core} -DOP1=${op1} -DOP2=${op2} ${LDFLAGS};                  
                    ./test-${op1}-${op2};
                    rm -f test-${op1}-${op2};
                done
                echo "--------------------------------------------------------------------------------"
            done
            echo "--------------------------------------------------------------------------------"
            echo "--------------------------------------------------------------------------------"
        done
        echo "--------------------------------------------------------------------------------"
        echo "--------------------------------------------------------------------------------"
        echo "--------------------------------------------------------------------------------"
    done
    echo "--------------------------------------------------------------------------------"
    echo "--------------------------------------------------------------------------------"
    echo "--------------------------------------------------------------------------------"
    echo "--------------------------------------------------------------------------------"
done


Some results make sense. For example, with `lock xaddq` I get:

Results: ./test-XADD_ATOMIC-XADD_ATOMIC
    nthreads           : 8
    partition size     : 64
    partition_per_core : true
    Op1                : XADD_ATOMIC
    Op2                : XADD_ATOMIC
    Cycles Per Op      : 21.547, 20.089, 38.852, 26.723, 25.934

Results: ./test-XADD_ATOMIC-XADD_ATOMIC
    nthreads           : 8
    partition size     : 128
    partition_per_core : true
    Op1                : XADD_ATOMIC
    Op2                : XADD_ATOMIC
    Cycles Per Op      : 19.607, 19.187, 19.483, 18.857, 18.721

which shows an improvement across the board with the 128-byte partition. In general I have seen that a 128-byte partition is beneficial for atomic operations EXCEPT for CAS loops, e.g.:

Results: ./test-XOR_CAS_LOOP-XOR_CAS_LOOP
    nthreads           : 8
    partition size     : 64
    partition_per_core : true
    Op1                : XOR_CAS_LOOP
    Op2                : XOR_CAS_LOOP
    Cycles Per Op      : 20.273, 20.061, 20.737, 21.240, 21.747

Results: ./test-XOR_CAS_LOOP-XOR_CAS_LOOP
    nthreads           : 8
    partition size     : 128
    partition_per_core : true
    Op1                : XOR_CAS_LOOP
    Op2                : XOR_CAS_LOOP
    Cycles Per Op      : 20.632, 20.432, 21.710, 22.627, 23.070

which gives basically the opposite of the expected result. I have also been unable to see any difference between 64-byte and 128-byte partition sizes when mixing atomic with non-atomic operations, e.g.:

Results: ./test-XADD_ATOMIC-ADD
    nthreads           : 8
    partition size     : 64
    partition_per_core : true
    Op1                : XADD_ATOMIC
    Op2                : ADD
    Cycles Per Op      : 11.117, 11.186, 11.223, 11.169, 11.138

Results: ./test-XADD_ATOMIC-ADD
    nthreads           : 8
    partition size     : 128
    partition_per_core : true
    Op1                : XADD_ATOMIC
    Op2                : ADD
    Cycles Per Op      : 11.126, 11.157, 11.072, 11.227, 11.080

What I am curious about is:

  1. Why doesn't the CAS loop behave like the rest of the atomics in terms of the best partition size?

  2. If my understanding is correct that sharing in the L2 cache is what makes a 128-byte partition more effective, why does this only seem to affect atomic operations? Non-atomic operations should still be constantly invalidating cache lines.

  3. Is there a general rule for understanding when a 128-byte partition is needed versus when you can save memory with a 64-byte partition?

Note: I am not asking, nor do I expect, anyone to read my code. I only included it as context for the question / where I got my numbers. If you have questions or concerns about the benchmarking method, comment and I'll reply.

  • *sharing in the L2 cache* - No, L2 caches are per-core private on Intel CPUs. What makes 128 special is the L2 spatial prefetcher that tries to complete 128-byte pairs of 2 cache lines, guessing that if one line has been accessed, it's likely the other will be too. This can "steal" it destructively from another core in cases where that heuristic is wrong. – Peter Cordes Sep 04 '20 at 00:44
  • Note that non-atomic operations can queue bursts of work via the store buffer, making them more resistant to losing access to a cache line for some time. IIRC, this point came up in some other Q&A a while ago, probably about false sharing in the same cache line. – Peter Cordes Sep 04 '20 at 00:46
  • Regarding your first point about 128-byte pairs: will the first cache line in the pair always be 128-byte aligned? Or is it just the "next" cache line (next being defined by whichever adjacent cache line appears most likely)? Asking because people seemed to advocate for 128-byte alignment, but if it's the "next" cache line then it seems that cache line padding is the best approach. – Noah Sep 04 '20 at 02:22
  • It's a 128-byte *aligned* pair of 2 cache lines; that's why 128-byte alignment makes sense, and why the spatial prefetcher doesn't cause problems across 128-byte boundaries. You could have checked Intel's optimization manual for more about HW prefetchers. – Peter Cordes Sep 04 '20 at 02:29
  • @PeterCordes your expertise might be useful [here](https://marc.info/?t=162640319200001&r=1&w=2) if you're interested. – Noah Jul 16 '21 at 17:22
  • Your comments in https://marc.info/?l=gcc-patches&m=162645607916585&w=2 match my limited understanding of what the L2 spatial prefetcher would try to do, and that it's not just SnB specifically, but the whole SnB-family including Ice Lake that has it. It might have some heuristics that try to avoid destructive interference, or maybe not, IDK. It should be possible to create a microbenchmark to detect the effect, like maybe one pair of cores sharing one line and another pair sharing the sibling (or a distant line), and see if L2 spatial creates extra contention between pairs. – Peter Cordes Jul 16 '21 at 17:30
  • @PeterCordes pretty sure the model is incorrect and the spatial prefetcher only goes off memory accesses. Made a benchmark of `lock add` in a loop on cores 0,1,2,3 and saw no difference in RFOs with a 64- vs 128-byte partition (irrespective of counting l2 prefetch events). – Noah Jul 16 '21 at 21:25
  • Hmm, I wonder if L2 spatial avoids activating when a line has already ping-ponged, or if atomic RMW accesses are special (e.g. assumed not to have spatial locality?). L1 could attach hints to the request to L2 based on the kind of access. Pure-store and pure-load could be different. Maybe test with one thread reading odd cache lines of an array, and another thread reading evens, to confirm destructive interference, and see if it still happens on L3 hits, or with stores. – Peter Cordes Jul 16 '21 at 21:32
  • I did (well, did partition == 32 and was able to confirm RFOs scaled with loops). It's essentially the same code as above with `PARTITION=32` (or 64/128) and `perf stat` collecting RFO requests. Possibly a microcode update but I doubt it. Is there a good way to search all updates in the last year or so? Re: "L2 spatial avoids activating when a line has already ping-ponged", what makes the most sense to me is that it's memory only. Only ddr4 (most common I think): 128 bytes is both channels, so it's cheaper than normal because of multiplexing. – Noah Jul 17 '21 at 01:13
  • @Noah - interesting result with `lock xadd`. My impression has been that 64 bytes is "fine", you don't need to go to 128 to get the non-contention benefit, but I was never sure of the mechanism (e.g., prefetch disabled based on history, or prefetch simply being dropped if the other line is already owned by another core). However, I never really tested it extensively and your test shows that for non-CAS atomics, this _may_ be a problem? – BeeOnRope Jul 18 '21 at 07:36
  • You could turn off that specific prefetcher and run the test again to confirm that it was actually the spatial prefetcher... – BeeOnRope Jul 18 '21 at 07:36
  • @BeeOnRope and @PeterCordes I think it has something to do with hyperthreading and I can only reproduce the results on Skylake (not on icelake/tigerlake). With a 64-byte partition on Skylake AND with a thread on each logical core in use, I see a performance dropoff and an increase in RFOs compared with a 128-byte partition. Only using physical cores, however, there is no performance dropoff or RFO increase. The RFO increase is a bit odd. It seems to oscillate between a low value / high value (`.1 * Iterations` vs `1 * Iterations`) with about a 50/50 distribution. – Noah Jul 18 '21 at 21:31
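
The RFO counting mentioned in the comments above can be done with something along these lines (a sketch; these are Skylake event names, check `perf list` on your machine):

perf stat -e l2_rqsts.all_rfo,offcore_requests.demand_rfo ./test-XADD_ATOMIC-XADD_ATOMIC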
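
To check whether the spatial prefetcher is involved (as BeeOnRope suggests), it can be toggled with msr-tools; Intel documents bit 1 of MSR 0x1a4 as the L2 adjacent cache line prefetcher disable (this sketch assumes the register's other bits are at their default of 0, i.e. the other prefetchers stay enabled):

sudo modprobe msr
sudo wrmsr -a 0x1a4 0x2   # set bit 1: disable the L2 adjacent-line (spatial) prefetcher on all cores
# ... rerun the benchmark ...
sudo wrmsr -a 0x1a4 0x0   # re-enable it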
