Hardware: Intel Skylake
This is based on the question "Understanding std::hardware_destructive_interference_size and std::hardware_constructive_interference_size".
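For context, here is the intended use of the two constants as I understand it (a minimal sketch, separate from the benchmark below; PaddedCounter and HotPair are made-up names, and the constants need a standard library that defines them, e.g. recent libstdc++ with -std=c++17):

#include <atomic>
#include <cstdint>
#include <new>
// Hot data written by different threads: align by the *destructive*
// size so two counters never share an interfering region (128 bytes is
// the value usually cited for recent Intel).
struct alignas(std::hardware_destructive_interference_size) PaddedCounter {
    std::atomic<uint64_t> value{0};
};
// Data used together by one thread: keep it within the *constructive*
// size (typically one 64-byte line) so a single fetch brings all of it in.
struct HotPair {
    uint64_t first;
    uint64_t second;
};
static_assert(sizeof(HotPair) <= std::hardware_constructive_interference_size,
              "HotPair should fit in one constructively shared region");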
I tried creating some benchmarks to test this and see how I could best use these sizes, but the results have been weird. Here is the benchmark code:
#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <x86intrin.h>
#define NLOOPS 5
#ifndef PARTITION_PER_CORE
#define PARTITION_PER_CORE 1
#endif
#ifndef NTHREADS
#define NTHREADS 8
#endif
#ifndef PARTITION
#define PARTITION 64
#endif
#ifndef TEST_SIZE
#define TEST_SIZE (1000000u)
#endif
#define XOR_ATOMIC(X, Y) __atomic_fetch_xor(X, Y, __ATOMIC_RELAXED)
/* Keeping the result of __atomic_fetch_xor alive in a volatile sink
   forces GCC to emit a load + lock cmpxchg retry loop, hence the name
   (see the note after the code). */
#define XOR_CAS_LOOP(X, Y)                                                    \
    do {                                                                      \
        volatile uint64_t _sink = __atomic_fetch_xor(X, Y, __ATOMIC_RELAXED); \
    } while (0)
#define ADD_ATOMIC(X, Y) __atomic_fetch_add(X, Y, __ATOMIC_RELAXED)
/* Same trick for fetch_add: keeping the old value alive makes GCC emit
   lock xadd instead of lock add. */
#define XADD_ATOMIC(X, Y)                                                     \
    do {                                                                      \
        volatile uint64_t _sink = __atomic_fetch_add(X, Y, __ATOMIC_RELAXED); \
    } while (0)
#define XOR(X, Y) *(X) ^= Y
#define ADD(X, Y) *(X) += (Y)
#ifndef OP1
#define OP1(X, Y) ADD(X, Y)
#endif
#ifndef OP2
#define OP2(X, Y) ADD(X, Y)
#endif
// compiler-only barriers: prevent reordering / elision without emitting code
#define bench_flush_all_pending() asm volatile("" : : : "memory")
#define bench_do_not_optimize_out(X) asm volatile("" : : "r,m"(X) : "memory")
uint64_t * r;
pthread_barrier_t b;
uint64_t total_cycles;
void
init() {
#if PARTITION_PER_CORE
    assert(sysconf(_SC_NPROCESSORS_ONLN) * PARTITION <= sysconf(_SC_PAGE_SIZE));
#else
    assert(NTHREADS * PARTITION <= sysconf(_SC_PAGE_SIZE));
#endif
    r = (uint64_t *)mmap(NULL,
                         sysconf(_SC_PAGE_SIZE),
                         (PROT_READ | PROT_WRITE),
                         (MAP_ANONYMOUS | MAP_PRIVATE),
                         (-1),
                         0);
    assert(r != (uint64_t *)MAP_FAILED); // mmap returns MAP_FAILED, not NULL, on error
    assert(!pthread_barrier_init(&b, NULL, NTHREADS));
    total_cycles = 0;
}
void *
run1(void * targ) {
    uint64_t * t_region = (uint64_t *)targ;
    const uint64_t y = rand();
    // page in memory / get in cache
    *t_region = 0;
    pthread_barrier_wait(&b);
    uint64_t start_cycles = __rdtsc();
    for (uint32_t i = 0; i < TEST_SIZE; ++i) {
        OP1(t_region, y);
        bench_do_not_optimize_out(t_region);
    }
    bench_flush_all_pending();
    uint64_t end_cycles = __rdtsc();
    __atomic_fetch_add(&total_cycles,
                       (end_cycles - start_cycles),
                       __ATOMIC_RELAXED);
    return NULL;
}
void *
run2(void * targ) {
    uint64_t * t_region = (uint64_t *)targ;
    const uint64_t y = rand();
    // page in memory / get in cache
    *t_region = 0;
    pthread_barrier_wait(&b);
    uint64_t start_cycles = __rdtsc();
    for (uint32_t i = 0; i < TEST_SIZE; ++i) {
        OP2(t_region, y);
        bench_do_not_optimize_out(t_region);
    }
    bench_flush_all_pending();
    uint64_t end_cycles = __rdtsc();
    __atomic_fetch_add(&total_cycles,
                       (end_cycles - start_cycles),
                       __ATOMIC_RELAXED);
    return NULL;
}
void
test() {
    init();
    pthread_t * tids = (pthread_t *)calloc(NTHREADS, sizeof(pthread_t));
    assert(tids != NULL);
    cpu_set_t cset;
    CPU_ZERO(&cset);
    const uint32_t stack_size = (1 << 18);
    uint32_t ncores = sysconf(_SC_NPROCESSORS_ONLN);
    for (uint32_t i = 0; i < NTHREADS; ++i) {
        CPU_SET(i % ncores, &cset);
        pthread_attr_t attr;
        assert(!pthread_attr_init(&attr));
        assert(!pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cset));
        assert(!pthread_attr_setstacksize(&attr, stack_size));
#if PARTITION_PER_CORE
        // threads pinned to the same core share a destination
        uint64_t * t_region = r + (i % ncores) * (PARTITION / sizeof(uint64_t));
#else
        // each thread gets its own destination
        uint64_t * t_region = r + (i) * (PARTITION / sizeof(uint64_t));
#endif
        if (i % 2) {
            assert(!pthread_create(tids + i, &attr, run1, (void *)t_region));
        }
        else {
            assert(!pthread_create(tids + i, &attr, run2, (void *)t_region));
        }
        CPU_ZERO(&cset);
        assert(!pthread_attr_destroy(&attr));
    }
    for (uint32_t i = 0; i < NTHREADS; ++i) {
        pthread_join(tids[i], NULL);
    }
    free(tids);
    // tear down per-iteration state here: init() runs again on the next
    // loop, and re-initializing a live barrier is undefined behavior
    // (and each mmap would otherwise leak a page)
    assert(!munmap(r, sysconf(_SC_PAGE_SIZE)));
    assert(!pthread_barrier_destroy(&b));
}
int
main(int argc, char ** argv) {
    double results[NLOOPS];
    for (uint32_t i = 0; i < NLOOPS; ++i) {
        test();
        double cycles_per_op = total_cycles;
        cycles_per_op /= NTHREADS * TEST_SIZE;
        results[i] = cycles_per_op;
    }
    // recover the op names from the binary name (test-<op1>-<op2>)
    char buf[64] = "";
    snprintf(buf, sizeof(buf), "%s", argv[0]); // strcpy could overflow buf
    uint32_t len = strlen(buf);
    uint32_t start_op1 = 0, start_op2 = 0;
    for (uint32_t i = 0; i < len; ++i) {
        if (start_op1 == 0 && buf[i] == '-') {
            start_op1 = i + 1;
        }
        else if (buf[i] == '-') {
            start_op2 = i + 1;
            buf[i] = 0;
        }
    }
    fprintf(stderr,
            "Results: %s\n\t"
            "nthreads : %d\n\t"
            "partition size : %d\n\t"
            "partition_per_core : %s\n\t"
            "Op1 : %s\n\t"
            "Op2 : %s\n\t"
            "Cycles Per Op : %.3lf",
            argv[0],
            NTHREADS,
            PARTITION,
            PARTITION_PER_CORE ? "true" : "false",
            buf + start_op1,
            buf + start_op2,
            results[0]);
    for (uint32_t i = 1; i < NLOOPS; ++i) {
        fprintf(stderr, ", %.3lf", results[i]);
    }
    fprintf(stderr, "\n\n");
    return 0;
}
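A note on the XOR_CAS_LOOP name: x86 has no instruction that atomically xors memory and hands back the old value, so when the result of __atomic_fetch_xor is actually consumed, GCC has to synthesize it with a load + lock cmpxchg retry loop; when the result is discarded it can emit a single lock xor. The volatile sink in the macro exists to force the former. A minimal illustration of the difference (my reading of GCC's x86-64 codegen; worth confirming against your own -S output):

#include <stdint.h>
// Result discarded: GCC can emit a single `lock xorq`.
void xor_direct(uint64_t * x, uint64_t y) {
    __atomic_fetch_xor(x, y, __ATOMIC_RELAXED);
}
// Result used: nothing returns the old value for xor in one shot, so
// GCC emits a load + `lock cmpxchg` retry loop -- an actual CAS loop.
uint64_t xor_fetch_old(uint64_t * x, uint64_t y) {
    return __atomic_fetch_xor(x, y, __ATOMIC_RELAXED);
}
// fetch_add is different: `lock xaddq` returns the old value directly,
// which is why XADD_ATOMIC stays a single instruction.
uint64_t add_fetch_old(uint64_t * x, uint64_t y) {
    return __atomic_fetch_add(x, y, __ATOMIC_RELAXED);
}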
I have been using the following script to build and run it:
#! /bin/bash
CFLAGS="-O3 -march=native -mtune=native"
LDFLAGS="-lpthread"
# NOTE: nthreads * partition must be <= PAGE_SIZE (the test
# intentionally fails an assertion if the memory would span
# multiple pages)
# nthreads           : number of threads performing operations
# op1                : odd tids perform op1 (tid is the order threads
#                      were created in, starting from 0)
# op2                : even tids perform op2
# partition_per_core : boolean; if true, multiple threads pinned to the
#                      same core share the same destination; if false,
#                      each thread gets a unique destination (moot when
#                      nthreads <= ncores)
# partition          : spacing between destinations (e.g. partition
#                      destinations by cache line size or 2x cache
#                      line size)
for nthreads in 8; do
for partition_per_core in 1; do
for op1 in ADD XOR ADD_ATOMIC XADD_ATOMIC XOR_ATOMIC XOR_CAS_LOOP; do
for op2 in ADD XOR ADD_ATOMIC XADD_ATOMIC XOR_ATOMIC XOR_CAS_LOOP; do
for partition in 64 128; do
g++ ${CFLAGS} test_atomics.cc -o test-${op1}-${op2} -DPARTITION=${partition} -DNTHREADS=${nthreads} -DPARTITION_PER_CORE=${partition_per_core} -DOP1=${op1} -DOP2=${op2} ${LDFLAGS};
./test-${op1}-${op2};
rm -f test-${op1}-${op2};
done
echo "--------------------------------------------------------------------------------"
done
echo "--------------------------------------------------------------------------------"
echo "--------------------------------------------------------------------------------"
done
echo "--------------------------------------------------------------------------------"
echo "--------------------------------------------------------------------------------"
echo "--------------------------------------------------------------------------------"
done
echo "--------------------------------------------------------------------------------"
echo "--------------------------------------------------------------------------------"
echo "--------------------------------------------------------------------------------"
echo "--------------------------------------------------------------------------------"
done
Some results make sense, e.g. with lock xaddq I get:
Results: ./test-XADD_ATOMIC-XADD_ATOMIC
nthreads : 8
partition size : 64
partition_per_core : true
Op1 : XADD_ATOMIC
Op2 : XADD_ATOMIC
Cycles Per Op : 21.547, 20.089, 38.852, 26.723, 25.934
Results: ./test-XADD_ATOMIC-XADD_ATOMIC
nthreads : 8
partition size : 128
partition_per_core : true
Op1 : XADD_ATOMIC
Op2 : XADD_ATOMIC
Cycles Per Op : 19.607, 19.187, 19.483, 18.857, 18.721
which shows an improvement across the board with the 128-byte partition. In general I have been able to see that the 128-byte partition is beneficial for atomic operations EXCEPT for CAS loops, e.g.:
Results: ./test-XOR_CAS_LOOP-XOR_CAS_LOOP
nthreads : 8
partition size : 64
partition_per_core : true
Op1 : XOR_CAS_LOOP
Op2 : XOR_CAS_LOOP
Cycles Per Op : 20.273, 20.061, 20.737, 21.240, 21.747
Results: ./test-XOR_CAS_LOOP-XOR_CAS_LOOP
nthreads : 8
partition size : 128
partition_per_core : true
Op1 : XOR_CAS_LOOP
Op2 : XOR_CAS_LOOP
Cycles Per Op : 20.632, 20.432, 21.710, 22.627, 23.070
gives basically the opposite of the expected result. I have also been unable to see any difference between 64-byte and 128-byte partition sizes when mixing atomic and non-atomic operations, e.g.:
Results: ./test-XADD_ATOMIC-ADD
nthreads : 8
partition size : 64
partition_per_core : true
Op1 : XADD_ATOMIC
Op2 : ADD
Cycles Per Op : 11.117, 11.186, 11.223, 11.169, 11.138
Results: ./test-XADD_ATOMIC-ADD
nthreads : 8
partition size : 128
partition_per_core : true
Op1 : XADD_ATOMIC
Op2 : ADD
Cycles Per Op : 11.126, 11.157, 11.072, 11.227, 11.080
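In case the layout is the suspect: here is the slot arithmetic the benchmark uses, spelled out (illustration only; it mirrors t_region = r + i * (PARTITION / sizeof(uint64_t)) above, and with nthreads <= ncores the two layout branches coincide):

#include <stdint.h>
#include <stdio.h>
// Print which aligned 128-byte pair of lines each thread's slot falls
// into for a given partition size.
static void show_layout(uint32_t nthreads, uint32_t partition) {
    printf("partition = %u:\n", partition);
    for (uint32_t i = 0; i < nthreads; ++i) {
        uint32_t off = i * partition;
        printf("  tid %u -> byte offset %4u -> 128B pair %u\n", i, off, off / 128);
    }
}
int main() {
    show_layout(8, 64);  // tids 0/1, 2/3, ... share a 128-byte pair
    show_layout(8, 128); // every tid gets its own 128-byte pair
}

So with PARTITION=64 a lock'd op and a plain ADD can sit in the same 128-byte pair, which is exactly the case where I expected 128 to win and saw no difference.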
What I am curious about is:

1. Why is the CAS loop not like the rest of the atomics in terms of best partition size?

2. If my understanding is correct that sharing in the L2 cache is what makes the 128-byte partition more effective, why does this only seem to affect atomic operations? Non-atomic operations should still be constantly invalidating cache lines.

3. Is there a general rule for understanding when a 128-byte partition is needed versus when you can save memory with a 64-byte partition?
Note: I am not asking, nor do I expect, anyone to read my code. I only included it as context for the question and as the source of my numbers. If you have questions or concerns about the benchmarking method, comment and I'll reply.