I just wanted to test if my compiler recognizes ...
atomic<pair<uintptr_t, uintptr_t>
... and uses DWCASes on it (like x86-64 lock cmpxchg16b
) or if it supplements the pair with a usual lock.
So I first wrote a minimal program with a single noinline-function which does a compare and swap on a atomic pair. The compiler generated a lot of code for this which I didn't understand, and I didn't saw any LOCK-prefixed instructions in that. I was curious about whether the implementation places a lock within the atomic and printed a sizeof of the above atomic pair: 24 on a 64-bit-platform, so obviously without a lock.
At last I wrote a program which increments both portions of a single atomic pair by all the threads my system has (Ryzen Threadripper 64 core, Win10, SMT off) a predefined number of times. Then I calculated the time for each increment in nanoseconds. The time is rather high, about 20.000ns for each successful increment, so it first looked to me if there was a lock I overlooked; so this couldn't be true with a sizeof of this atomic of 24 bytes. And when I saw at the Processs Viewer I saw that all 64 cores were nearly at 100% user CPU time all the time - so there couldn't be any kernel-interventions.
So is there anyone here smarter than me and can identify what this DWCAS-substitute does from the assembly-dump ?
Here's my test-code:
#include <iostream>
#include <atomic>
#include <utility>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <chrono>
#include <vector>
using namespace std;
using namespace chrono;
struct uip_pair
{
uip_pair() = default;
uip_pair( uintptr_t first, uintptr_t second ) :
first( first ),
second( second )
{
}
uintptr_t first, second;
};
using atomic_pair = atomic<uip_pair>;
int main()
{
cout << "sizeof(atomic<pair<uintptr_t, uintptr_t>>): " << sizeof(atomic_pair) << endl;
atomic_pair ap( uip_pair( 0, 0 ) );
cout << "atomic<pair<uintptr_t, uintptr_t>>::is_lock_free: " << ap.is_lock_free() << endl;
mutex mtx;
unsigned nThreads = thread::hardware_concurrency();
unsigned ready = nThreads;
condition_variable cvReady;
bool run = false;
condition_variable cvRun;
atomic_int64_t sumDur = 0;
auto theThread = [&]( size_t n )
{
unique_lock<mutex> lock( mtx );
if( !--ready )
cvReady.notify_one();
cvRun.wait( lock, [&]() -> bool { return run; } );
lock.unlock();
auto start = high_resolution_clock::now();
uip_pair cmp = ap.load( memory_order_relaxed );
for( ; n--; )
while( !ap.compare_exchange_weak( cmp, uip_pair( cmp.first + 1, cmp.second + 1 ), memory_order_relaxed, memory_order_relaxed ) );
sumDur.fetch_add( duration_cast<nanoseconds>( high_resolution_clock::now() - start ).count(), memory_order_relaxed );
lock.lock();
};
vector<jthread> threads;
threads.reserve( nThreads );
static size_t const ROUNDS = 100'000;
for( unsigned t = nThreads; t--; )
threads.emplace_back( theThread, ROUNDS );
unique_lock<mutex> lock( mtx );
cvReady.wait( lock, [&]() -> bool { return !ready; } );
run = true;
cvRun.notify_all();
lock.unlock();
for( jthread &thr : threads )
thr.join();
cout << (double)sumDur / ((double)nThreads * ROUNDS) << endl;
uip_pair p = ap.load( memory_order_relaxed );
cout << "synch: " << (p.first == p.second ? "yes" : "no") << endl;
}
[EDIT]: I've extracted the compare_exchange_weak-function into a noinline-function and disassembled the code:
struct uip_pair
{
uip_pair() = default;
uip_pair( uintptr_t first, uintptr_t second ) :
first( first ),
second( second )
{
}
uintptr_t first, second;
};
using atomic_pair = atomic<uip_pair>;
#if defined(_MSC_VER)
#define NOINLINE __declspec(noinline)
#elif defined(__GNUC__) || defined(__clang__)
#define NOINLINE __attribute((noinline))
#endif
NOINLINE
bool cmpXchg( atomic_pair &ap, uip_pair &cmp, uip_pair xchg )
{
return ap.compare_exchange_weak( cmp, xchg, memory_order_relaxed, memory_order_relaxed );
}
mov eax, 1
mov r10, rcx
mov r9d, eax
xchg DWORD PTR [rcx], eax
test eax, eax
je SHORT label8
label1:
mov eax, DWORD PTR [rcx]
test eax, eax
je SHORT label7
label2:
mov eax, r9d
test r9d, r9d
je SHORT label5
label4:
pause
sub eax, 1
jne SHORT label4
cmp r9d, 64
jl SHORT label5
lea r9d, QWORD PTR [rax+64]
jmp SHORT label6
label5:
add r9d, r9d
label6:
mov eax, DWORD PTR [rcx]
test eax, eax
jne SHORT label2
label7:
mov eax, 1
xchg DWORD PTR [rcx], eax
test eax, eax
jne SHORT label1
label8:
mov rax, QWORD PTR [rcx+8]
sub rax, QWORD PTR [rdx]
jne SHORT label9
mov rax, QWORD PTR [rcx+16]
sub rax, QWORD PTR [rdx+8]
label9:
test rax, rax
sete al
test al, al
je SHORT label10
movups xmm0, XMMWORD PTR [r8]
movups XMMWORD PTR [rcx+8], xmm0
xor ecx, ecx
xchg DWORD PTR [r10], ecx
ret
label10:
movups xmm0, XMMWORD PTR [rcx+8]
xor ecx, ecx
movups XMMWORD PTR [rdx], xmm0
xchg DWORD PTR [r10], ecx
ret
Maybe someone understands the disassembly. Remember that XCHG is implicitly LOCK'ed on x86. It seems to me that MSVC uses some kind of software transactional memory here. I can extend the shared structure embedded in the atomic arbitrarily but the difference is still 8 bytes; so MSVC always uses some kind of STM.