
Just for the sake of learning, I'm trying to grasp how to use the HLE prefixes XACQUIRE and XRELEASE. After reading the Intel documentation, my understanding was that after an instruction executes with the XACQUIRE prefix, the CPU holds some sort of write lock until the matching XRELEASE-prefixed instruction. So I wrote the following test code to check whether I'm right. Evidently there's still something I don't understand, because my code sample fails.

So can someone tell me what I'm missing with those HLE prefixes?

It fails in two ways:

  1. The xtest instruction reports that HLE was not enabled, and

  2. My supposedly "mutex-ed" code doesn't actually run mutually exclusively, so it fails the concurrency check.

Below is the Windows C++ project, compiled with VS 2017. The x64 .asm file is as follows:

.code

testCPUID PROC
    push rbx

    ; CPUID.07h.EBX.HLE[bit 4]==1

    mov eax, 7h
    xor ecx, ecx
    cpuid
    and rbx, 1 shl 4

    mov rax, rbx
    pop rbx
    ret
testCPUID ENDP



testHLEWrite PROC
    ; RCX = pointer to TST91 struct:
    ;       void* pPtrToNextWrite;
    ;       int nNextValue;
    ;       void* pCutoffPtr;
    ;       void* pBeginPtr;

    xor edx, edx
    xacquire xchg [rcx], rdx        ; I'm assuming that this will work as a mutex ...

    xtest                           ; Sanity check to see if HLE got enabled?
    jnz lbl_00                      ; If HLE is on => ZF=0
    int 3                           ; we get here if HLE did not get enabled
lbl_00:

    ; Do some nonsensical stuff
    ; The idea is to write sequential values into a shared array
    ; to see if the lock above holds
    ; Format:
    ;       > --16 sequential bytes-- <

    mov r8d, dword ptr [rcx + 8]

    mov byte ptr [rdx], '>'
    inc rdx

    ; Write 16 sequential bytes

    mov rax, 10h
lbl_01:
    mov byte ptr [rdx], r8b
    inc r8
    inc rdx
    dec rax
    jnz lbl_01

    mov byte ptr [rdx], '<'
    inc rdx

    cmp rdx, [rcx + 10h]            ; check if reached the end of buffer
    jb lbl_02
    mov rdx, [rcx + 18h]            ; reset ptr to the beginning of buffer
lbl_02:

    mov dword ptr [rcx + 8], r8d
    xrelease mov [rcx], rdx         ; this will release the mutex

    ret
testHLEWrite ENDP





testHLEForCorrectness PROC
    ; RCX = pointer to TST91 struct:
    ;       void* pPtrToNextWrite;
    ;       int nNextValue;
    ;       void* pCutoffPtr;
    ;       void* pBeginPtr;

    xor edx, edx
    xacquire xchg [rcx], rdx        ; I'm assuming that this will work as a mutex ...

    xtest                           ; Sanity check to see if HLE got enabled?
    jnz lbl_00                      ; If HLE is on => ZF=0
    int 3                           ; we get here if HLE did not get enabled
lbl_00:

    mov r9, [rcx + 18h]

lbl_repeat:
    cmp r9, rdx
    jae lbl_out

    cmp byte ptr [r9], '>'
    jnz lbl_bad
    cmp byte ptr [r9 + 1 + 10h], '<'
    jnz lbl_bad

    mov r8b, byte ptr [r9 + 1]
    sub eax, eax
lbl_01:
    cmp [r9 + rax + 1], r8b
    jnz lbl_bad
    inc rax
    inc r8
    cmp rax, 10h
    jb lbl_01

    add r9, 2 + 10h
    jmp lbl_repeat

lbl_out:

    xrelease mov [rcx], rdx         ; this will release the mutex

    ret

lbl_bad:
    ; Verification failed
    int 3

testHLEForCorrectness ENDP

END

And this is how it's called from the user-mode C++ project:

#include <assert.h>
#include <conio.h>      // _gettch
#include <stdio.h>
#include <tchar.h>      // _tprintf
#include <Windows.h>

struct TST91{
    BYTE* pNextWrite;
    int nNextValue;
    BYTE* pCutoffPtr;
    BYTE* pBeginPtr;
};

extern "C" {
    BOOL testCPUID(void);
    void testHLEWrite(TST91* p);
    void testHLEForCorrectness(TST91* p);
};

DWORD WINAPI ThreadProc01(LPVOID lpParameter);

TST91* gpStruct = NULL;
BYTE* gpMem = NULL;             //Its size is 'gszcbMemSize' BYTEs
const size_t gszcbMemSize = 0x1000 * 8;

int main()
{
    if(testCPUID())
    {
        gpStruct = new TST91;
        gpMem = new BYTE[gszcbMemSize];

        gpStruct->pNextWrite = gpMem;
        gpStruct->nNextValue = 1;
        gpStruct->pBeginPtr = gpMem;
        gpStruct->pCutoffPtr = gpMem + gszcbMemSize - 0x100;

        for(int t = 0; t < 5; t++)
        {
            CloseHandle(CreateThread(NULL, 0, 
                ThreadProc01, (VOID*)(1LL << t), 0, NULL));
        }

        _gettch();

        delete gpStruct;
        delete[] gpMem;
    }
    else
        _tprintf(L"Your CPU doesn't support HLE\n");

   return 0;
}

DWORD WINAPI ThreadProc01(LPVOID lpParameter)
{
    if(!SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)lpParameter))
    {
        assert(NULL);
    }

    for(;;)
    {
        testHLEWrite(gpStruct);
        testHLEForCorrectness(gpStruct);
    }

    return 0;
}
MikeF
  • What CPU are you running this on? – Peter Cordes Jul 30 '18 at 03:17
  • Note that with HLE (unlike RTM), [the abort address is the address of the `xacquire`-enabled instruction](http://felixcloutier.com/x86/XACQUIRE:XRELEASE.html). So reaching the `int3` probably happens after the transaction aborts for some reason. – Peter Cordes Jul 30 '18 at 03:19
  • @PeterCordes: It's a Haswell i7 that supports it. I had to post an answer below. As for the `xtest` instruction with HLE, it's still a mystery to me how it works. Maybe you can shed some light on it? – MikeF Jul 30 '18 at 03:21
  • So you disabled microcode updates on your Haswell, for testing this? Or was it only RTM that they disabled in Haswell, and then again in early Broadwell, after corner-case bugs were discovered? [Which CPUs support TSX, with the erratum fixed?](https://superuser.com/q/894950). I thought there were no Haswell CPUs with working TSX (RTM + HLE), when using up-to-date microcode. – Peter Cordes Jul 30 '18 at 03:22
  • @PeterCordes: No, I didn't disable anything. It's just my development CPU as-is. I don't have any newer CPU to test it on. – MikeF Jul 30 '18 at 03:25
  • Then your CPU probably *doesn't* support HLE, which would explain the "surprising" behaviour of `xtest`. https://en.wikipedia.org/wiki/Transactional_Synchronization_Extensions doesn't say anything about a later Haswell stepping with the bug fixed (but it doesn't mention that late Broadwell, and all Skylake, have working TSX, so it's not conclusive). It's possible that the microcode update doesn't update what CPUID reports, so maybe it reports that it has HLE but actually ignores the prefix (i.e. acts like it aborted right away every time). – Peter Cordes Jul 30 '18 at 03:34
  • Either that or you were trying to do too much inside a single transaction. I see you have a loop, but I didn't follow it in enough detail to see how many cache lines would be part of the transaction. – Peter Cordes Jul 30 '18 at 03:39

2 Answers

5

You can answer your own questions, can't you?

Anyway, I think I got it. I'll try to stick with plain English and describe it the way I understand it. Feel free to edit this if I make an incorrect statement. (By the way, Hardware Lock Elision, what a cool name. Sounds like some Matt Damon movie. I even had to Google the word "elision" to understand what it means... and I still don't remember it.)

So this HLE concept is nothing more than a hint for the CPU to treat the lock prefix in a more optimized way. The lock prefix by itself is somewhat "expensive" for modern processors to execute efficiently. So when a CPU that supports HLE sees the prefix, it will initially not acquire the lock, and will actually acquire it only if there is a read/write conflict. In that case the CPU issues an HLE abort, which in turn makes the region re-execute with a conventional lock.

Moreover, the XACQUIRE prefix is encoded as F2 and XRELEASE as F3, which are nothing more than the old-school REPNE and REP prefixes; they are simply ignored on a lock-able instruction by older CPUs that don't support HLE. What all this means is that to use HLE one doesn't need to check for its support with the CPUID instruction and can safely use the prefixes as-is. Older CPUs will ignore them and treat the accompanying lock prefix as an actual lock, while newer CPUs will take them as an optimization hint. In other words, adding XACQUIRE and XRELEASE to your own implementation of a mutex, semaphore, you name it, will not hurt anything.

Having said that, I rewrote my original test code as follows (just the relevant concurrency parts of a very basic mutex-type lock).

ASM code to enter the lock:

testHLEWrite PROC
    ; RCX = pointer to TST91 struct:
    ;       void* pPtrToNextWrite;
    ;       int nNextValue;
    ;       void* pCutoffPtr;
    ;       void* pBeginPtr;
    ;       size_t lock;          <-- new member

lbl_retry:
    xacquire lock bts qword ptr [rcx + 20h], 1      ; Try to acquire lock (use HLE hint prefix)
    jnc lbl_locked
    pause                       ; Will issue an implicit HLE abort
    jmp lbl_retry


lbl_locked:

and then to leave the lock:

(Note that the XRELEASE prefix differs from the lock prefix in that it can also be applied to a mov instruction with a memory destination operand.)

    xrelease mov qword ptr [rcx + 20h], 0       ; Release the lock (use HLE prefix hint)

    ret
testHLEWrite ENDP

Also, if you want to write it in C using Visual Studio's intrinsics:

#include <intrin.h>     //for the _HLE* intrinsics and _mm_pause

//Some variable to hold the lock
volatile long lock = 0;

and then the code itself:

//Acquire the lock
while(_interlockedbittestandset_HLEAcquire((long *)&lock, 1))
{
    _mm_pause();
}

and then:

//Leave the lock
_Store_HLERelease(&lock, 0);

Lastly, I want to point out that I haven't done any timing/benchmark tests comparing the performance of this code with and without the HLE prefixes. So if someone wants to do that (and check the validity of the HLE concept), you're welcome to it; I'll be glad to learn the results as well.
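
In case it helps anyone get started, here is a rough, untested sketch of the kind of harness I have in mind (the thread function, the counters and the 5-second window are all made up for illustration). Each thread hammers the same lock for a fixed time and reports how many critical sections it got through, so you can rebuild with and without the HLE hints and compare:

#include <intrin.h>
#include <Windows.h>

volatile long gLock = 0;

//Set to 0 to rebuild without the HLE hints and compare throughput
#define USE_HLE 1

void lockIt()
{
#if USE_HLE
    while(_interlockedbittestandset_HLEAcquire((long*)&gLock, 1))
        _mm_pause();                            //implicit HLE abort + spin
#else
    while(_interlockedbittestandset((long*)&gLock, 1))
        _mm_pause();
#endif
}

void unlockIt()
{
#if USE_HLE
    _Store_HLERelease(&gLock, 0);               //restores the elided value
#else
    _InterlockedExchange(&gLock, 0);
#endif
}

DWORD WINAPI BenchThreadProc(LPVOID lpParameter)
{
    long long localCount = 0;                   //per-thread, so the counter itself isn't contended
    ULONGLONG tEnd = GetTickCount64() + 5000;   //run for roughly 5 seconds

    while(GetTickCount64() < tEnd)
    {
        lockIt();
        //... touch the shared data here ...
        unlockIt();
        localCount++;
    }

    *(long long*)lpParameter = localCount;      //report back to the main thread
    return 0;
}

To see a meaningful difference, the threads should probably spend the critical section touching mostly non-conflicting data, so that the lock itself is the only truly shared cache line.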

MikeF
  • Yes, answering your own question is allowed, and in fact encouraged when you solve your own problem. – Peter Cordes Jul 30 '18 at 03:20
  • The Matt Damon comment is a little chatty for an answer but I suppose we can let it pass because you do have a point :) anyhow it probably comes from here - http://pages.cs.wisc.edu/~rajwar/papers/micro01.pdf (although the general concept called "transactional memory" predates that) – Leeor Jul 31 '18 at 08:50
4

You say your CPU is a Haswell.

TSX (HLE and RTM) was disabled by a microcode update for all Haswell CPUs. You're running Windows, so we can safely assume that your system uses up-to-date microcode automatically. (You don't have to flash your BIOS; the OS can install updated CPU microcode on every boot.)

See Which CPUs support TSX, with the erratum fixed?, and also https://en.wikipedia.org/wiki/Transactional_Synchronization_Extensions. I can't rule out some new stepping of Haswell having working TSX, but the most likely explanation for xtest setting ZF is that the microcode update doesn't disable decoding of TSX instructions (otherwise xtest would #UD), but does disable ever actually entering a transactional region. (i.e. treats every transaction as aborting right away.)
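
(For reference, reading those feature bits from C++ rather than raw asm looks roughly like the sketch below. It uses MSVC's __cpuidex, and, as discussed above, a set bit doesn't necessarily guarantee that the microcode will actually let a transaction commit.)

#include <intrin.h>
#include <stdio.h>

int main()
{
    int regs[4];                        //EAX, EBX, ECX, EDX
    __cpuidex(regs, 7, 0);              //leaf 07H, subleaf 0

    int hle = (regs[1] >> 4) & 1;       //CPUID.07H:EBX.HLE[bit 4]
    int rtm = (regs[1] >> 11) & 1;      //CPUID.07H:EBX.RTM[bit 11]

    printf("HLE: %d  RTM: %d\n", hle, rtm);
    return 0;
}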

If that's the case, then xacquire xchg would execute the same as a normal xchg, running the later instructions non-transactionally. (Unlike with RTM (xbegin), where the abort address is given separately.)
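
(To make that difference concrete, here is a rough sketch of one common way to write the RTM version of lock elision, using the _xbegin / _xend / _xabort intrinsics. fallbackLock, retryLimit and the 0xff abort code are made-up placeholders; the point is that the abort path is an explicit branch on the value _xbegin returns, whereas with HLE execution simply restarts at the xacquire'd instruction and takes the lock for real.)

#include <intrin.h>
#include <immintrin.h>      //_xbegin / _xend / _xabort / _XBEGIN_STARTED

volatile long fallbackLock = 0;

void criticalSectionRTM()
{
    const int retryLimit = 10;

    for(int attempt = 0; attempt < retryLimit; attempt++)
    {
        unsigned status = _xbegin();
        if(status == _XBEGIN_STARTED)
        {
            if(fallbackLock)                    //someone holds the real lock: abort and retry
                _xabort(0xff);
            //... critical section runs transactionally here ...
            _xend();
            return;
        }
        //status says why the transaction aborted; here we simply retry a few times
    }

    //Fallback: take the real lock and run non-transactionally
    while(_interlockedbittestandset((long*)&fallbackLock, 0))
        _mm_pause();
    //... critical section ...
    _InterlockedExchange(&fallbackLock, 0);
}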


But if I'm wrong and you do somehow have a Haswell with working HLE, then we can look at other possible explanations for aborting the transaction (which would lead to reaching the int3 when we go through the critical section non-transactionally and reach the xtest).

I don't think your transaction is too large (touching too many cache lines can cause an abort, but I don't think that's the case here): you're only touching 1 or 2 lines, the line containing the pointer and the pointed-to line. David Kanter's guess about the internal implementation, using L1d as the transaction buffer, turned out to be correct when Haswell was released. (And AFAIK, Skylake still only uses L1d, not tracking the read-set or write-set in L2 or L3.)

An interrupt inside a transaction could cause occasional aborts, so don't be shocked to find that some transactions abort. Only if they always abort does it mean that you're doing something a transaction can't handle, or maybe that the CPU has HLE disabled.
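
(One rough way to quantify that, instead of trapping on the first failure like the int3 in the question, is to count how often the critical section actually runs without elision. A sketch, assuming the CPU at least decodes xtest without faulting; hleLock and nonTxRuns are placeholder names:)

#include <intrin.h>
#include <immintrin.h>      //_xtest

volatile long hleLock = 0;              //placeholder lock variable
volatile long long nonTxRuns = 0;       //executions where elision did not happen

void countedCriticalSection()
{
    while(_interlockedbittestandset_HLEAcquire((long*)&hleLock, 1))
        _mm_pause();

    if(!_xtest())                               //not inside a transaction: elision failed or aborted
        _InterlockedIncrement64(&nonTxRuns);    //safe: this write only happens non-transactionally

    //... real work ...

    _Store_HLERelease(&hleLock, 0);
}

Total executions can be counted per-thread outside the lock and compared with nonTxRuns afterwards; only if essentially every run lands in the non-transactional path does it point to HLE being off (or to something in the region that can never commit).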

The variable you use as the lock also has to satisfy certain properties.

The XACQUIRE manual entry:

The lock variables must satisfy the guidelines described in Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1, Section 16.3.3, for elision to be successful, otherwise an HLE abort may be signaled.

From SDM vol.1:

16.3.3 Requirements for HLE Locks

For HLE execution to successfully commit transactionally, the lock must satisfy certain properties and access to the lock must follow certain guidelines.

  • An XRELEASE-prefixed instruction must restore the value of the elided lock to the value it had before the lock acquisition. This allows hardware to safely elide locks by not adding them to the write-set. The data size and data address of the lock release (XRELEASE prefixed) instruction must match that of the lock acquire (XACQUIRE prefixed) and the lock must not cross a cache line boundary.

  • Software should not write to the elided lock inside a transactional HLE region with any instruction other than an XRELEASE prefixed instruction, otherwise it may cause a transactional abort. In addition, recursive locks (where a thread acquires the same lock multiple times without first releasing the lock) may also cause a transactional abort. Note that software can observe the result of the elided lock acquire inside the critical section. Such a read operation will return the value of the write to the lock.

The processor automatically detects violations to these guidelines, and safely transitions to a non-transactional execution without elision. Since Intel TSX detects conflicts at the granularity of a cache line, writes to data collocated on the same cache line as the elided lock may be detected as data conflicts by other logical processors eliding the same lock.

So your transaction could only commit if pPtrToNextWrite == pBeginPtr, because that's the value you're using to unlock, instead of the original value you read into rdx with xchg. It looks like it would be easier to just copy the register after doing the xchg to save that value before incrementing it in a loop.

But other than that, it's surprisingly flexible. It sounds like the hardware doesn't care if 0 means locked and 0xdeadbeef (or a pointer value) means available.

It's up to the programmer to design a correct locking scheme that doesn't store back the previous value if it found the lock was already taken, as well as protecting the critical section when running non-transactionally.

Peter Cordes
  • Thanks. I corrected it in my own answer. I too read through the Intel documentation and I am somewhat surprised that those TSX transactional regions can be aborted by a multitude of things, one of which is an interrupt. I then tried to come up with any real life application for those TSX locks and couldn't find a single one where I would use them. Maybe only as an anti-debugging mechanism (or anti-single-stepping with the TF/trap flag.) Other than that, what good does the lock do to me if it can be aborted (i.e. fail) in case of an interrupt that I can't control? – MikeF Jul 30 '18 at 06:25
  • @MikeF: The use-case for HLE is performance in the average case! Like I said, it avoids *actually* holding the cache-line containing the lock in Modified state, so avoids cache-line ping-pong from multiple cores sharing the same lock but modifying different non-conflicting parts of a large data structure. (e.g. a tree or hash table). Having 1 transaction in 10000 spuriously abort barely hurts throughput. – Peter Cordes Jul 30 '18 at 06:36
  • @MikeF: If you want to retry the transaction a few times before falling back to a non-transactional method, use RTM with xbegin / xend instead of HLE: you get to choose what happens on an `xbegin` abort, because it acts like a conditional branch (see the link at the top of this answer). You could easily keep a retry counter. – Peter Cordes Jul 30 '18 at 06:38
  • Also concerning your statement about TSX being shunted on my Haswell CPU, I just did quick-and-dirty benchmark tests with the updated locks from the code in my answer (not the original post.) One test: 5 threads running for 5 seconds with the use of `xacquire`/`xrelease` hints on the locks, as I showed there. I'm getting about 416k executions of my `testHLEWrite()` function on average. Second test: If I comment out `xacquire`/`xrelease` hints, I'm getting about 405k executions. So there must be something to it after all. – MikeF Jul 30 '18 at 06:40
  • @MikeF: A good test would be with both threads updating non-conflicting memory, so the only actually-shared memory was the lock itself. HLE could then in theory give more than a factor of 2 speedup. Or put `xor eax,eax` / `xtest` / `setz al` / `add [abort_counter], eax` inside the transaction and see if the number of non-transactional executions was smaller than the total number of executions. (Or use a conditional branch to only `add dword [abort_counter], 1` when needed.) – Peter Cordes Jul 30 '18 at 06:56
  • @PeterCordes I personally speculated that the read set and write set are actually lines marked in the L1d cache. When it reads, a snoop is sent; when it writes, an RFO is sent. If another core wishes to read from a member of the write set, the core will receive an RFO but realise that it is part of the write set, aborting the transaction. If another core wants to write to the read set then it will receive a snoop, aborting the transaction. I know that I haven't got round to replying to 2 or 3 of my questions but I'll get there. I'm about to pose a question about TSX that hasn't been elaborated on. – Lewis Kelsey Oct 15 '19 at 01:32
  • Then again, it does talk about an 'elision buffer', which I'd presume is just a buffer keeping track of the lines that are part of a particular read and write set – Lewis Kelsey Oct 15 '19 at 01:37
  • @LewisKelsey: yes, this answer links to David Kanter's posts about TSX on Haswell which confirm that L1d holds the read / write sets. Presumably it's still the same on Skylake and Ice Lake. – Peter Cordes Oct 15 '19 at 01:37
  • *core will receive a __snoop__ but realise it is part of the write set ...If another core wants to write to the read set then it will receive an __RFO__*. Got it round the wrong way because I changed the positions of read and write but didn't switch snoop and RFO. Also, if it reads and it has a shared/modified line then it doesn't have to snoop obviously. If it writes to a line that is modified before the transaction begins then the write could be lost so I'd assume xacquire writes back modified lines so the other cores get the correct value if it rolls back and rejects the RFOs / snoops – Lewis Kelsey Oct 15 '19 at 02:20
  • @LewisKelsey: Oh, interesting point about where it keeps the old state for possible rollback. Yeah if a Modified line is the only copy of the pre-transaction state, that would be a problem. Otherwise you'd think that would be the ideal case. Hmm, what if the store buffer is involved to actually buffer the new data, and cache tags just determine whether to discard that or commit it all? Then you never have to roll back L1d itself, just take a cache-lock on lines that are accessed while you commit the transaction. (stop responding to Invalidate / share-requests for them, like for `lock add`). – Peter Cordes Oct 15 '19 at 02:29
  • There are 2 possibilities. 1) Instead of writing back all modified lines, I'm not sure if it is possible to write a modified line back (if a CLWB could be generated) before writing to it and including it in the write set. I'm not sure, but I'd imagine snoops and RFOs could be rejected and the L3 slice controller (Cbox or whatever) would then use the L3 copy (if core writebacks are indeed cached there and it doesn't immediately write back until it needs to, I forgot) or send a request to the IMC/PCIe logic if not. – Lewis Kelsey Oct 15 '19 at 03:14
  • 2) I assume the 'elision buffer' the manuals mention stores the addresses of the read and write set lines. Maybe it buffers write data as well and then you only need to lock the lines, and when a snoop / RFO fails it doesn't get rejected but pends until the lock is taken off after the restoration of previous values. Maybe the eviction buffer borrows from the store buffer but it could be separate. The store buffer only has 60 or so entries, which may prove to be more or less than the L1d cache can manage before eviction and potential abortion depending on what pages it writes to. – Lewis Kelsey Oct 15 '19 at 03:14
  • @LewisKelsey: IDK how realistic it is to have a transaction that doesn't fit in the store buffer. If we know that's not the upper limit (e.g. write to more than 72 lines so we know that store coalescing can't have happened), then that rules out that hypothesis. BTW, remember that SKX's L3 is not inclusive. Normally it would be able to hold stuff that an L1d just wrote back. But there's no guarantee it can handle a bunch of lines quickly because it doesn't have ways in its sets reserved. I don't think your idea requires inclusive L3 for correctness but I haven't thought much about TSX. – Peter Cordes Oct 15 '19 at 03:38
  • Maybe it holds both the speculative write set (which may be rolled back), and the current copies of those lines (the rollback state) in the L1D, e.g. in different ways of the same set. – BeeOnRope Oct 16 '19 at 19:05