Just for the sake of learning this, I'm trying to grasp how to use HLE prefixes XACQUIRE
and XRELEASE
. After reading the Intel documentation, my understanding was that after executing an instruction with the XACQUIRE
prefix the CPU enters into some sort of a write lock until the instruction with the XRELEASE
prefix. So I wrote the following test code to see if I'm correct. Well, there's still something that I don't understand because my code sample fails.
So can someone tell me what am I missing with those HLE prefixes?
Two fails:
The
xtest
instruction reports that HLE was not enabled, andBecause my assumed "mutex-ed" code doesn't run as a mutex, it fails concurrency.
Next is the Windows C++ project, compiled with VS 2017 with x64 .asm file as follows:
.code
testCPUID PROC
push rbx
; CPUID.07h.EBX.HLE[bit 4]==1
mov eax, 7h
xor ecx, ecx
cpuid
and rbx, 1 shl 4
mov rax, rbx
pop rbx
ret
testCPUID ENDP
testHLEWrite PROC
; RCX = pointer to TST91 struct:
; void* pPtrToNextWrite;
; int nNextValue;
; void* pCutoffPtr;
; void* pBeginPtr;
xor edx, edx
xacquire xchg [rcx], rdx ; I'm assuming that this will work as a mutex ...
xtest ; Sanity check to see if HLE got enabled?
jnz lbl_00 ; If HLE is on => ZF=0
int 3 ; we get here if HLE did not get enabled
lbl_00:
; Do some nonsensical stuff
; The idea is to write sequential values into a shared array
; to see if the lock above holds
; Format:
; > --16 sequential bytes-- <
mov r8d, dword ptr [rcx + 8]
mov byte ptr [rdx], '>'
inc rdx
; Write 16 sequential bytes
mov rax, 10h
lbl_01:
mov byte ptr [rdx], r8b
inc r8
inc rdx
dec rax
jnz lbl_01
mov byte ptr [rdx], '<'
inc rdx
cmp rdx, [rcx + 10h] ; check if reached the end of buffer
jb lbl_02
mov rdx, [rcx + 18h] ; reset ptr to the beginning of buffer
lbl_02:
mov dword ptr [rcx + 8], r8d
xrelease mov [rcx], rdx ; this will release the mutex
ret
testHLEWrite ENDP
testHLEForCorrectness PROC
; RCX = pointer to TST91 struct:
; void* pPtrToNextWrite;
; int nNextValue;
; void* pCutoffPtr;
; void* pBeginPtr;
xor edx, edx
xacquire xchg [rcx], rdx ; I'm assuming that this will work as a mutex ...
xtest ; Sanity check to see if HLE got enabled?
jnz lbl_00 ; If HLE is on => ZF=0
int 3 ; we get here if HLE did not get enabled
lbl_00:
mov r9, [rcx + 18h]
lbl_repeat:
cmp r9, rdx
jae lbl_out
cmp byte ptr [r9], '>'
jnz lbl_bad
cmp byte ptr [r9 + 1 + 10h], '<'
jnz lbl_bad
mov r8b, byte ptr [r9 + 1]
sub eax, eax
lbl_01:
cmp [r9 + rax + 1], r8b
jnz lbl_bad
inc rax
inc r8
cmp rax, 10h
jb lbl_01
add r9, 2 + 10h
jmp lbl_repeat
lbl_out:
xrelease mov [rcx], rdx ; this will release the mutex
ret
lbl_bad:
; Verification failed
int 3
testHLEForCorrectness ENDP
END
And this is how it's called from the user-mode C++ project:
#include <assert.h>
#include <Windows.h>
struct TST91{
BYTE* pNextWrite;
int nNextValue;
BYTE* pCutoffPtr;
BYTE* pBeginPtr;
};
extern "C" {
BOOL testCPUID(void);
void testHLEWrite(TST91* p);
void testHLEForCorrectness(TST91* p);
};
DWORD WINAPI ThreadProc01(LPVOID lpParameter);
TST91* gpStruct = NULL;
BYTE* gpMem = NULL; //Its size is 'gszcbMemSize' BYTEs
const size_t gszcbMemSize = 0x1000 * 8;
int main()
{
if(testCPUID())
{
gpStruct = new TST91;
gpMem = new BYTE[gszcbMemSize];
gpStruct->pNextWrite = gpMem;
gpStruct->nNextValue = 1;
gpStruct->pBeginPtr = gpMem;
gpStruct->pCutoffPtr = gpMem + gszcbMemSize - 0x100;
for(int t = 0; t < 5; t++)
{
CloseThread(CreateThread(NULL, 0,
ThreadProc01, (VOID*)(1LL << t), 0, NULL));
}
_gettch();
delete gpStruct;
delete[] gpMem;
}
else
_tprintf(L"Your CPU doesn't support HLE\n");
return 0;
}
DWORD WINAPI ThreadProc01(LPVOID lpParameter)
{
if(!SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)lpParameter))
{
assert(NULL);
}
for(;;)
{
testHLEWrite(gpStruct);
testHLEForCorrectness(gpStruct);
}
return 0;
}