5

According to the GCC docs (x86-transactional-memory-intrinsics.html), when a transaction fails or aborts, _xbegin() should return an abort status. However, I find that it sometimes returns 0, and this happens very frequently. In what situations will **_xbegin()** return 0?

After checking the manual, I found that many situations can cause this result: for example, CPUID, SYSCALL, CLFLUSH, etc. However, I don't think my code triggers any of them.

Here is my code. It simulates a small bank in which a randomly chosen account transfers $1 to another account.

#include "immintrin.h"
#include <thread>
#include <unistd.h>
#include <iostream>

using namespace std;

// Number of worker threads running f1; also the length of every per-thread counter array.
#define n_threads 1
// Number of transfer attempts each f1 worker makes before it returns.
#define OPSIZE 1000000000
// One bank account. The type is aligned to 64 bytes, so consecutive array
// elements start 64 bytes apart (presumably one account per cache line, to
// avoid false sharing between transactions — TODO confirm the target's line size).
typedef struct Account{
    long balance;   // running balance, modified inside the transaction in f1
    long number;    // account index, assigned once in main
} __attribute__((aligned(64))) account_t;

// The bank: a heap-allocated array of accounts plus its element count.
typedef struct Bank{
    account_t* accounts;   // array of `size` accounts, allocated in main
    long size;             // number of elements in `accounts`
} bank_t;

// NOTE(review): `done` is not referenced anywhere in the code shown here.
bool done = 0;
// Per-thread statistics arrays, each indexed by worker id and allocated in main:
//   tx       - committed transactions
//   _abort   - every failed _xbegin() attempt
//   capacity / debug / conflict - aborts with the matching _XABORT_* bit set
//   failed   - aborts without _XABORT_RETRY set
//   zero     - aborts whose status word was exactly 0
long *tx, *_abort, *capacity, *debug, *failed, *conflict, *zero;

// Worker thread body: performs OPSIZE transfer attempts between two distinct,
// randomly chosen accounts, each attempt wrapped in an RTM transaction, and
// records the outcome in the per-thread counters at index `id`.
//
//   bank - bank whose accounts are updated
//   id   - index of this worker in the per-thread counter arrays
//
// Always returns NULL.
void* f1(bank_t* bank, int id){
    for(int i=0; i<OPSIZE; i++){ 
        // Pick two different account indices.
        int src = rand()%bank->size;
        int dst = rand()%bank->size;
        while(src == dst){
            dst = rand()%bank->size;
        } 

        // Retry the transaction until it commits or aborts without the retry hint.
        while(true){
            unsigned stat =  _xbegin();
            if(stat == _XBEGIN_STARTED){
                // Transactional region: only the two balance updates.
                bank->accounts[src].balance++;  
                bank->accounts[dst].balance--;
                _xend();
                // Empty asm with a "memory" clobber: a compiler-only barrier.
                asm volatile("":::"memory");    
                tx[id]++;
                break;
            }else{
                // Abort path: execution resumes here with the abort status in `stat`.
                _abort[id]++;

                // No status bit set at all.
                if (stat == 0){
                    zero[id]++;
                }
                if (stat & _XABORT_CONFLICT){
                    conflict[id]++;
                }
                if (stat & _XABORT_CAPACITY){
                    capacity[id]++;
                }
                if (stat & _XABORT_DEBUG){
                    debug[id]++;
                }
                // Retry hint not set: count it as failed and give up on this transfer.
                // NOTE(review): there is no fallback path here, so the balance
                // update for this iteration is simply skipped.
                if ((stat & _XABORT_RETRY) == 0){
                    failed[id]++;
                    break;
                }
                // NOTE(review): the two checks below run only when _XABORT_RETRY
                // is set, because the branch above has already left the loop otherwise.
                if (stat & _XABORT_NESTED){
                    printf("[ PANIC ] _XABORT_NESTED\n");
                    exit(-1);
                }
                if (stat & _XABORT_EXPLICIT){
                    printf("[ panic ] _XBEGIN_EXPLICIT\n");
                    exit(-1);
                }
            }
        }
    }
    return NULL;
}
// Heartbeat thread body: once per second, sums every per-thread counter over
// all workers and prints how much each total grew since the previous tick.
// Never returns. `bank` is accepted but not used.
void* f2(bank_t* bank){
    // Slot of each statistic in the totals arrays, in print order.
    enum { TXS, ABORTS, FAILEDS, CAPACITIES, DEBUGS, CONFLICTS, ZEROS, N_STATS };

    printf("_heartbeat function\n");

    long now[N_STATS] = {0};
    long before[N_STATS] = {0};

    for(;;){
        // Keep the previous totals for the delta, then rebuild the current ones.
        for(int s = 0; s < N_STATS; ++s){
            before[s] = now[s];
            now[s] = 0;
        }

        for(int t = 0; t < n_threads; ++t){
            now[TXS]        += tx[t];
            now[ABORTS]     += _abort[t];
            now[FAILEDS]    += failed[t];
            now[CAPACITIES] += capacity[t];
            now[DEBUGS]     += debug[t];
            now[CONFLICTS]  += conflict[t];
            now[ZEROS]      += zero[t];
        }

        printf("txs\t%ld\taborts\t\t%ld\tfaileds\t%ld\tcapacities\t%ld\tdebugs\t%ld\tconflit\t%ld\tzero\t%ld\n",
            now[TXS] - before[TXS],
            now[ABORTS] - before[ABORTS],
            now[FAILEDS] - before[FAILEDS],
            now[CAPACITIES] - before[CAPACITIES],
            now[DEBUGS] - before[DEBUGS],
            now[CONFLICTS] - before[CONFLICTS],
            now[ZEROS] - before[ZEROS]);

        sleep(1);
    }
}

// Program entry point: builds a bank of zero-balance accounts, starts the
// heartbeat thread (f2) and n_threads transfer workers (f1), then waits for
// the workers to finish. Command-line arguments are ignored. Returns 0.
int main(int argc, char** argv){
    int accounts = 10240;

    bank_t* bank = new bank_t;
    bank->accounts = new account_t[accounts];
    bank->size = accounts;

    for(int i=0; i<accounts; i++){
        bank->accounts[i].number = i;
        bank->accounts[i].balance = 0;
    }

    // Value-initialise (zero) every per-thread counter BEFORE any thread is
    // created. Plain `new long[n]` leaves the elements indeterminate, and the
    // heartbeat thread starts summing them as soon as it is launched, so
    // zeroing them afterwards lets it read garbage on its first ticks.
    tx = new long[n_threads]();
    _abort = new long[n_threads]();
    capacity = new long[n_threads]();
    debug = new long[n_threads]();
    failed = new long[n_threads]();
    conflict = new long[n_threads]();
    zero = new long[n_threads]();

    // The heartbeat loops forever and is never joined; it ends when the
    // process exits on return from main.
    thread* _heartbeat = new thread(f2, bank);

    thread* pid[n_threads];
    for(int i=0; i<n_threads; i++){
        pid[i] = new thread(f1, bank, i);
    }

    for(int i=0; i<n_threads;i++){
        pid[i]->join();
    }
    return 0;
}

Supplements:

  1. All accounts are 64-byte aligned. I printed the addresses of bank->accounts[0] and bank->accounts[1]: 0xf41080 and 0xf410c0.
  2. I am using -O0 and asm volatile("":::"memory"); therefore there are no instruction-reordering problems.
  3. The abort rate increases over time. Here is the result:

    txs     84      aborts          0       faileds 0       capacities      0     debugs  0       conflit 0       zero    0
    txs     17070804      aborts          71      faileds 68      capacities      9       debugs  0       conflit 3       zero    59
    txs     58838         aborts          9516662 faileds 9516661 capacities      0       debugs  0       conflit 1       zero    9516661
    txs     0             aborts          9550428 faileds 9550428 capacities      0       debugs  0       conflit 0       zero    9550428
    txs     0             aborts          9549254 faileds 9549254 capacities      0       debugs  0       conflit 0       zero    9549254
    
  4. Even though n_threads is 1, the result is the same.

  5. If I add a coarse lock as a fallback, as follows, the result seems to be correct.

    // Global fallback lock word: 0 = free, 1 = held. Taken and released with
    // __sync_bool_compare_and_swap by the non-transactional path.
    int fallback_lock;
    
    // Tries to open an RTM transaction for worker `id`.
    // Returns true with the transaction started, or false with fallback_lock
    // held after an abort whose status lacks _XABORT_RETRY. Aborts that carry
    // the retry hint are counted and the attempt is repeated.
    bool
    rtm_begin(int id)
    {
        for(;;) {
            const unsigned status = _xbegin ();
            if(status == _XBEGIN_STARTED) {
                return true;
            }

            // Abort path: update this worker's statistics.
            ++_abort[id];
            if (status == 0) {
                ++zero[id];
            }
            if (status & _XABORT_CONFLICT) {
                ++conflict[id];
            }

            // Retry hint set: try another transaction.
            if (status & _XABORT_RETRY) {
                continue;
            }

            // No retry hint: spin until the fallback lock is ours and report
            // that the caller is running non-transactionally.
            ++failed[id];
            while (!__sync_bool_compare_and_swap(&fallback_lock,0,1)) {
            }
            return false;
        }
    }
    ....
    
    // Enter the critical section: either inside a transaction (true) or
    // holding fallback_lock (false).
    in_rtm = rtm_begin(id);
    // NOTE(review): presumably read so that the lock word is tracked by the
    // transaction; the value is never checked, so a transaction that starts
    // while the lock is held is not explicitly aborted — confirm.
    y = fallback_lock;
    accounts[src].balance--;
    accounts[dst].balance++;
    if (in_rtm){
        // Transactional path: commit.
        _xend();
    }else{
        // Fallback path: release the lock by swapping it from 1 back to 0.
        while(!__sync_bool_compare_and_swap(&fallback_lock, 1, 0)){
        }
    }
    
Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
Waker Leo
  • 129
  • 1
  • 5
  • Hmm. Cursory inspection suggests things are fine (though, I have not run it, and it's been quite some time since I worked actively with TM). The failure with 1 thread seems curious -- is it still failing if you pin the thread to a core? Similarly, the coarse lock result is a little curious. What happens if you back off (spinwait) instead? – Matthew G. Jul 08 '16 at 20:28
  • It fails even when pinning the thread to a core. Someone suggested it could be caused by a TLB miss or a cache miss: because a transaction abort rolls back to its starting point as if the interrupt never happened, the same fault will occur again on the next attempt. – Waker Leo Jul 09 '16 at 10:34
  • That seems immensely awkward. :S Sorry, alas, I think we have exhausted my expertise here! – Matthew G. Jul 12 '16 at 19:44
  • Which processor were you using? TSX is buggy and has been disabled on Haswell, Haswell-E, Haswell-EP and early Broadwell CPUs upon a microcode update. – bit2shift Sep 05 '17 at 20:18

1 Answer

0

The hardware documentation on RTM suggests the following:

The value of EAX can be '0' following an RTM abort. For example, a CPUID instruction when used inside an RTM region causes a transactional abort and may not satisfy the requirements for setting any of the EAX bits. This may result in an EAX value of '0'.

(Here, EAX is the hardware register used to communicate the status, which GCC in turn returns to you as the return value of _xbegin().)

Matthew G.
  • 1,298
  • 10
  • 24
  • Thanks for your answer, but I don't think CPUID or SYSCALL causes the problem. I have re-edited my question and attached the whole code. Can you explain why a coarse fallback lock is needed? – Waker Leo Jul 08 '16 at 05:41
  • 1
    Looks like there are a bunch of things that cause [aborts](http://www.realworldtech.com/haswell-tm/2/). – David Wohlferd Jul 08 '16 at 06:11