achieve GCC cas function for version 4.1.2 and earlier

Question

My new company project, they want the code run for the 32-bit, the compile server is a CentOS 5.0 with GCC 4.1.1, that was the nightmare.
There are lots of functions using in the project like __sync_fetch_and_add was given in GCC 4.1.2 and later.

I was told can not upgrade GCC version, so I have to make another solution after Googling for several hours.

When I wrote a demo to test, I just got the wrong answer, the code blow want to replace function __sync_fetch_and_add

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>

static int count = 0;

int compare_and_swap(int* reg, int oldval, int newval) 
{
    register char result;
#ifdef __i386__
    __asm__ volatile ("lock; cmpxchgl %3, %0; setz %1" 
                     : "=m"(*reg), "=q" (result) 
                     : "m" (*reg), "r" (newval), "a" (oldval) 
                     : "memory");
    return result;
#elif defined(__x86_64__)
    __asm__ volatile ("lock; cmpxchgq %3, %0; setz %1" 
                     : "=m"(*reg), "=q" (result) 
                     : "m" (*reg), "r" (newval), "a" (oldval) 
                     : "memory");
    return result;
#else
    #error:architecture not supported and gcc too old
#endif

}

void *test_func(void *arg)
{
    int i = 0;
    for(i = 0; i < 2000; ++i) {
        compare_and_swap((int *)&count, count, count + 1);
    }

    return NULL;
}

int main(int argc, const char *argv[])
{
    pthread_t id[10];
    int i = 0;

    for(i = 0; i < 10; ++i){
        pthread_create(&id[i], NULL, test_func, NULL);
    }

    for(i = 0; i < 10; ++i) {
        pthread_join(id[i], NULL);
    }
    //10*2000=20000
    printf("%d\n", count);

    return 0;
}

Whent I got the wrong result:

[root@centos-linux-7 workspace]# ./asm
17123
[root@centos-linux-7 workspace]# ./asm
14670
[root@centos-linux-7 workspace]# ./asm
14604
[root@centos-linux-7 workspace]# ./asm
13837
[root@centos-linux-7 workspace]# ./asm
14043
[root@centos-linux-7 workspace]# ./asm
16160
[root@centos-linux-7 workspace]# ./asm
15271
[root@centos-linux-7 workspace]# ./asm
15280
[root@centos-linux-7 workspace]# ./asm
15465
[root@centos-linux-7 workspace]# ./asm
16673

I realize in this line

compare_and_swap((int *)&count, count, count + 1);

count + 1 was wrong!

Then how can I implement the same function as __sync_fetch_and_add. The compare_and_swap function works when the third parameter is constant.

By the way, compare_and_swap function is that right? I just Googled for that, not familiar with assembly.

I got despair with this question.

………………………………………………………………………………………………………………………………………………………………………………………………………………………

after seeing the answer below,I use while and got the right answer,but seems confuse more. here is the code:

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>

static unsigned long  count = 0;

int sync_add_and_fetch(int* reg, int oldval, int incre) 
{
    register char result;
#ifdef __i386__
    __asm__ volatile ("lock; cmpxchgl %3, %0; setz %1" : "=m"(*reg), "=q" (result) : "m" (*reg), "r" (oldval + incre), "a" (oldval) : "memory");
    return result;
#elif defined(__x86_64__)
    __asm__ volatile ("lock; cmpxchgq %3, %0; setz %1" : "=m"(*reg), "=q" (result) : "m" (*reg), "r" (newval + incre), "a" (oldval) : "memory");
    return result;
#else
    #error:architecture not supported and gcc too old
#endif

}


void *test_func(void *arg)
{
    int i=0;
    int result = 0;
    for(i=0;i<2000;++i)
    {
        result = 0;
        while(0 == result)
        {
            result = sync_add_and_fetch((int *)&count, count, 1);
        }
    }

    return NULL;
}

int main(int argc, const char *argv[])
{
    pthread_t id[10];
    int i = 0;

    for(i=0;i<10;++i){
        pthread_create(&id[i],NULL,test_func,NULL);
    }

    for(i=0;i<10;++i){
        pthread_join(id[i],NULL);
    }
    //10*2000=20000
    printf("%u\n",count);

    return 0;
}

the answer goes right to 20000,so i think when you use sync_add_and_fetch function,you should goes with a while loop is stupid,so I write like this:

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>

static unsigned long  count = 0;

int compare_and_swap(int* reg, int oldval, int incre) 
{
    register char result;
#ifdef __i386__
    __asm__ volatile ("lock; cmpxchgl %3, %0; setz %1" : "=m"(*reg), "=q" (result) : "m" (*reg), "r" (oldval + incre), "a" (oldval) : "memory");
    return result;
#elif defined(__x86_64__)
    __asm__ volatile ("lock; cmpxchgq %3, %0; setz %1" : "=m"(*reg), "=q" (result) : "m" (*reg), "r" (newval + incre), "a" (oldval) : "memory");
    return result;
#else
    #error:architecture not supported and gcc too old
#endif

}

void sync_add_and_fetch(int *reg,int oldval,int incre)
{
    int ret = 0;
    while(0 == ret)
    {
       ret = compare_and_swap(reg,oldval,incre);
    }
}

void *test_func(void *arg)
{
    int i=0;
    for(i=0;i<2000;++i)
    {
        sync_add_and_fetch((int *)&count, count, 1);
    }

    return NULL;
}

int main(int argc, const char *argv[])
{
    pthread_t id[10];
    int i = 0;

    for(i=0;i<10;++i){
        pthread_create(&id[i],NULL,test_func,NULL);
    }

    for(i=0;i<10;++i){
        pthread_join(id[i],NULL);
    }
    //10*2000=20000
    printf("%u\n",count);

    return 0;
}

but when i run this code with ./asm after g++ -g -o asm asm.cpp -lpthread.the asm just stuck for more than 5min,see top in another terminal:

3861 root 19 0 102m 888 732 S 400 0.0 2:51.06 asm

I just confused,is this code not the same?

Do matching constraints work for memory operands? You need to make sure that `*reg` uses the same memory location for both the input and output operand, or else gcc will think it can use this to copy a value. Or does gcc4.1 support a `"+m"(*reg)` read-write memory operand? — Peter Cordes, Jul 30 '18 at 21:06
@phuclv compiling a newer gcc version can not run for the centos5.0 with gcc 4.1.1. will be wrong for can't find server functions. — 52coder, Jul 31 '18 at 06:51
@PeterCordes look for the gcc page,does not find gcc4.1 whether support "+m"(*reg) or not,In the code above ,the demo run well with "=m"(*reg). — 52coder, Jul 31 '18 at 06:55
Have you generated a list of the __sync builtins you need, including the types used for each? — Nominal Animal, Jul 31 '18 at 06:58
@NominalAnimal server functions list:__sync_val_compare_and_swap __sync_add_and_fetch __sync_sub_and_fetch ,is there other solution? — 52coder, Jul 31 '18 at 07:53
@52coder: Like I said in my answer, it's easy to write inline asm that works correctly in one test, but can break with other surrounding code. — Peter Cordes, Jul 31 '18 at 16:54
Am I reading this wrong? What if `cmpxchg` fails due to another thread getting there 'first?' You haven't told the compiler that there's any reason to believe that `oldval` is changing (ie no constraints on the asm, no `volatile` qualifier on the variable, etc), so it's not going to see any reason to re-read the current value (in optimized builds). cmpxchg sets eax to the 'new' oldval, but you aren't using it. So it's just going to spin waiting until the next time the counts match. You'll either need to force `sync_add_and_fetch` to re-read, or have compare_and_swap return the new value. — David Wohlferd, Jul 31 '18 at 22:14
@DavidWohlferd when call the function compare_and_swap,I have to use the while loop to make sure it does swap(sync add and so on) — 52coder, Aug 01 '18 at 13:17
Yes, you do have to use a loop. But your loop is wrong. Picture this: You are in test_func and you call sync_add_and_fetch. At the instant you call it, count is 5. You get into sync_add_and_fetch and start calling compare_and_swap using oldval=5. But before you have a chance to cmpxchgq, another thread gets there first, so count is now 6. Since you never update your copy of oldval, you are going to continue to try to use 5, but that's not going to work (until count overflows). When cmpxchgq fails, it tells you what you should start using for oldval (in rax), you just aren't using it. — David Wohlferd, Aug 01 '18 at 21:33
@DavidWohlferd tanks for you answer,I've got your idea.That's why the program got stuck, because I think when you use sync_add_and_fetch function,you should goes with a while loop is stupid,so I write a function to call compare_and_swap and judge the rusult in while loop. — 52coder, Aug 02 '18 at 09:05

score 1 · Answer 1 · answered Jul 30 '18 at 18:35

1

The 64-bit compare_and_swap is wrong as it swaps 64 bits but int is only 32 bits.

compare_and_swap should be used in a loop which retries it until is succeeds.

answered Jul 30 '18 at 18:35

Timothy Baldwin

3,551
1
14
23

update the question after using while loop,still got a trouble,if you have time,please help me see for a moment,thanks. – 52coder Jul 31 '18 at 06:46

Peter Cordes · Answer 2 · 2018-07-30T22:18:23.843

Your result look right to me. lock cmpxchg succeeds most of the time, but will fail if another core beat you to the punch. You're doing 20k attempts to cmpxchg count+1, not 20k atomic increments.

To write __sync_fetch_and_add with inline asm, you'll want to use lock xadd. It's specifically designed to implement fetch-add.

Implementing other operations, like fetch-or or fetch-and, require a CAS retry loop if you actually need the old value. So you could make a version of the function that doesn't return the old value, and is just a sync-and without the fetch, using lock and with a memory destination. (Compiler builtins can make this optimization based on whether the result is needed or not, but an inline asm implementation doesn't get a chance to choose asm based on that information.)

For efficiency, remember that and, or, add and many other instructions can use immediate operands, so a "re"(src) constraint would be appropriate (not "ri" for int64_t on x86-64, because that would allow immediates too large. https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html). But cmpxchg, xadd, and xchg can't use immediates, of course.

I'd suggest looking at compiler output for modern gcc (e.g. on http://godbolt.org/) for functions using the builtin, to see what compilers do.

But beware that inline asm can compile correctly given one set of surrounding code, but not the way you expect given different code. e.g. if the surrounding code copied a value after using CAS on it (probably unlikely), the compiler might decide to give the asm template two different memory operands for "=m"(*reg) and "m"(*reg), but your asm template assumes they will always be the same address.

IDK if gcc4.1 supports it, but "+m"(*reg) would declare a read/write memory operand. Otherwise perhaps you can use a matching constraint to say that the input is in the same location as an earlier operand, like "0"(*reg). But that might only work for registers, not memory, I didn't check.

`"a" (oldval)` is a bug: cmpxchg writes EAX on failure.

It's not ok to tell the compiler you leave a reg unmodified, and then write an asm template that does modify it. You will get unpredictable behaviour from stepping on the compiler's toes.

See c inline assembly getting "operand size mismatch" when using cmpxchg for a safe inline-asm wrapper for lock cmpxchg. It's written for gcc6 flag-output, so you'll have to back-port that and maybe a few other syntax details to crusty old gcc4.1.

That answer also addresses returning the old value so it doesn't have to be separately loaded.

(Using ancient gcc4.1 sounds like a bad idea to me, especially for writing multi-threaded code. So much room for error from porting working code with __sync builtins to hand-rolled asm. The risks of using a newer compiler, like stable gcc5.5 if not gcc7.4, are different but probably smaller.)

If you're going to rewrite code using __sync builtins, the sane thing would be to rewrite it using C11 stdatomic.h, or GNU C's more modern __atomic builtins that are intended to replace __sync.

The Linux kernel does successfully use inline asm for hand-rolled atomics, though, so it's certainly possible.

update the question after using while loop,still got a trouble,if you have time,please help me see for a moment,thanks. — 52coder, Jul 31 '18 at 06:46
@52coder: you don't need a while loop for `sync_fetch_and_add`, use `lock xadd` instead of `lock cmpxchg`. Your function naming now makes no sense; you have a function called `compare_and_swap` which can only add, so it's not a generic compare-and-swap function anymore. — Peter Cordes, Jul 31 '18 at 16:57

Nominal Animal · Answer 3 · 2018-08-01T17:20:36.307

If you truly are in such a predicament, I would start with the following header file:

#ifndef   SYNC_H
#define   SYNC_H
#if defined(__x86_64__) || defined(__i386__)

static inline int  sync_val_compare_and_swap_int(int *ptr, int oldval, int newval)
{
    __asm__ __volatile__( "lock cmpxchgl %[newval], %[ptr]"
                        : "+a" (oldval), [ptr] "+m" (*ptr)
                        : [newval] "r" (newval)
                        : "memory" );
    return oldval;
}

static inline int  sync_fetch_and_add_int(int *ptr, int val)
{
    __asm__ __volatile__( "lock xaddl %[val], %[ptr]"
                        : [val] "+r" (val), [ptr] "+m" (*ptr)
                        :
                        : "memory" );
    return val;
}


static inline int  sync_add_and_fetch_int(int *ptr, int val)
{
    const int  old = val;
    __asm__ __volatile__( "lock xaddl %[val], %[ptr]"
                        : [val] "+r" (val), [ptr] "+m" (*ptr)
                        :
                        : "memory" );
    return old + val;
}

static inline int  sync_fetch_and_sub_int(int *ptr, int val) { return sync_fetch_and_add_int(ptr, -val); }
static inline int  sync_sub_and_fetch_int(int *ptr, int val) { return sync_add_and_fetch_int(ptr, -val); }

/* Memory barrier */
static inline void  sync_synchronize(void) { __asm__ __volatile__( "mfence" ::: "memory"); }

#else
#error Unsupported architecture.
#endif
#endif /* SYNC_H */

The same extended inline assembly works for both x86 and x86-64. Only the int type is implemented, and you do need to replace possible __sync_synchronize() calls with sync_synchronize(), and each __sync_...() call with sync_..._int().

To test, you can use e.g.

#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>
#include "sync.h"

#define  THREADS   16
#define  PERTHREAD 8000

void *test_func1(void *sumptr)
{
    int *const sum = sumptr;
    int        n = PERTHREAD;
    while (n-->0)
        sync_add_and_fetch_int(sum, n + 1);
    return NULL;
}

void *test_func2(void *sumptr)
{
    int *const sum = sumptr;
    int        n = PERTHREAD;
    while (n-->0)
        sync_fetch_and_add_int(sum, n + 1);
    return NULL;
}

void *test_func3(void *sumptr)
{
    int *const sum = sumptr;
    int        n = PERTHREAD;
    int        oldval, curval, newval;
    while (n-->0) {
        curval = *sum;
        do {
            oldval = curval;
            newval = curval + n + 1;
        } while ((curval = sync_val_compare_and_swap_int(sum, oldval, newval)) != oldval);
    }
    return NULL;
}

static void *(*worker[3])(void *) = { test_func1, test_func2, test_func3 };

int main(void)
{
    pthread_t       thread[THREADS];
    pthread_attr_t  attrs;
    int             sum = 0;
    int             t, result;

    pthread_attr_init(&attrs);
    pthread_attr_setstacksize(&attrs, 65536);
    for (t = 0; t < THREADS; t++) {
        result = pthread_create(thread + t, &attrs, worker[t % 3], &sum);
        if (result) {
            fprintf(stderr, "Failed to create thread %d of %d: %s.\n", t+1, THREADS, strerror(errno));
            exit(EXIT_FAILURE);
        }
    }
    pthread_attr_destroy(&attrs);

    for (t = 0; t < THREADS; t++)
        pthread_join(thread[t], NULL);

    t = THREADS * PERTHREAD * (PERTHREAD + 1) / 2;
    if (sum == t)
        printf("sum = %d (as expected)\n", sum);
    else
        printf("sum = %d (expected %d)\n", sum, t);

    return EXIT_SUCCESS;
}

Unfortunately, I don't have an ancient version of GCC to test, so this has only been tested with GCC 5.4.0 and GCC-4.9.3 for x86 and x86-64 (using -O2) on Linux.

If you find any bugs or issues in the above, please let me know in a comment so I can verify and fix as needed.

thanks for your help,when i run your code,there is a mistake.https://stackoverflow.com/questions/10437990/cant-find-a-register-in-class-creg-while-reloading-asm-memcpy-inline-asm — 52coder, Aug 01 '18 at 13:06
[root@localhost ~]# gcc -g -o stackoverflow stackoverflow.c -lpthread sync.h: In function ‘sync_val_compare_and_swap_int’: sync.h:7: error: can't find a register in class ‘AREG’ while reloading ‘asm’ [root@localhost ~]# — 52coder, Aug 01 '18 at 13:06
@52coder: Could you try with `__asm__ __volatile__ ( "lock cmpxchgl %[newval], %[ptr]" : "+a" (oldval) : "a" (oldval), [newval] "r" (newval), [ptr] "m" (*ptr) : "memory" )`? — Nominal Animal, Aug 01 '18 at 16:45
Why would you use `"+r" (val)` (a read/write operand) and then a separate read-only input for `val`? And why not a `"+m"` input for `*ptr`, instead of a read-only input? The memory clobber makes it ok to modify that memory, but it's still weird. See my sync_xchg asm on the OP's other question. : [Achieve window function InterlockedExchange in Linux](https://stackoverflow.com/q/51617338). But yes, that's how you use `lock xadd`. — Peter Cordes, Aug 01 '18 at 16:50
@52coder: It seems that without optimization, gcc doesn't figure out that `"+a" (oldval)` and `"a" (oldval)` can be CSE'd, so it can use EAX for both operands. https://godbolt.org/g/gthsox. As you can see, it also breaks `sync_fetch_and_add_int` because it puts the read-only version in a different register (uses ECX then reads EAX). But with `-O3`, it all works because the compiler sees its the same variable. **Best solution: remove the redundant read-only copy of the operand. https://godbolt.org/g/KTjpbx compiles correctly with and without optimization** (with gcc4.1 on godbolt) — Peter Cordes, Aug 01 '18 at 17:06
And BTW, more efficient `sync_add_int` is possible using `lock add` or `lock sub`, instead of defining it in terms of the `lock xadd` version. But that will work. If you wanted efficiency, you wouldn't be using gcc4.1 — Peter Cordes, Aug 01 '18 at 17:09
@PeterCordes: Too long since last wrote inline asm, that's all. GCC-4.9.3 and later did figure it out correctly, and I tend to err on the explicit side. (Forgot that it suffices to declare i+o in outputs, basically.) Anyway, I do agree wrt. the constraints; thanks for the note! (As to `sync_add_int()`, I only wanted to point out `sync_fetch_and_add()` is preferable over `sync_add_and_fetch()`. Since they weren't used in the test, I removed them.) — Nominal Animal, Aug 01 '18 at 17:27

achieve GCC cas function for version 4.1.2 and earlier

3 Answers3

`"a" (oldval)` is a bug: cmpxchg writes EAX on failure.

Linked

achieve GCC cas function for version 4.1.2 and earlier

3 Answers3

"a" (oldval) is a bug: cmpxchg writes EAX on failure.

Linked

`"a" (oldval)` is a bug: cmpxchg writes EAX on failure.