If the array size is known at compile time, you could do something like this:
#include <inttypes.h>
#include <malloc.h>
#include <memory.h>
#include <stdio.h>
#include <stdlib.h>
/* Two-step stringification: xstr() lets its argument be macro-expanded
   first, then hands the result to str(), which turns it into a string
   literal. */
#define str(token) #token
#define xstr(token) str(token)
/* Element count used for both arrays and for the unrolled add. */
#define ARRAYSIZE 4
/*
 * Assembler macro AddArray2 p1, p2, from, to
 *
 * Recursive helper that emits one add-with-carry step per 64-bit element
 * for indices `from` through `to` (inclusive):
 *   - loads the quadword at byte offset from*8 from the address in p2 into %rax
 *   - adds it, plus the incoming carry flag, into the quadword at the same
 *     offset from the address in p1
 * It then re-invokes itself with from+1 as long as (to - from) is non-zero.
 *
 * p1 and p2 are substituted textually as the base registers of the memory
 * operands. %rax is overwritten as scratch.
 */
asm(".macro AddArray2 p1, p2, from, to\n\t"
"movq (\\from*8)(\\p2), %rax\n\t"
"adcq %rax, (\\from*8)(\\p1)\n\t"
".if \\to-\\from\n\t" /* more elements remain when to != from */
/* The next index is passed as a quoted expression, presumably so the
   assembler treats it as a single macro argument. */
" AddArray2 \\p1, \\p2, \"(\\from+1)\", \\to\n\t"
".endif\n\t"
".endm\n");
/*
 * Assembler macro AddArray p1, p2, p3
 *
 * Entry point for the unrolled multi-word add. p1 and p2 are substituted
 * as the base registers of the destination and source arrays; p3 is the
 * element count.
 *
 * The first element is added with a plain addq (no incoming carry). When
 * p3-1 is non-zero, the remaining elements 1 .. p3-1 are handled by
 * AddArray2, which uses adcq to propagate the carry.
 * %rax is overwritten as scratch.
 */
asm(".macro AddArray p1, p2, p3\n\t"
"movq (\\p2), %rax\n\t"
"addq %rax, (\\p1)\n\t"
".if \\p3-1\n\t" /* skip the carry chain when there is only one element */
" AddArray2 \\p1, \\p2, 1, (\\p3-1)\n\t"
".endif\n\t"
".endm");
/* Print `label`, then `count` 64-bit words from `words` in hex, each
   followed by a single space. No trailing newline is written. */
static void print_words(const char *label, const uint64_t *words, size_t count)
{
    printf("%s", label);
    for (size_t i = 0; i < count; i++)
    {
        /* PRIx64 is the portable conversion for uint64_t. */
        printf("%" PRIx64 " ", words[i]);
    }
}

/*
 * Demo: add two ARRAYSIZE-element arrays of 64-bit words as one wide
 * integer (element 0 is added first, carry propagates upward) and print
 * the operands, the result, and the final carry.
 *
 * ARRAYSIZE must be at least 1: the AddArray macro always emits the add
 * for element 0.
 *
 * Returns 0 on success, 1 if memory allocation fails.
 */
int main(void)
{
    unsigned char carry;

    // Create the arrays
    uint64_t *anum = (uint64_t *)malloc(ARRAYSIZE * sizeof(uint64_t));
    uint64_t *bnum = (uint64_t *)malloc(ARRAYSIZE * sizeof(uint64_t));
    if (anum == NULL || bnum == NULL)
    {
        fprintf(stderr, "out of memory\n");
        free(anum); /* free(NULL) is a no-op */
        free(bnum);
        return 1;
    }

    // Put some data in: anum is all ones, bnum is the value 1
    memset(anum, 0xff, ARRAYSIZE * sizeof(uint64_t));
    memset(bnum, 0, ARRAYSIZE * sizeof(uint64_t));
    bnum[0] = 1;

    // Print the arrays before the add
    print_words("anum: ", anum, ARRAYSIZE);
    print_words("\nbnum: ", bnum, ARRAYSIZE);
    printf("\n");

    // Add the arrays: anum += bnum, done in place by the AddArray macro.
    // "rax" is listed as clobbered because the macro uses it as scratch;
    // "memory" because the asm reads and writes the arrays through the
    // pointers.
    asm ("AddArray %[anum], %[bnum], " xstr(ARRAYSIZE) "\n\t"
         "setc %[carry]" // Get the flags from the final add
         : [carry] "=q"(carry)
         : [anum] "r" (anum), [bnum] "r" (bnum)
         : "rax", "cc", "memory"
    );

    // Print the result
    print_words("Result: ", anum, ARRAYSIZE);
    printf(": %d\n", carry);

    free(anum);
    free(bnum);
    return 0;
}
This gives code like this:
mov (%rsi),%rax
add %rax,(%rbx)
mov 0x8(%rsi),%rax
adc %rax,0x8(%rbx)
mov 0x10(%rsi),%rax
adc %rax,0x10(%rbx)
mov 0x18(%rsi),%rax
adc %rax,0x18(%rbx)
setb %bpl
Since adding 1 to all f's will completely overflow everything, the output from the code above is:
anum: ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff
bnum: 1 0 0 0
Result: 0 0 0 0 : 1
As written, ARRAYSIZE can be up to about 100 elements (because of the GNU assembler's limit on macro nesting depth). That seems like it should be enough...