I want to add two 32-bit unsigned integers in CUDA PTX and I also want to take care of the carry propagation. I am using the code below to do that, but the result is not as expected.
Acording to the documentation, the add.cc.u32 d, a, b
performs integer addition and writes the carry-out value into the condition code register, that is CC.CF
.
On the other hand, addc.cc.u32 d, a, b
performs integer addition with carry-in and writes the carry-out value into the condition code register. The semantics of this instruction would be
d = a + b + CC.CF
. I also tryed addc.u32 d, a, b
with no difference.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cuda.h>
typedef unsigned int u32;
#define TRY_CUDA_CALL(x) \
do \
{ \
cudaError_t err; \
err = x; \
if(err != cudaSuccess) \
{ \
printf("Error %08X: %s at %s in line %d\n", err, cudaGetErrorString(err), __FILE__, __LINE__); \
exit(err); \
} \
} while(0)
__device__ u32
__uaddo(u32 a, u32 b) {
u32 res;
asm("add.cc.u32 %0, %1, %2; /* inline */ \n\t"
: "=r" (res) : "r" (a) , "r" (b));
return res;
}
__device__ u32
__uaddc(u32 a, u32 b) {
u32 res;
asm("addc.cc.u32 %0, %1, %2; /* inline */ \n\t"
: "=r" (res) : "r" (a) , "r" (b));
return res;
}
__global__ void testing(u32* s)
{
u32 a, b;
a = 0xffffffff;
b = 0x2;
s[0] = __uaddo(a,b);
s[0] = __uaddc(0,0);
}
int main()
{
u32 *s_dev;
u32 *s;
s = (u32*)malloc(sizeof(u32));
TRY_CUDA_CALL(cudaMalloc((void**)&s_dev, sizeof(u32)));
testing<<<1,1>>>(s_dev);
TRY_CUDA_CALL( cudaMemcpy(s, s_dev, sizeof(u32), cudaMemcpyDeviceToHost) );
printf("s = %d;\n",s[0]);
return 1;
}
As far as I know, you get a carry if the result doesn't fit in the variable, which happens here and an overflow if the sign bit is corrupted, but I'm working with unsigned values.
The code above tries to add 0xFFFFFFFF
to 0x2
and of course the result won't fit on 32-bit, so why I don't get a 1 after __uaddc(0,0)
call?
EDIT
Nvidia Geforce GT 520mx
Windows 7 Ultimate, 64-bit
Visual Studio 2012
CUDA 7.0