CUDA has a __byte_perm() intrinsic that maps directly to the PRMT instruction at the machine code (SASS) level. PRMT is a byte-wise permute instruction that can be used to extract and merge bytes efficiently. To effect a byte-wise left rotation, we can double up each byte, shift the resulting byte pairs left by the desired amount, then extract and merge the high byte of each pair.
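To make the selector values used below easier to follow, here is a minimal host-side sketch of the byte-selection behavior of __byte_perm() in its default mode (it ignores the MSB-replication feature, which is not needed here); the name emu_byte_perm is just an illustrative stand-in, not a CUDA API:

#include <stdint.h>

/* illustrative stand-in: result byte i is taken from the 8-byte pool {y,x},
   where bytes 0-3 come from x and bytes 4-7 from y, as chosen by the i-th
   selector nibble of s (only its low 3 bits are used in this sketch) */
static uint32_t emu_byte_perm (uint32_t x, uint32_t y, uint32_t s)
{
    uint64_t pool = ((uint64_t)y << 32) | x;
    uint32_t r = 0;
    for (int i = 0; i < 4; i++) {
        uint32_t sel = (s >> (4 * i)) & 7;   /* source byte index 0..7 */
        r |= ((uint32_t)(pool >> (8 * sel)) & 0xff) << (8 * i);
    }
    return r;
}

With this picture, selector 0x1100 duplicates the two low bytes into {b1,b1,b0,b0}, selector 0x3322 duplicates the two high bytes into {b3,b3,b2,b2}, and selector 0x7531 keeps the high byte of each doubled pair, which after the shift is exactly the rotated byte.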
For byte-wise rotation, we only need the lowest three bits of the shift amount, since a rotation by s is the same as a rotation by s mod 8. For efficiency, it is best to avoid integer types comprising fewer than 32 bits, because C++ semantics require integer types narrower than int to be widened to int before use in expressions; this can and does incur conversion overhead on many architectures, including GPUs.
The throughput of the PRMT instruction is architecture dependent, so using __byte_perm() may result in code that is faster or slower than the classical SIMD-in-a-register method demonstrated in another answer. Be sure to benchmark in the context of your use case prior to deployment.
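For reference, a SIMD-in-a-register (SWAR) variant might look roughly like the sketch below; this is one common formulation, not necessarily the exact code from the other answer:

/* SWAR sketch: per-byte left rotation using only 32-bit operations */
__host__ __device__ uint32_t swar_per_byte_bit_left_rotate (uint32_t input, uint32_t amount)
{
    uint32_t s = amount & 7;
    uint32_t lo_mask = (0xffu >> s) * 0x01010101u;  /* bits that stay inside each byte when shifted left */
    uint32_t hi_mask = ~lo_mask;                    /* bits that wrap around to the low end of each byte */
    return ((input & lo_mask) << s) | ((input & hi_mask) >> (8 - s));
}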
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
__device__ uint32_t per_byte_bit_left_rotate (uint32_t input, uint32_t amount)
{
    // duplicate each byte: l = {b1,b1,b0,b0}, h = {b3,b3,b2,b2}, then shift the pairs
    uint32_t l = __byte_perm (input, 0, 0x1100) << (amount & 7);
    uint32_t h = __byte_perm (input, 0, 0x3322) << (amount & 7);
    // keep the high byte of each shifted pair: these are the rotated bytes
    return __byte_perm (l, h, 0x7531);
}
__global__ void rotl_kernel (uint32_t input, uint32_t amount, uint32_t *res)
{
    *res = per_byte_bit_left_rotate (input, amount);
}
// host-side reference implementation: rotate each byte individually
uint32_t ref_per_byte_bit_left_rotate (uint32_t input, uint32_t amount)
{
    int s = amount & 7;
    uint8_t b0 = (input >>  0) & 0xff;
    uint8_t b1 = (input >>  8) & 0xff;
    uint8_t b2 = (input >> 16) & 0xff;
    uint8_t b3 = (input >> 24) & 0xff;
    b0 = s ? ((b0 << s) | (b0 >> (8 - s))) : b0;
    b1 = s ? ((b1 << s) | (b1 >> (8 - s))) : b1;
    b2 = s ? ((b2 << s) | (b2 >> (8 - s))) : b2;
    b3 = s ? ((b3 << s) | (b3 >> (8 - s))) : b3;
    return ((uint32_t)b3 << 24) | (b2 << 16) | (b1 << 8) | (b0 << 0);
}
// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
#define znew (z=36969*(z&0xffff)+(z>>16))
#define wnew (w=18000*(w&0xffff)+(w>>16))
#define MWC ((znew<<16)+wnew)
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
#define CONG (jcong=69069*jcong+13579) /* 2^32 */
#define KISS ((MWC^CONG)+SHR3)
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                           \
do {                                                                   \
    cudaError_t err = call;                                            \
    if (cudaSuccess != err) {                                          \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                 __FILE__, __LINE__, cudaGetErrorString(err));         \
        exit(EXIT_FAILURE);                                            \
    }                                                                   \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                           \
do {                                                                   \
    /* Check synchronous errors, i.e. pre-launch */                    \
    cudaError_t err = cudaGetLastError();                              \
    if (cudaSuccess != err) {                                          \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                 __FILE__, __LINE__, cudaGetErrorString(err));         \
        exit(EXIT_FAILURE);                                            \
    }                                                                   \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */          \
    err = cudaDeviceSynchronize();                                     \
    if (cudaSuccess != err) {                                          \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                 __FILE__, __LINE__, cudaGetErrorString(err));         \
        exit(EXIT_FAILURE);                                            \
    }                                                                   \
} while (0)
int main (void)
{
    uint32_t arg, ref, res = 0, *res_d = 0;
    uint32_t shft;

    CUDA_SAFE_CALL (cudaMalloc ((void**)&res_d, sizeof(*res_d)));
    for (int i = 0; i < 100000; i++) {
        arg  = KISS;
        shft = KISS;
        ref = ref_per_byte_bit_left_rotate (arg, shft);
        rotl_kernel<<<1,1>>>(arg, shft, res_d);
        CHECK_LAUNCH_ERROR();
        CUDA_SAFE_CALL (cudaMemcpy (&res, res_d, sizeof (res),
                                    cudaMemcpyDeviceToHost));
        if (res != ref) {
            printf ("!!!! arg=%08x shft=%u res=%08x ref=%08x\n",
                    arg, shft, res, ref);
        }
    }
    CUDA_SAFE_CALL (cudaFree (res_d));
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    return EXIT_SUCCESS;
}