The PTX instruction prmt exposes the functionality of the machine instruction PRMT. The default mode of the prmt instruction is used when none of the special modes .f4e, .b4e, .rc8, .ecl, .ecr, .rc16 is specified.
The default mode has two per-byte sub-modes, controlled by the most significant bit of the 4-bit selector field for each destination byte; the low three bits of each selector pick one of the eight source bytes. The commonly used sub-mode has the msb of the selector field set to zero, which means the destination byte is copied verbatim from the specified source byte. This sub-mode is exposed via the device function intrinsic __byte_perm(), and is generally used to extract, insert, and permute bytes or perform bit shifts by multiples of 8. Example usage can be seen in this answer.
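By way of illustration (a sketch of my own, not part of the original answer; the function names byte_reverse and funnel_shift_right_16 are mine), two common idioms built on this intrinsic:

__device__ uint32_t byte_reverse (uint32_t x)
{
    // selector 0x0123: destination byte 0 takes source byte 3, destination
    // byte 1 takes source byte 2, and so on, reversing the byte order of x
    return __byte_perm (x, 0, 0x0123);
}

__device__ uint32_t funnel_shift_right_16 (uint32_t lo, uint32_t hi)
{
    // selector 0x5432: extract the middle four bytes of the 64-bit quantity
    // {hi,lo}, equivalent to shifting the byte pair right by 16 bits
    return __byte_perm (lo, hi, 0x5432);
}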
The other sub-mode is special: instead of copying the entire source byte, it replicates the most significant bit of the specified source byte across the destination byte. For this, the msb of the selector field needs to be set to one. There is no intrinsic for this sub-mode, so programmers must use PTX inline assembly to access it.
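In isolation, accessing it looks roughly as follows (a minimal sketch of my own; the worked example further down packages the same idea as masked_sign_to_byte_mask()):

__device__ uint32_t msb_to_byte_mask (uint32_t a)
{
    uint32_t r;
    // selector 0xba98: nibbles 0x8, 0x9, 0xa, 0xb select source bytes 0 .. 3,
    // each with the nibble's msb set to request sign replication
    asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(a));
    return r;
}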
I did not design the GPU hardware, so I cannot speak to why that sub-mode was implemented. It is generally useful when the msb of each byte serves as a boolean value that one needs to convert into a mask for the whole byte. This in turn is generally useful for byte-wise processing within a 32-bit register. Note that CUDA includes a good many device function intrinsics for such processing, and disassembly will confirm that the msb replication sub-mode of the prmt default mode is used for many of those.
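As an aside (my addition; device code only): the operation emulated below is also available as the ready-made SIMD-in-register intrinsic __vaddss4(), so on the GPU one could simply write:

__global__ void builtin_paddsb (uint32_t a, uint32_t b)
{
    // byte-wise addition with signed saturation via the CUDA SIMD intrinsic
    printf ("built-in: %08x\n", __vaddss4 (a, b));
}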
A fully worked example, emulation of the paddsb operation (byte-wise addition with signed saturation), is shown below. Note the use of prmt with msb replication inside masked_sign_to_byte_mask().
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#if (__CUDACC__)
#define __HOST__ __host__
#define __DEVICE__ __device__
#else // __CUDACC__
#define __HOST__
#define __DEVICE__
#endif // __CUDACC__
#define MSB_MASK (0x80808080U) // mask for msb of each byte
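// Background on the lop3 helpers below: the 8-bit immediate of lop3.b32 is
// the truth table of the desired three-input function F, obtained by
// evaluating F (0xF0, 0xCC, 0xAA). For example, (0xF0 ^ 0xCC) & ~0xAA = 0x14.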
// r = (a ^ b) & ~c
__HOST__ __DEVICE__ uint32_t lop3_14 (uint32_t a, uint32_t b, uint32_t c)
{
    uint32_t r;
#if (__CUDA_ARCH__ >= 500)
    asm ("lop3.b32 %0,%1,%2,%3,0x14;\n\t" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else // __CUDA_ARCH__
    r = (a ^ b) & ~c;
#endif // __CUDA_ARCH__
    return r;
}
// r = (a ^ b) & c
__HOST__ __DEVICE__ uint32_t lop3_28 (uint32_t a, uint32_t b, uint32_t c)
{
    uint32_t r;
#if (__CUDA_ARCH__ >= 500)
    asm ("lop3.b32 %0,%1,%2,%3,0x28;\n\t" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else // __CUDA_ARCH__
    r = (a ^ b) & c;
#endif // __CUDA_ARCH__
    return r;
}
// r = a ^ (~b & c)
__HOST__ __DEVICE__ uint32_t lop3_d2 (uint32_t a, uint32_t b, uint32_t c)
{
    uint32_t r;
#if (__CUDA_ARCH__ >= 500)
    asm ("lop3.b32 %0,%1,%2,%3,0xd2;\n\t" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else // __CUDA_ARCH__
    r = a ^ (~b & c);
#endif // __CUDA_ARCH__
    return r;
}
// r = (a & c) | (b & ~c)
__HOST__ __DEVICE__ uint32_t lop3_e4 (uint32_t a, uint32_t b, uint32_t c)
{
    uint32_t r;
#if (__CUDA_ARCH__ >= 500)
    asm ("lop3.b32 %0,%1,%2,%3,0xe4;\n\t" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else // __CUDA_ARCH__
    r = (a & c) | (b & ~c);
#endif // __CUDA_ARCH__
    return r;
}
__HOST__ __DEVICE__ uint32_t masked_sign_to_byte_mask (uint32_t a)
{
#if (__CUDA_ARCH__ >= 200)
    asm ("prmt.b32 %0,%0,0,0xba98;" : "+r"(a)); // convert MSBs to masks
#else
    a = a & MSB_MASK;
    a = a + a - (a >> 7); // extend MSBs to full byte to create mask
#endif
    return a;
}
__HOST__ __DEVICE__ uint32_t masked_select (uint32_t a, uint32_t b, uint32_t m)
{
#if (__CUDA_ARCH__ >= 500)
    return lop3_e4 (a, b, m);
#elif 0
    return (((a)&(m))|((b)&(~(m)))); // straightforward mux; kept for reference
#else
    return ((((a)^(b))&(m))^(b)); // same mux using three logic operations
#endif
}
/*
my_paddsb() performs byte-wise addition with signed saturation. In the
case of overflow, positive results are clamped at 127, while negative
results are clamped at -128.
*/
__HOST__ __DEVICE__ uint32_t my_paddsb (uint32_t a, uint32_t b)
{
    uint32_t sum, res, ofl, sga, msk;
    res = (a & ~MSB_MASK) + (b & ~MSB_MASK); // per-byte sum of low 7 bits; no carry crosses byte boundaries
    sum = a ^ b;                         // msb of each byte is 0 iff a, b have the same sign
    ofl = lop3_14 (res, a, sum); // ofl = (res ^ a) & ~sum: signed overflow flag in msb of each byte
    sga = masked_sign_to_byte_mask (a); // sign(a)-mask
    msk = masked_sign_to_byte_mask (ofl);// overflow-mask
    res = lop3_d2 (res, ~MSB_MASK, sum); // res = res ^ (MSB_MASK & sum)
    sga = lop3_28 (sga, ~MSB_MASK, msk); // sga = (sga ^ ~MSB_MASK) & msk
    res = masked_select (sga, res, msk); // res = (sga & msk) | (res & ~msk)
    return res;
}
__global__ void kernel (uint32_t a, uint32_t b)
{
    printf ("GPU: %08x\n", my_paddsb (a, b));
}
int main (void)
{
    uint32_t a = 0x12ef70a0;
    uint32_t b = 0x34cd6090;
    kernel<<<1,1>>>(a, b);
    cudaDeviceSynchronize();
    printf ("CPU: %08x\n", my_paddsb (a, b));
    return EXIT_SUCCESS;
}
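The test vector covers all four cases, one per byte: 0x12 + 0x34 = 0x46 (positive, no saturation), 0xef + 0xcd = 0xbc (negative, no saturation), 0x70 + 0x60 saturates to 0x7f, and 0xa0 + 0x90 saturates to 0x80. Built with nvcc (the file name below is my choice), both lines should therefore print 46bc7f80:

nvcc -o paddsb paddsb.cu
./paddsb
GPU: 46bc7f80
CPU: 46bc7f80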