Here is one possible approach using __ballot(), as suggested by @tera:
1. In a coalesced fashion, load the data by tiles into shared memory.
2. Do an initial transpose in shared memory during the load, to arrange columns into rows, to facilitate the __ballot() operation.
3. Perform ballots to build the "final" row-wise data out of the column-wise data (that has been stored in rows).
4. Write the data back out.
Steps 1-3 should be fairly efficient on the GPU, at least as best I know how. Step 4 is hampered by the fact that a bit-wise transposition of bit-blocks does not yield a set of 32-bit quantities that are adjacent in memory. Therefore the memory access pattern on the global store is scattered, in the general case.
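As a minimal illustration of the ballot trick in step 3, here is a separate single-warp sketch (not the kernel in the full code below); it assumes an LSB-first bit layout, where bit b of word r holds matrix element (r, b) of a single 32x32 bit tile:

__global__ void tile32_transpose_sketch(const unsigned * __restrict__ in,
                                        const_cast_note: /* none */ unsigned * __restrict__ out){
    // launch with <<<1,32>>>; lane i of the warp holds row i of a 32x32 bit tile
    unsigned myrow = in[threadIdx.x];
    for (int c = 0; c < 32; c++){
        // bit i of the ballot result is bit c of lane i's word, i.e. element (i,c),
        // which is exactly row c of the transposed tile
        unsigned trow = __ballot_sync(0xFFFFFFFFU, myrow & (1U<<c));
        if (threadIdx.x == c) out[c] = trow;  // lane c keeps transposed row c
    }
}

The kernel in the full code below uses the opposite, MSB-first convention (bit 31 of a word is the first bit column, matching the CPU reference function), which is why it reverses the lane index when reading from shared memory and ballots from bit 31 down to bit 0.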
The output indexing operation was the most tedious to visualize/arrange.
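For reference, the store index in the kernel below unpacks as follows (using gridDim.y == idim/32 from the launch configuration):

int indx = (idx*idim) + (gridDim.y*threadIdx.y) + blockIdx.y;
         // = (32*idx + threadIdx.y)*(idim/32) + blockIdx.y

so each in-range thread writes output row 32*idx + threadIdx.y and output word-column blockIdx.y, in a row-major output of idim rows by idim/32 32-bit words.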
Following is a fully-worked example with 4 test cases. For the test cases I also wrote a naive CPU function to perform the "golden" test, borrowing from another idea in the comments under the question.
$ cat t435.cu
#include <stdio.h>
#include <stdlib.h>
#define IDX(d,x,y,ld) d[y*ld+x]
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long dtime_usec(unsigned long start){
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__global__ void bt(const unsigned * __restrict__ in, unsigned * __restrict__ out, const unsigned idim){
// assumes "square" binary matrix transpose
// assumes kernel is launched with 1024 threads per block, 2D, 32x32
// assumes input matrix dimension idim is evenly divisible by 32 bits
// assumes idim is specified in bits
__shared__ unsigned smem[32][33];
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int idy = threadIdx.y+blockDim.y*blockIdx.y;
smem[threadIdx.x][threadIdx.y] = ((idx < idim/32)&&(idy < idim))?IDX(in,idx,idy,idim/32):0;
__syncthreads();
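// each warp (one threadIdx.y value) now gathers the 32 words of one input word-column
// (one 32x32 bit tile); lane x picks up the word for tile row 31-x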
unsigned myval = smem[threadIdx.y][31-threadIdx.x];
__syncthreads();
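// step 3: each ballot collects one bit position from all 32 lanes of the warp,
// producing one 32-bit word (one bit-row) of the transposed tile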
smem[ 0][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<31)&myval));
smem[ 1][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<30)&myval));
smem[ 2][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<29)&myval));
smem[ 3][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<28)&myval));
smem[ 4][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<27)&myval));
smem[ 5][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<26)&myval));
smem[ 6][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<25)&myval));
smem[ 7][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<24)&myval));
smem[ 8][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<23)&myval));
smem[ 9][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<22)&myval));
smem[10][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<21)&myval));
smem[11][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<20)&myval));
smem[12][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<19)&myval));
smem[13][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<18)&myval));
smem[14][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<17)&myval));
smem[15][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<16)&myval));
smem[16][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<15)&myval));
smem[17][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<14)&myval));
smem[18][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<13)&myval));
smem[19][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<12)&myval));
smem[20][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<11)&myval));
smem[21][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<10)&myval));
smem[22][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 9)&myval));
smem[23][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 8)&myval));
smem[24][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 7)&myval));
smem[25][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 6)&myval));
smem[26][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 5)&myval));
smem[27][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 4)&myval));
smem[28][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 3)&myval));
smem[29][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 2)&myval));
smem[30][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 1)&myval));
smem[31][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 0)&myval));
__syncthreads();
int indx = (idx*idim)+(gridDim.y*threadIdx.y)+blockIdx.y;
if ((idx < (idim/32))&&(idy < idim)) out[indx] = smem[threadIdx.y][threadIdx.x];
}
// naive CPU reference ("golden") transpose; bit 31 of a word is the first (leftmost) bit column
void naive_cpu_bt(const unsigned *in, unsigned *out, const unsigned idim){
    memset(out, 0, idim*(idim/32)*sizeof(unsigned));
    for (int i = 0; i < (idim/32); i++)
        for (int j = 0; j < idim; j++){
            for (int bit = 0; bit < 32; bit++){
                if ((in[(j*(idim/32)) + i]>>bit)&1) out[(((i*32)+(31-bit))*(idim/32))+(j/32)] |= 1<<(31 - (j%32));
            }
        }
}
int main(){
unsigned *h_idata, *h_odata, *d_idata, *d_odata, *c_odata;
unsigned idim;
// test case 1, 32x32, upper triangular
idim = 32;
h_idata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
h_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
c_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
cudaMalloc(&d_idata, idim*(idim/32)*sizeof(unsigned));
cudaMalloc(&d_odata, idim*(idim/32)*sizeof(unsigned));
unsigned data = 0x0FFFFFFFFU;
for (int i = 0; i < 32; i++){
h_idata[i] = data;
data>>=1;}
cudaMemcpy(d_idata, h_idata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyHostToDevice);
bt<<<dim3(1,1),dim3(32,32)>>>(d_idata, d_odata, idim);
cudaMemcpy(h_odata, d_odata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyDeviceToHost);
naive_cpu_bt(h_idata, c_odata, idim);
for (int i = 0; i < (idim/32)*idim; i++) if (c_odata[i] != h_odata[i]) {printf("mismatch at %u, was: %u, should be: %u\n", i, h_odata[i], c_odata[i]); return -1;}
for (int i = 0; i < 32; i++) printf("0x%8x\n", h_odata[i]);
free(h_idata);
free(h_odata);
free(c_odata);
cudaFree(d_idata);
cudaFree(d_odata);
// test case 2, 64x64, opposite diagonal
idim = 64;
h_idata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
h_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
c_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
cudaMalloc(&d_idata, idim*(idim/32)*sizeof(unsigned));
cudaMalloc(&d_odata, idim*(idim/32)*sizeof(unsigned));
data = 0x01;
for (int i = 0; i < 32; i++){
h_idata[2*i] = 0; h_idata[(2*i)+1] = data;
h_idata[64+(2*i)] = data; h_idata[65+(2*i)] = 0xFFFFFFFFU;
data<<=1; data++;}
cudaMemcpy(d_idata, h_idata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyHostToDevice);
bt<<<dim3(1,2),dim3(32,32)>>>(d_idata, d_odata, idim);
cudaMemcpy(h_odata, d_odata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyDeviceToHost);
naive_cpu_bt(h_idata, c_odata, idim);
for (int i = 0; i < (idim/32)*idim; i++) if (c_odata[i] != h_odata[i]) { printf("mismatch at %u, was: %u, should be: %u\n", i, h_odata[i], c_odata[i]); return -1;}
for (int i = 0; i < 64; i++) printf("0x%8x 0x%8x\n", h_odata[i*2], h_odata[(i*2)+1]);
free(h_idata);
free(h_odata);
free(c_odata);
cudaFree(d_idata);
cudaFree(d_odata);
// test case 3, 96x96, ones in alternating columns
idim = 96;
h_idata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
h_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
c_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
cudaMalloc(&d_idata, idim*(idim/32)*sizeof(unsigned));
cudaMalloc(&d_odata, idim*(idim/32)*sizeof(unsigned));
data = 0x55555555U;
for (int i = 0; i < idim*(idim/32); i++)
h_idata[i] = data;
cudaMemcpy(d_idata, h_idata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyHostToDevice);
bt<<<dim3(1,3),dim3(32,32)>>>(d_idata, d_odata, idim);
cudaMemcpy(h_odata, d_odata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyDeviceToHost);
naive_cpu_bt(h_idata, c_odata, idim);
for (int i = 0; i < (idim/32)*idim; i++) if (c_odata[i] != h_odata[i]) { printf("mismatch at %u, was: %u, should be: %u\n", i, h_odata[i], c_odata[i]); return -1;}
for (int i = 0; i < 96; i++) printf("0x%8x 0x%8x 0x%8x\n", h_odata[i*3], h_odata[(i*3)+1], h_odata[(i*3)+2]);
free(h_idata);
free(h_odata);
free(c_odata);
cudaFree(d_idata);
cudaFree(d_odata);
// test case 4, 8kx8k random
idim = 8192;
int xblocks = (idim/1024)+((idim%1024)?1:0);
h_idata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
h_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
c_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
cudaMalloc(&d_idata, idim*(idim/32)*sizeof(unsigned));
cudaMalloc(&d_odata, idim*(idim/32)*sizeof(unsigned));
for (int i = 0; i < idim*(idim/32); i++)
h_idata[i] = rand();
unsigned long gt = dtime_usec(0);
cudaMemcpy(d_idata, h_idata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyHostToDevice);
unsigned long gkt = dtime_usec(0);
bt<<<dim3(xblocks,(idim/32)),dim3(32,32)>>>(d_idata, d_odata, idim);
cudaDeviceSynchronize();
gkt = dtime_usec(gkt);
cudaMemcpy(h_odata, d_odata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyDeviceToHost);
gt = dtime_usec(gt);
unsigned long ct = dtime_usec(0);
naive_cpu_bt(h_idata, c_odata, idim);
ct = dtime_usec(ct);
for (int i = 0; i < (idim/32)*idim; i++) if (c_odata[i] != h_odata[i]) { printf("mismatch at %u, was: %u, should be: %u\n", i, h_odata[i], c_odata[i]); return -1;}
printf("gputime: %fms, kerneltime: %fms, cputime: %fms\n", (gt*1000)/(float)USECPSEC, (gkt*1000)/(float)USECPSEC, (ct*1000)/(float)USECPSEC);
printf("kernel bandwidth = %fMB/s\n", (idim*(idim/32)*4*2)/(float)gkt);
printf("Success!\n");
free(h_idata);
free(h_odata);
free(c_odata);
cudaFree(d_idata);
cudaFree(d_odata);
return 0;
}
$ nvcc -arch=sm_61 -o t435 t435.cu
$ ./t435
0x80000000
0xc0000000
0xe0000000
0xf0000000
0xf8000000
0xfc000000
0xfe000000
0xff000000
0xff800000
0xffc00000
0xffe00000
0xfff00000
0xfff80000
0xfffc0000
0xfffe0000
0xffff0000
0xffff8000
0xffffc000
0xffffe000
0xfffff000
0xfffff800
0xfffffc00
0xfffffe00
0xffffff00
0xffffff80
0xffffffc0
0xffffffe0
0xfffffff0
0xfffffff8
0xfffffffc
0xfffffffe
0xffffffff
0x 0 0x 1
0x 0 0x 3
0x 0 0x 7
0x 0 0x f
0x 0 0x 1f
0x 0 0x 3f
0x 0 0x 7f
0x 0 0x ff
0x 0 0x 1ff
0x 0 0x 3ff
0x 0 0x 7ff
0x 0 0x fff
0x 0 0x 1fff
0x 0 0x 3fff
0x 0 0x 7fff
0x 0 0x ffff
0x 0 0x 1ffff
0x 0 0x 3ffff
0x 0 0x 7ffff
0x 0 0x fffff
0x 0 0x 1fffff
0x 0 0x 3fffff
0x 0 0x 7fffff
0x 0 0x ffffff
0x 0 0x 1ffffff
0x 0 0x 3ffffff
0x 0 0x 7ffffff
0x 0 0x fffffff
0x 0 0x1fffffff
0x 0 0x3fffffff
0x 0 0x7fffffff
0x 0 0xffffffff
0x 1 0xffffffff
0x 3 0xffffffff
0x 7 0xffffffff
0x f 0xffffffff
0x 1f 0xffffffff
0x 3f 0xffffffff
0x 7f 0xffffffff
0x ff 0xffffffff
0x 1ff 0xffffffff
0x 3ff 0xffffffff
0x 7ff 0xffffffff
0x fff 0xffffffff
0x 1fff 0xffffffff
0x 3fff 0xffffffff
0x 7fff 0xffffffff
0x ffff 0xffffffff
0x 1ffff 0xffffffff
0x 3ffff 0xffffffff
0x 7ffff 0xffffffff
0x fffff 0xffffffff
0x 1fffff 0xffffffff
0x 3fffff 0xffffffff
0x 7fffff 0xffffffff
0x ffffff 0xffffffff
0x 1ffffff 0xffffffff
0x 3ffffff 0xffffffff
0x 7ffffff 0xffffffff
0x fffffff 0xffffffff
0x1fffffff 0xffffffff
0x3fffffff 0xffffffff
0x7fffffff 0xffffffff
0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
0x 0 0x 0 0x 0
0xffffffff 0xffffffff 0xffffffff
gputime: 5.530000ms, kerneltime: 0.301000ms, cputime: 659.205994ms
kernel bandwidth = 55738.257812MB/s
Success!
$
For the first 3 test cases, I used 3 different input patterns and printed out the results, in addition to doing the CPU validation. For the 4th test case (large random pattern) I skipped the printout but retained the CPU validation.
Also note that I used the __ballot_sync() variant, to be compliant with CUDA 9.
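(On a pre-CUDA-9 toolchain, the corresponding legacy call would be the unsynchronized form, e.g.:

smem[ 0][threadIdx.y] = __ballot((1U<<31)&myval);  // legacy, pre-CUDA-9 intrinsic

and similarly for the other 31 ballot lines.)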
I haven't done any performance analysis. As with most transpose operations, I expect a large part of the performance here to stem from efficient use of global memory. The major inhibitor there would be the scattered writes in step 4 above.
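If you want a rough upper bound to compare the reported kernel bandwidth against, a common yardstick is a device-to-device cudaMemcpy of the same amount of data. A sketch of what that could look like (not part of the test code above; run it after the result has been copied back, since it overwrites d_odata):

unsigned long dt = dtime_usec(0);
cudaMemcpy(d_odata, d_idata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyDeviceToDevice);
cudaDeviceSynchronize();
dt = dtime_usec(dt);
printf("copy bandwidth = %fMB/s\n", (idim*(idim/32)*4*2)/(float)dt);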
Some general background information on matrix transpose in CUDA can be found here, although this bitwise case deviates substantially from it.