my implementation is that i am using the z-order curve to traverse the entries of each matrix block. this implementation is resulting in 3x speedup than the naive approach (see my code below).
i want to achieve a better speedup by traversing the entries of each block in row-major order, and using the z-order curve to define the order by which matrix blocks are visited.
how would i go about implemeting that??
`int Compact1By1(int x)
{
x &= 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
x = (x ^ (x >> 1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
x = (x ^ (x >> 2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
x = (x ^ (x >> 4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
x = (x ^ (x >> 8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
return x;
}
`
`void optimized(int *src, int *dst, int SIZE)
{
int blocksize = 64;
int x=0;
int y=0;
int tmp = 0;
for (int i = 0; i < SIZE; ++i) {
for ( int j = 0; j < SIZE; j += blocksize) {
tmp = i*SIZE + j;
for(int z = tmp; z < tmp + blocksize ; ++z){
x=Compact1By1(z >> 0);
y=Compact1By1(z >> 1);
dst[(y * SIZE) + x] = src[(x * SIZE) + y];
}
}
}
}
`