Thank you very much, I am trying to optimizing the Kasumi algorithm written in C. There has S-box in FI function which uses to encrypt the data, S7-box has 127 elements and S9-box has 512 elements. the FI function code like:
static u16 FI(u16 in, u16 subkey)
{
static u16 s7[] = {...};
static u16 s9[] = {...};
nine = (u16)(in>>7);
seven = (u16)(in&0x7F);
/* Now run the various operations */
nine = (u16)(S9[nine] ^ seven);
seven = (u16)(S7[seven] ^ (nine & 0x7F));
seven ^= (subkey>>9);
nine ^= (subkey&0x1FF);
nine = (u16)(S9[nine] ^ seven);
seven = (u16)(S7[seven] ^ (nine & 0x7F));
in = (u16)((seven<<9) + nine);
return( in );
}
u16 represents unsigned short.
By some transformation. I merge S7-box and S9-box to S16-box, and I use avx instruction to make 16 data parallel. the code of FI function like:
static u16 FI(__m256i in, u16 subkey)
{
u16 arr[16];
_mm256_store_si256((__m256i*)arr, in);
u8 i;
for(i = 0; i < 16; i++)
{
arr[i] = (u16)(s16[arr[i]] ^ subkey);
arr[i] = (arr[i] << 7) | (arr[i] >> 9);
arr[i] = s16[arr[i]];
}
in = _mm256_load_si256((__m256i*)arr);
}
S16-box has 65536 elements, so maybe some cache miss will happen. I also use gather instruction like:
inline static __m256i FI( __m256i in, u16 subkey )
{
__m256i _tmp = _mm256_set1_epi32(0xffff);
__m256i even_sequence = _mm256_and_si256(in, _tmp);
__m256i odd_sequence = _mm256_srli_epi32(in, 16);
even_sequence = _mm256_i32gather_epi32((int const*)s16, even_sequence, 2);
__m256i _subkey = _mm256_set1_epi16(subkey);
even_sequence = _mm256_xor_si256(even_sequence, _subkey);
even_sequence = _mm256_and_si256(even_sequence, _tmp);
odd_sequence = _mm256_i32gather_epi32((int const*)s16, odd_sequence, 2);
odd_sequence = _mm256_xor_si256(odd_sequence, _subkey);
odd_sequence = _mm256_and_si256(odd_sequence, _tmp);
// rotate
__m256i hi = _mm256_slli_epi16(even_sequence, 7);
__m256i lo = _mm256_srli_epi16(even_sequence, 9);
even_sequence = _mm256_or_si256(hi, lo);
//same for odd
hi = _mm256_slli_epi16(odd_sequence, 7);
lo = _mm256_srli_epi16(odd_sequence, 9);
odd_sequence = _mm256_or_si256(hi, lo);
even_sequence = _mm256_i32gather_epi32((int const*)s16, even_sequence, 2);
odd_sequence = _mm256_i32gather_epi32((int const*)s16, odd_sequence, 2);
even_sequence = _mm256_and_si256(even_sequence, _tmp);
odd_sequence = _mm256_slli_epi32(odd_sequence, 16);
in = _mm256_or_si256(even_sequence, odd_sequence);
return in;
}
but the performance cannot meet requirements, I also think about the bit-slice. I read a paper which can make 128 datas parallel but need some hardware support. i think bit tranpose operation is time-consuming and there are many constraint.
Thank you very much!