I am testing out SSE on a variant of Zip Decryption. However, the unoptimized code is performing better.
Running the compiler with the parameters: -msse4 -O3 results in the following benchmarks:-
Normal Test: 0.275, SSE Test: 0.655
I tried increasing the loop counter, but the benchmarks did not change much. What optimizations are being done by the compiler? Should we not be able to match it using SSE?
Edit: Used wall time as suggested by Jens, increased loop iterations and fixed printf format.
#include <stdio.h>
#include <stdint.h>
#include <smmintrin.h>
#include <time.h>
// Windows
#ifdef _WIN32
#include <Windows.h>
double get_wall_time()
{
LARGE_INTEGER time,freq;
if (!QueryPerformanceFrequency(&freq))
{
// Handle error
return 0;
}
if (!QueryPerformanceCounter(&time))
{
// Handle error
return 0;
}
return (double)time.QuadPart / freq.QuadPart;
}
double get_cpu_time()
{
FILETIME a,b,c,d;
if (GetProcessTimes(GetCurrentProcess(),&a,&b,&c,&d) != 0)
{
// Returns total user time.
// Can be tweaked to include kernel times as well.
return
(double)(d.dwLowDateTime |
((unsigned long long)d.dwHighDateTime << 32)) * 0.0000001;
}
else
{
// Handle error
return 0;
}
}
// Posix/Linux
#else
#include <sys/time.h>
double get_wall_time()
{
struct timeval time;
if (gettimeofday(&time,NULL))
{
// Handle error
return 0;
}
return (double)time.tv_sec + (double)time.tv_usec * .000001;
}
double get_cpu_time()
{
return (double)clock() / CLOCKS_PER_SEC;
}
#endif
static void test_sse()
{
double start = get_wall_time();
uint64_t sum = 0;
uint32_t nk0 = 0x12345678;
uint32_t nk1 = 0x23456789;
uint32_t nk2 = 0x34567890;
uint32_t nk3 = 0x45678901;
__m128i mask = _mm_set1_epi32(0xff);
uint64_t i;
for(i = 0; i < 100000000; i++)
{
uint32_t newKeys[] = {nk0, nk1, nk2, nk3};
__m128i *nk_sse = (__m128i*)(&newKeys);
__m128i opa = _mm_and_si128(*nk_sse, mask);
__m128i opr8 = _mm_srai_epi32 (*nk_sse, 8);
__m128i opr16 = _mm_srai_epi32 (*nk_sse, 16);
__m128i opr24 = _mm_srai_epi32 (*nk_sse, 24);
__m128i oprsum = _mm_add_epi32(_mm_add_epi32(_mm_add_epi32(opa, _mm_and_si128(opr8, mask)), _mm_and_si128(opr16, mask)), _mm_and_si128(opr24, mask));
uint32_t* oprsum_ptr = (uint32_t*)(&oprsum);
uint32_t sum_sse = oprsum_ptr[0] + oprsum_ptr[1] + oprsum_ptr[2] + oprsum_ptr[3];
sum += sum_sse;
nk0--;
nk1--;
nk2--;
nk3--;
}
double end = get_wall_time();
double ms = end - start;
printf("SSE Test - Sum: %lu, ms: %f\n", sum, ms);
}
static void test()
{
double start = get_wall_time();
uint64_t sum = 0;
uint32_t nk0 = 0x12345678;
uint32_t nk1 = 0x23456789;
uint32_t nk2 = 0x34567890;
uint32_t nk3 = 0x45678901;
uint64_t i;
for(i = 0; i < 100000000; i++)
{
uint8_t res0 = (uint8_t) (nk0 & 0xff);
uint8_t res1 = (uint8_t) (nk0 >> 8);
uint8_t res2 = (uint8_t) (nk0 >> 16);
uint8_t res3 = (uint8_t) (nk0 >> 24);
uint8_t res4 = (uint8_t) (nk1 & 0xff);
uint8_t res5 = (uint8_t) (nk1 >> 8);
uint8_t res6 = (uint8_t) (nk1 >> 16);
uint8_t res7 = (uint8_t) (nk1 >> 24);
uint8_t res8 = (uint8_t) (nk2 & 0xff);
uint8_t res9 = (uint8_t) (nk2 >> 8);
uint8_t res10 = (uint8_t) (nk2 >> 16);
uint8_t res11 = (uint8_t) (nk2 >> 24);
uint8_t res12 = (uint8_t) (nk3 & 0xff);
uint8_t res13 = (uint8_t) (nk3 >> 8);
uint8_t res14 = (uint8_t) (nk3 >> 16);
uint8_t res15 = (uint8_t) (nk3 >> 24);
sum += res0 + res1+ res2 + res3 + res4 + res5 + res6 + res7 + res8 + res9
+ res10 + res11 + res12 + res13 + res14 + res15;
nk0--;
nk1--;
nk2--;
nk3--;
}
double end = get_wall_time();
double ms = end - start;
printf("Normal Test - Sum: %lu, ms: %f\n", sum, ms);
}
int main (int argc, char **argv)
{
test();
test_sse();
return 0;
}