You can increase the speed of your program by reducing the cache misses: aoffsets[i]
and boffsets[i]
are relatively far away from each other in memory. By placing them next to each other, you speed up the program significantly. On my machine (e5400 cpu, VS2012) the execution time is reduced from 3.0 seconds to 2.3 seconds:
#include <vector>
#include <windows.h>
#include <iostream>
typedef unsigned short uint16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;
#define MATCH_MASK 16383
#define DOUBLE_MATCH_MASK (MATCH_MASK*2)
static const int MATCH_BITS = 14;
static const int MATCH_LEFT = (32 - MATCH_BITS);
#define MATCH_STRIP(x) ((uint32_t)(x) >> MATCH_LEFT)
static const int n_maxoffset = 1000;
uint16_t test(int32_t* a, int asize, int32_t* b, int bsize)
{
uint16_t offsets[DOUBLE_MATCH_MASK] = {0};
auto fn_init_offsets = [](const int32_t* x, int n_size, uint16_t offsets[])->void
{
for (int i = 0; i < n_size; ++i)
offsets[MATCH_STRIP(x[i])*2 /*important. leave space for other offsets*/] = i;
};
fn_init_offsets(a, asize, offsets);
fn_init_offsets(b, bsize, offsets+1);
uint8_t topcount = 0;
int topoffset = 0;
{
std::vector<uint8_t> count_vec(asize + bsize + 1, 0);
uint8_t* counts = &(count_vec[0]);
for (int i = 0; i < MATCH_MASK; i+=2)
{
if (offsets[i] && offsets[i+1])
{
int offset = (offsets[i] - offsets[i+1]); //NOTE: I removed
if ((-n_maxoffset <= offset && offset <= n_maxoffset))
{
offset += bsize;
uint8_t n_cur_count = ++counts[offset];
if (n_cur_count > topcount)
{
topcount = n_cur_count;
topoffset = offset;
}
}
}
}
}
return offsets[0];
}
int main(int argc, char* argv[])
{
const int sizes = 1000;
int32_t* a = new int32_t[sizes];
int32_t* b = new int32_t[sizes];
for (int i=0;i<sizes;i++)
{
a[i] = rand()*rand();
b[i] = rand()*rand();
}
//Variablen
LONGLONG g_Frequency, g_CurentCount, g_LastCount;
QueryPerformanceFrequency((LARGE_INTEGER*)&g_Frequency);
QueryPerformanceCounter((LARGE_INTEGER*)&g_CurentCount);
int sum = 0;
for (int i=0;i<100000;i++)
{
sum += test(a,sizes,b,sizes);
}
QueryPerformanceCounter((LARGE_INTEGER*)&g_LastCount);
double dTimeDiff = (((double)(g_LastCount-g_CurentCount))/((double)g_Frequency));
std::cout << "Result: " << sum << std::endl <<"time: " << dTimeDiff << std::endl;
delete[] a;
delete[] b;
return 0;
}
compared to your version of test()
.
#include <vector>
#include <windows.h>
#include <iostream>
typedef unsigned short uint16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;
#define MATCH_MASK 16383
#define DOUBLE_MATCH_MASK (MATCH_MASK*2)
static const int MATCH_BITS = 14;
static const int MATCH_LEFT = (32 - MATCH_BITS);
#define MATCH_STRIP(x) ((uint32_t)(x) >> MATCH_LEFT)
static const int n_maxoffset = 1000;
uint16_t test(int32_t* a, int asize, int32_t* b, int bsize)
{
uint16_t aoffsets[DOUBLE_MATCH_MASK] = {0}; // important! or it will only be right on the first time
uint16_t* boffsets = aoffsets + MATCH_MASK;
auto fn_init_offsets = [](const int32_t* x, int n_size, uint16_t offsets[])->void
{
for (int i = 0; i < n_size; ++i)
offsets[MATCH_STRIP(x[i])] = i;
};
fn_init_offsets(a, asize, aoffsets);
fn_init_offsets(b, bsize, boffsets);
uint8_t topcount = 0;
int topoffset = 0;
{
std::vector<uint8_t> count_vec(asize + bsize + 1, 0);
uint8_t* counts = &(count_vec[0]);
for (int i = 0; i < MATCH_MASK; ++i)
{
if (aoffsets[i] && boffsets[i])
{
int offset = (aoffsets[i] - boffsets[i]); //NOTE: I removed the -= because otherwise offset would always be positive!
if ((-n_maxoffset <= offset && offset <= n_maxoffset))
{
offset += bsize;
uint8_t n_cur_count = ++counts[offset];
if (n_cur_count > topcount)
{
topcount = n_cur_count;
topoffset = offset;
}
}
}
}
}
return aoffsets[0];
}
int main(int argc, char* argv[])
{
const int sizes = 1000;
int32_t* a = new int32_t[sizes];
int32_t* b = new int32_t[sizes];
for (int i=0;i<sizes;i++)
{
a[i] = rand()*rand();
b[i] = rand()*rand();
}
LONGLONG g_Frequency, g_CurentCount, g_LastCount;
QueryPerformanceFrequency((LARGE_INTEGER*)&g_Frequency);
QueryPerformanceCounter((LARGE_INTEGER*)&g_CurentCount);
int sum = 0;
for (int i=0;i<100000;i++)
{
sum += test(a,sizes,b,sizes);
}
QueryPerformanceCounter((LARGE_INTEGER*)&g_LastCount);
double dTimeDiff = (((double)(g_LastCount-g_CurentCount))/((double)g_Frequency));
std::cout << "Result: " << sum << std::endl <<"time: " << dTimeDiff << std::endl;
delete[] a;
delete[] b;
return 0;
}