I have a pretty weird problem regarding SSE usage.
I wrote the following function, in which I use SSE to calculate the maximum of the element-wise differences of two float arrays, each containing 64 floats (a plain scalar version of the same computation is included after the listing for reference). The dists array is a 2D array allocated via _aligned_malloc.
#include <iostream>
#include <xmmintrin.h>
#include <malloc.h>    // _aligned_malloc
#include <cstdlib>     // rand
#include <time.h>
#include <stdio.h>
#include <algorithm>
#include <fstream>
#include "hr_time.h"
using namespace std;
float** dists;
float** dists2;
__m128* a;
__m128* b;
__m128* c;
__m128* d;
__m128 diff;
__m128 diff2;
__m128 mymax;
float* myfmax;
float test(int s, int t)
{
    a = (__m128*) dists[s];
    b = (__m128*) dists[t];
    c = (__m128*) dists2[s];
    d = (__m128*) dists2[t];
    mymax = _mm_set_ps(0.0, 0.0, 0.0, 0.0);
    // 64 floats per row = 16 __m128 vectors per row
    for (int i = 0; i < 16; i++)
    {
        diff = _mm_sub_ps(*a, *b);
        mymax = _mm_max_ps(diff, mymax);
        diff2 = _mm_sub_ps(*d, *c);
        mymax = _mm_max_ps(diff2, mymax);
        a++;
        b++;
        c++;
        d++;
    }
    // horizontal max over the four lanes
    _mm_store_ps(myfmax, mymax);
    float res = max(max(max(myfmax[0], myfmax[1]), myfmax[2]), myfmax[3]);
    return res;
}
int Deserialize(std::istream* stream)
{
    int numOfElements, arraySize;
    stream->read((char*)&numOfElements, sizeof(int));   // numOfElements = 64
    stream->read((char*)&arraySize, sizeof(int));       // arraySize = 8000000

    dists  = (float**)_aligned_malloc(arraySize * sizeof(float*), 16);
    dists2 = (float**)_aligned_malloc(arraySize * sizeof(float*), 16);
    for (int j = 0; j < arraySize; j++)
    {
        dists[j]  = (float*)_aligned_malloc(numOfElements * sizeof(float), 16);
        dists2[j] = (float*)_aligned_malloc(numOfElements * sizeof(float), 16);
    }

    for (int i = 0; i < arraySize; i++)
    {
        stream->read((char*)dists[i], numOfElements * sizeof(float));
    }
    for (int i = 0; i < arraySize; i++)
    {
        stream->read((char*)dists2[i], numOfElements * sizeof(float));
    }
    return 0;
}
int main(int argc, char** argv)
{
    int entries = 8000000;
    myfmax = (float*)_aligned_malloc(4 * sizeof(float), 16);

    ifstream fs("binary_file", std::ios::binary);
    Deserialize(&fs);

    CStopWatch* watch = new CStopWatch();
    watch->StartTimer();
    int i;
    for (i = 0; i < entries; i++)
    {
        // pick two random rows and compute the max difference
        int s = rand() % entries;
        int t = rand() % entries;
        test(s, t);
    }
    watch->StopTimer();

    cout << i << " iterations took " << watch->GetElapsedTimeMs() << "ms" << endl;
    cin.get();
}
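For clarity, here is a plain scalar version of what test() is supposed to compute, using the same globals as above (reference only, it is not part of the benchmark):

// Scalar reference for test(): the largest of dists[s][i] - dists[t][i]
// and dists2[t][i] - dists2[s][i] over all 64 elements, never below 0
// because mymax starts at zero.
float test_scalar(int s, int t)
{
    float res = 0.0f;
    for (int i = 0; i < 64; i++)
    {
        res = max(res, dists[s][i] - dists[t][i]);
        res = max(res, dists2[t][i] - dists2[s][i]);
    }
    return res;
}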
My problem is that this code runs very fast when I launch it from Visual Studio with a debugger attached, but as soon as I run it without the debugger it becomes very slow. After a little research I found that one difference between the two ways of starting the program is the "debug heap", so I disabled it by defining _NO_DEBUG_HEAP=1. With that option I get the same poor performance with a debugger attached as well.
But I don't understand how the debug heap can give me better performance, and I don't know how to solve the problem, so I hope one of you can help me.
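In case it matters for the heap question: Deserialize currently makes two small _aligned_malloc calls per row, i.e. 16 million allocations in total. I have been wondering whether one contiguous allocation per array would behave differently; this is just a rough sketch of what I mean (untested):

// Sketch: one contiguous 16-byte aligned block per array;
// dists[j] / dists2[j] then simply point into those blocks.
float* block1 = (float*)_aligned_malloc((size_t)arraySize * numOfElements * sizeof(float), 16);
float* block2 = (float*)_aligned_malloc((size_t)arraySize * numOfElements * sizeof(float), 16);
for (int j = 0; j < arraySize; j++)
{
    dists[j]  = block1 + (size_t)j * numOfElements;
    dists2[j] = block2 + (size_t)j * numOfElements;
}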
Thanks in advance.
Regards, Karsten