I'm learning writing some SSE code. Here is one test program that compare the SSE code and normal c++ code in both accuracy and performance. The program take two vectors of a given size, and output the sum value of all the elements. For example, when size=3
, v1={1, 2, 3}
, v2={2, 3, 4}
, the result will be (1+2) + (2+3) + (3+4) = 15
. The vector's values is generated by generator with fixed seed. And the size of the vectors is divisible by 4. Here is the code:
test.cpp:
#include <random>
#include <iostream>
#if defined USESSE
#include <pmmintrin.h>
#endif
#include "timer.h"
using namespace std;
float* init_vector(int size) {
default_random_engine generator(100); // seed
uniform_real_distribution<float> distribution(0.0, 1.0);
float* vec = (float*)malloc(size*sizeof(float));
for (int i=0; i<size; i++) {
vec[i] = distribution(generator);
}
return vec;
}
#if defined USESSE
float vec_sum(float* v1, float* v2, int size) {
__m128 vec_sumed = _mm_setzero_ps(); // {0, 0, 0, 0}
// vertical sum of two vectors
for (int i=0; i<size; i+=4) {
// sum vertically and then horizentally
vec_sumed = _mm_add_ps(vec_sumed, _mm_add_ps(_mm_load_ps(v1+i), _mm_load_ps(v2+i)));
}
float* v = (float*)malloc(4*sizeof(float));
_mm_store_ps(v, vec_sumed);
// make horizental sum over the final vec
float result = 0;
for (int i=0; i<4; i++) {
result += v[i];
}
return result;
}
#else
float vec_sum(float* v1, float* v2, int size) {
float result = 0;
for (int i=0; i<size; i++) {
result += v1[i]+v2[i];
}
return result;
}
#endif
void make_test(int size) {
float* vec1 = init_vector(size);
float* vec2 = init_vector(size);
Timer timer;
timer.tic();
cout << vec_sum(vec1, vec2, size)/size << endl;
timer.toc();
cout << "Run time: " << timer.get() << endl;
}
int main() {
#if defined USESSE
cout << "with SSE" << endl;
#else
cout << "without SSE" << endl;
#endif
make_test(40000000);
}
timer.h (Only for timing the functions).
#pragma once
#include <chrono>
#include <string>
class Timer
{
public:
Timer(){reset();};
void reset() {
begin = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(begin-begin);
};
void tic() {
begin = std::chrono::high_resolution_clock::now();
};
float toc() {
duration += std::chrono::duration_cast<std::chrono::milliseconds>
(std::chrono::high_resolution_clock::now()-begin);
return get();
};
float get() {
return (float)duration.count() / 1000;
}
private:
std::chrono::high_resolution_clock::time_point begin;
std::chrono::milliseconds duration;
};
I use the compile and run command g++ -std=c++14 test.cpp -DUSESSE -o test && ./test
for using SSE and g++ -std=c++14 test.cpp -o test && ./test
for not using SSE. The result is:
with SSE
0.999982
Run time: 0.071
------------------------
without SSE
0.838861
Run time: 0.124
If I change the size into a smaller number, let's say 400, the results is the same:
with SSE
1.01521
Run time: 0
-----------------------
without SSE
1.01521
Run time: 0
But it is hard to compare the performance using a small number. My question is, with a large vector size, why the SSE version function produces a different result from the normal c++ function?