In the code below, I changed the value of "dataLen" and got different relative performance between the SSE and AVX versions:
dataLen = 400 SSE time:758000 us AVX time:483000 us SSE > AVX
dataLen = 2400 SSE time:4212000 us AVX time:2636000 us SSE > AVX
dataLen = 2864 SSE time:6115000 us AVX time:6146000 us SSE ~= AVX
dataLen = 3200 SSE time:8049000 us AVX time:9297000 us SSE < AVX
dataLen = 4000 SSE time:10170000us AVX time:11690000us SSE < AVX
Both the SSE and the AVX code compute the equivalent of this scalar statement for every element i: buf3[i] += buf1[i]*buf2[i];
#include "testfun.h"
#include <iostream>
#include <chrono>
#include <malloc.h>
#include "immintrin.h"
using namespace std::chrono;
/// Micro-benchmark comparing a 128-bit SSE and a 256-bit AVX implementation of
/// the element-wise update buf3[i] += buf1[i] * buf2[i].
///
/// Three 32-byte-aligned float buffers of `dataLen` elements are allocated,
/// each kernel is run N times over the whole buffer, and the elapsed time of
/// each kernel is printed to std::cout in microseconds.
void testfun()
{
    constexpr int dataLen = 4000;
    constexpr int N = 10000000;

    // The SSE loop consumes 4 floats and the AVX loop 8 floats per step, and
    // neither has a scalar tail, so a length that is not a multiple of 8 would
    // read and write past the end of the buffers.
    static_assert(dataLen % 8 == 0, "dataLen must be a multiple of 8");

    // 32-byte alignment is required by the aligned 256-bit loads/stores below.
    float *buf1 = static_cast<float*>(_aligned_malloc(sizeof(float) * dataLen, 32));
    float *buf2 = static_cast<float*>(_aligned_malloc(sizeof(float) * dataLen, 32));
    float *buf3 = static_cast<float*>(_aligned_malloc(sizeof(float) * dataLen, 32));
    if (!buf1 || !buf2 || !buf3)
    {
        std::cerr << "testfun: aligned allocation failed" << std::endl;
        _aligned_free(buf1);
        _aligned_free(buf2);
        _aligned_free(buf3);
        return;
    }

    for (int i = 0; i < dataLen; i++)
    {
        buf1[i] = 1;
        buf2[i] = 1;
        buf3[i] = 0;
    }

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments to the wall clock while the benchmark is running.
    using Clock = std::chrono::steady_clock;

    //=========================SSE CODE=====================================
    const Clock::time_point SSEStart = Clock::now();
    for (int j = 0; j < N; j++)
    {
        for (int i = 0; i < dataLen; i += 4)
        {
            const __m128 p1 = _mm_load_ps(&buf1[i]);
            const __m128 p2 = _mm_load_ps(&buf2[i]);
            __m128 p3 = _mm_load_ps(&buf3[i]);
            p3 = _mm_add_ps(_mm_mul_ps(p1, p2), p3);
            _mm_store_ps(&buf3[i], p3);
        }
    }
    // Cast directly to microseconds: the value is reported in "us", so going
    // through milliseconds would throw away three digits of resolution.
    const std::chrono::microseconds SSEtimeUsed =
        std::chrono::duration_cast<std::chrono::microseconds>(Clock::now() - SSEStart);
    std::cout << "SSE time used: " << SSEtimeUsed.count() << " us, " << std::endl;

    //=========================AVX CODE=====================================
    // Reset the accumulator so both kernels start from the same state.
    for (int i = 0; i < dataLen; i++) buf3[i] = 0;

    const Clock::time_point AVXstart = Clock::now();
    for (int j = 0; j < N; j++)
    {
        for (int i = 0; i < dataLen; i += 8)
        {
            const __m256 pp1 = _mm256_load_ps(&buf1[i]);
            const __m256 pp2 = _mm256_load_ps(&buf2[i]);
            __m256 pp3 = _mm256_load_ps(&buf3[i]);
            pp3 = _mm256_add_ps(_mm256_mul_ps(pp1, pp2), pp3);
            _mm256_store_ps(&buf3[i], pp3);
        }
    }
    const std::chrono::microseconds AVXtimeUsed =
        std::chrono::duration_cast<std::chrono::microseconds>(Clock::now() - AVXstart);
    std::cout << "AVX time used: " << AVXtimeUsed.count() << " us, " << std::endl;

    // Release all three buffers (the original never freed buf3).
    _aligned_free(buf1);
    _aligned_free(buf2);
    _aligned_free(buf3);
}
My CPU is an Intel Xeon E3-1225 v2, which has 32KB of L1 cache per core (4 cores). When running, this code uses only 1 core, so the L1 cache available to it is 32KB.
buf1, buf2 and buf3 are small enough to fit in the L1 and L2 caches (the L2 cache is 1MB). Both the SSE and the AVX versions are bandwidth limited, but why does the AVX version need more time than the SSE version as dataLen increases?