I have an array of signed short that I want to divide by 2048 and get an array of float as a result.
I found SSE: convert short integer to float, which shows how to convert unsigned shorts to floats, but I also want to handle signed shorts.
The code below works but only for positive shorts.
// We want to divide some signed short by 2048 and get a float.
const auto floatScale = _mm256_set1_ps(2048);
short* shortsInput = /* values from somewhere */;
float* floatsOutput = /* initialized */;
__m128i* m128iInput = (__m128i*)&shortsInput[0];
// Sign-extend each 16-bit lane to 32 bits: unpack the shorts into the HIGH
// half of each 32-bit lane, then arithmetic-shift right by 16. The shift
// replicates the sign bit, so negative shorts convert correctly
// (unpacking with zero in the high half only works for non-negative values).
__m128i m128iLow = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), m128iInput[0]), 16);
__m128i m128iHigh = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), m128iInput[0]), 16);
__m128 m128Low = _mm_cvtepi32_ps(m128iLow);
__m128 m128High = _mm_cvtepi32_ps(m128iHigh);
// Puts the 2 __m128 vectors into 1 __m256.
__m256 singleComplete = _mm256_castps128_ps256(m128Low);
singleComplete = _mm256_insertf128_ps(singleComplete, m128High, 1);
// Finally do the math
__m256 scaledVect = _mm256_div_ps(singleComplete, floatScale);
// and puts the result where needed. _mm256_storeu_ps takes a float*
// destination, not a float, so pass the address of the first element.
_mm256_storeu_ps(&floatsOutput[0], scaledVect);
How can I convert my signed shorts to floats? Or maybe there's a better way to tackle this problem?
EDIT: I tried the different answers compared to a non SIMD algorithm, doing it 10M times over a 2048 array, on an AMD Ryzen 7 2700 at ~3.2GHz. I'm using Visual 15.7.3 with mostly the default config:
/permissive- /Yu"stdafx.h" /GS /GL /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /sdl
/Fd"x64\Release\vc141.pdb" /Zc:inline /fp:precise /D "NDEBUG" /D "_CONSOLE"
/D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope
/arch:AVX2 /Gd /Oi /MD /openmp /FC /Fa"x64\Release\" /EHsc /nologo
/Fo"x64\Release\" /Fp"x64\Release\test.pch" /diagnostics:classic
Note that I'm very new to SIMD and haven't used C++ in ages. Here's what I get (I reran each test separately, not one after the other, and got better results that way):
- No SIMD: 7300ms
- wim's answer: 2300ms
- chtz's SSE2 answer: 1650ms
- chtz's AVX2 answer: 2100ms
So I get a nice speedup by using SIMD, and chtz's SSE2 answer, though being more verbose and complex to understand, is faster. (At least when compiled with AVX enabled, so it avoids extra instructions to copy registers by using 3-operand VEX-coded instructions. On Intel CPUs, the AVX2 versions should be significantly faster than the 128-bit version.)
Here's my test code:
// Number of shorts in each input buffer (multiple of 8, so the SIMD loops need no tail handling).
const int size = 2048;
// Benchmark repetitions: each method converts the whole array this many times.
const int loopSize = (int)1e7;
float* noSimd(short* shortsInput) {
    // Scalar baseline: convert every short to float and scale by 1/2048,
    // timing loopSize passes over the whole buffer.
    // Caller owns (and must delete[]) the returned buffer.
    float* result = new float[size];
    const auto t0 = std::chrono::high_resolution_clock::now();
    for (int rep = 0; rep < loopSize; ++rep) {
        for (int idx = 0; idx < size; ++idx) {
            result[idx] = shortsInput[idx] / 2048.0f;
        }
    }
    const auto t1 = std::chrono::high_resolution_clock::now();
    const long long totalTime = (t1 - t0).count();
    printf("%lld noSimd\n", totalTime);
    return result;
}
float* wimMethod(short* shortsInput) {
    // AVX2 version: sign-extend 8 shorts at a time to 32-bit ints
    // (vpmovsxwd), convert to float, and multiply by the reciprocal of
    // 2048 (a multiply is cheaper than a divide).
    // Caller owns (and must delete[]) the returned buffer.
    const __m256 invScale = _mm256_set1_ps(1.0f / 2048.0f);
    float* result = new float[size];
    const auto t0 = std::chrono::high_resolution_clock::now();
    for (int rep = 0; rep < loopSize; ++rep) {
        for (int ofs = 0; ofs < size; ofs += 8) {
            // 8 signed shorts -> 8 signed 32-bit ints -> 8 floats.
            const __m128i shorts8 = _mm_loadu_si128((__m128i*)&shortsInput[ofs]);
            const __m256i ints8 = _mm256_cvtepi16_epi32(shorts8);
            const __m256 floats8 = _mm256_cvtepi32_ps(ints8);
            // Scale and write the 8 results.
            _mm256_storeu_ps(&result[ofs], _mm256_mul_ps(floats8, invScale));
        }
    }
    const auto t1 = std::chrono::high_resolution_clock::now();
    const long long totalTime = (t1 - t0).count();
    printf("%lld wimMethod\n", totalTime);
    return result;
}
float* chtzMethodSSE2(short* shortsInput) {
    // SSE2 bit-trick conversion: bias the shorts into the unsigned domain,
    // place them in the mantissa of a float with a fixed exponent, then
    // subtract a magic constant to undo both the exponent and the bias.
    // Caller owns (and must delete[]) the returned buffer.
    //
    // Fix: const0x8000, const0x4580 and constFloat were used below but never
    // defined anywhere in this file; define them here (hoisted out of the
    // loops, although the compiler would hoist the constants anyway).
    //
    // Adding 0x8000 maps signed [-32768, 32767] onto unsigned [0, 65535].
    const __m128i const0x8000 = _mm_set1_epi16((short)0x8000);
    // 0x4580 in the upper 16 bits of a float gives the bit pattern
    // 0x4580xxxx == 4096.0f + xxxx/2048.0f (4096 == float(1 << 23) / 2048,
    // so one mantissa LSB is worth exactly 1/2048).
    const __m128i const0x4580 = _mm_set1_epi16(0x4580);
    // Subtracting float((1<<23) + 0x8000)/2048 == 4112.0f removes the
    // 4096.0f exponent base and the 0x8000/2048 bias in a single op,
    // leaving signedShort/2048.0f.
    const __m128 constFloat = _mm_set1_ps(float((1 << 23) + 0x8000) / 2048.0f);
    float* floatsOutput = new float[size];
    auto startTime = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < loopSize; i++) {
        for (int j = 0; j < size; j += 8) {
            // get input:
            __m128i val = _mm_loadu_si128((__m128i*)&shortsInput[j]);
            // add 0x8000 to wrap to unsigned short domain:
            val = _mm_add_epi16(val, const0x8000);
            // interleave with upper part of float(1<<23)/2048.f:
            __m128i lo = _mm_unpacklo_epi16(val, const0x4580);
            __m128i hi = _mm_unpackhi_epi16(val, const0x4580);
            // interpret as float and subtract float((1<<23) + (0x8000))/2048.f
            __m128 lo_f = _mm_sub_ps(_mm_castsi128_ps(lo), constFloat);
            __m128 hi_f = _mm_sub_ps(_mm_castsi128_ps(hi), constFloat);
            // store:
            _mm_storeu_ps(&floatsOutput[j], lo_f);
            _mm_storeu_ps(&floatsOutput[j] + 4, hi_f);
        }
    }
    auto stopTime = std::chrono::high_resolution_clock::now();
    long long totalTime = (stopTime - startTime).count();
    printf("%lld chtzMethod\n", totalTime);
    return floatsOutput;
}
float* chtzMethodAVX2(short* shortsInput) {
    // AVX2 bit-trick conversion: zero-extend the shorts into the low half
    // of each 32-bit lane, XOR in the pattern 0x4580'8000 (exponent for
    // the 1/2048 scale plus the sign-flip bit), reinterpret as float, and
    // subtract the same magic constant to recover signedShort/2048.0f.
    // Caller owns (and must delete[]) the returned buffer.
    //
    // Fixes: removed the unused local floatScale (this version never
    // multiplies), and hoisted the loop-invariant magic constants out of
    // the inner loop where they were re-declared every iteration.
    //
    // magic == float((1<<23) + (1<<15)) / 2048.f, bit pattern 0x4580'8000.
    const __m256 magic = _mm256_set1_ps(float((1 << 23) + (1 << 15)) / 2048.f);
    const __m256i magic_i = _mm256_castps_si256(magic);
    float* floatsOutput = new float[size];
    auto startTime = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < loopSize; i++) {
        for (int j = 0; j < size; j += 8) {
            // get input:
            __m128i val = _mm_loadu_si128((__m128i*)&shortsInput[j]);
            // interleave with 0x0000 (zero-extend 16 -> 32 bits):
            __m256i val_unpacked = _mm256_cvtepu16_epi32(val);
            // convert by xor-ing and subtracting the magic value;
            // VPXOR avoids port5 bottlenecks on Intel CPUs before SKL
            __m256 val_f = _mm256_castsi256_ps(_mm256_xor_si256(val_unpacked, magic_i));
            __m256 converted = _mm256_sub_ps(val_f, magic);
            // store:
            _mm256_storeu_ps(&floatsOutput[j], converted);
        }
    }
    auto stopTime = std::chrono::high_resolution_clock::now();
    long long totalTime = (stopTime - startTime).count();
    printf("%lld chtzMethod2\n", totalTime);
    return floatsOutput;
}