I'm trying to convert this code:
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
phase += std::clamp(radiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
}
mPhase = phase;
in SSE2, trying to speed up the whole block (which is called often). I'm using MSVC with the Fast optimizazion flag, but auto-vectorization is very crap. Since I'm also learning vectorization, I find it a nice challenge.
So I've take the formula above, and simplified, such as:
radiansPerSampleBp0 = radiansPerSample * bp0;
phase += std::clamp(radiansPerSampleBp0 * pB[sampleIndex] + radiansPerSample * pC[sampleIndex]), 0.0, PI);
Which can be muted into a serial dependency such as:
phase[0] += (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[1] += (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1]) + (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[2] += (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2]) + (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1])
phase[3] += (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3]) + (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2])
phase[4] += (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4]) + (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3])
phase[5] += (radiansPerSampleBp0 * pB[5] + radiansPerSample * pC[5]) + (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4])
Hence, the code I did:
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
}
mPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
But, unfortunately, after sum "steps", the results become very different for each phase value. Tried to debug, but I'm not really able to find where the problem is.
Also, it's not really so "fast" rather than the old version.
Are you able to recognize the trouble? And how you will speed-up the code?
Here's the whole code, if you want to check the two different outputs:
#include <iostream>
#include <algorithm>
#include <immintrin.h>
#include <emmintrin.h>
#define PI 3.14159265358979323846
constexpr int voiceSize = 1;
constexpr int bufferSize = 256;
class Param
{
public:
alignas(16) double mPhase = 0.0;
alignas(16) double mPhaseOptimized = 0.0;
alignas(16) double mNoteFrequency = 10.0;
alignas(16) double mHostPitch = 1.0;
alignas(16) double mRadiansPerSample = 1.0;
alignas(16) double b[voiceSize][bufferSize];
alignas(16) double c[voiceSize][bufferSize];
Param() { }
inline void Process(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
std::cout << sampleIndex << ": " << phase << std::endl;
}
mPhase = phase;
}
inline void ProcessOptimized(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhaseOptimized;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
std::cout << sampleIndex << ": " << v_phase.m128d_f64[0] << std::endl;
std::cout << sampleIndex + 1 << ": " << v_phase.m128d_f64[1] << std::endl;
}
mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
}
};
class MyPlugin
{
public:
Param mParam1;
MyPlugin() {
// fill b
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = (sampleIndex / ((double)bufferSize - 1));
mParam1.b[voiceIndex][sampleIndex] = value;
}
}
// fill c
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = 0.0;
mParam1.c[voiceIndex][sampleIndex] = value;
}
}
}
~MyPlugin() { }
void Process(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.Process(voiceIndex, blockSize);
}
}
void ProcessOptimized(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.ProcessOptimized(voiceIndex, blockSize);
}
}
};
int main() {
MyPlugin myPlugin;
long long numProcessing = 1;
long long counterProcessing = 0;
// I'll only process once block, just for analysis
while (counterProcessing++ < numProcessing) {
// variable blockSize (i.e. it can vary, being even or odd)
int blockSize = 256;
// process data
myPlugin.Process(blockSize);
std::cout << "#########" << std::endl;
myPlugin.ProcessOptimized(blockSize);
}
}