UPDATE 2:
I've switched to initializing all arrays with fixed numbers. Why is performance faster when using `modeDampingTermsExp`:
12625671
12285846
12819392
11179072
12272587
11722863
12648955
vs. using `modeDampingTermsExp2`?
1593620
1668170
1614495
1785965
1814576
1851797
1808568
1801945
It's about 10x faster.
Full code
#include <iostream>
#include <chrono>
using namespace std;
// Count of completed main-loop passes; main() increments it once per pass.
int bufferWriteIndex{0};

// Running sum that main() rebuilds from scratch on every pass.
float curSample{0.0f};

// Step size. In this version it appears only in a commented-out line of main().
float tIncr{0.1f};

// 5x5 table stored flat. main() reads entry [i * 5 + m], so each line below
// holds the five entries for one value of i.
// Note: `-0` is the integer 0 negated, so those entries are +0.0f.
float modeGainsTimesModeShapes[25] = {
    -0.144338, -1.49012e-08, -4.3016e-09, 7.45058e-09, -0,
    -0.25, -1.49012e-08, 4.77374e-16, -7.45058e-09, 0,
    -0.288675, 0, 4.3016e-09, 3.55271e-15, -0,
    -0.25, 1.49012e-08, -1.4512e-15, 7.45058e-09, 0,
    -0.144338, 1.49012e-08, -4.30159e-09, -7.45058e-09, -0
};

// One damping term per entry.
float modeDampingTermsString[5] = {
    -8.03847, -30, -60, -90, -111.962
};

// Per-entry state that main() multiplies in place on every pass; starts at 1.
float damping[5] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };

// One frequency value per entry.
float modeFrequenciesArr[5] = {
    71419.1, 266564, 533137, 799710, 994855
};

// Fixed multipliers, all with magnitude below 1. To the printed precision they
// equal exp(modeDampingTermsString[i] * 0.1).
float modeDampingTermsExp[5] = {
    0.447604, 0.0497871, 0.00247875, 0.00012341, 1.37263e-05
};

// Fixed multipliers equal to modeDampingTermsString[i] * 0.1; every entry
// except the first has magnitude above 1.
float modeDampingTermsExp2[5] = {
    -0.803847, -3, -6, -9, -11.1962
};
int main(int argc, char** argv) {
float subt = 0;
int subWriteIndex = 0;
auto now = std::chrono::high_resolution_clock::now();
while (true) {
curSample = 0;
for (int i = 0; i < 5; i++) {
//Slow version
//damping[i] = damping[i] * modeDampingTermsExp2[i];
//Fast version
damping[i] = damping[i] * modeDampingTermsExp2[i];
float cosT = 2 * damping[i];
for (int m = 0; m < 5; m++) {
curSample += modeGainsTimesModeShapes[i * 5 + m] * cosT;
}
}
//t += tIncr;
bufferWriteIndex++;
//measure calculations per second
auto elapsed = std::chrono::high_resolution_clock::now() - now;
if ((elapsed / std::chrono::milliseconds(1)) > 1000) {
now = std::chrono::high_resolution_clock::now();
int idx = bufferWriteIndex;
cout << idx - subWriteIndex << endl;
subWriteIndex = idx;
}
}
}
UPDATE 1:
I changed `tIncr` to `0.1f` to avoid a possible subnormal number, as mentioned by @JaMit. I've also removed `t += tIncr;` from the calculation.
Full code:
#include <iostream>
#include <chrono>
using namespace std;
// Count of completed main-loop passes; main() increments it once per pass.
int bufferWriteIndex{0};

// Running sum that main() rebuilds from scratch on every pass.
float curSample{0.0f};

// Argument scale fed to cos() in main(). It stays at 0 in this version because
// the line that would advance it is commented out.
float t{0.0f};

// Step size; main() multiplies the damping terms by it during initialization.
float tIncr{0.1f};

// 5x5 table stored flat. main() reads entry [i * 5 + m], so each line below
// holds the five entries for one value of i.
// Note: `-0` is the integer 0 negated, so those entries are +0.0f.
float modeGainsTimesModeShapes[25] = {
    -0.144338, -1.49012e-08, -4.3016e-09, 7.45058e-09, -0,
    -0.25, -1.49012e-08, 4.77374e-16, -7.45058e-09, 0,
    -0.288675, 0, 4.3016e-09, 3.55271e-15, -0,
    -0.25, 1.49012e-08, -1.4512e-15, 7.45058e-09, 0,
    -0.144338, 1.49012e-08, -4.30159e-09, -7.45058e-09, -0
};

// One damping term per entry; input to the initialization loop in main().
float modeDampingTermsString[5] = {
    -8.03847, -30, -60, -90, -111.962
};

// Per-entry state that main() multiplies in place on every pass; starts at 1.
float damping[5] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };

// One frequency value per entry; multiplied by t inside cos() in main().
float modeFrequenciesArr[5] = {
    71419.1, 266564, 533137, 799710, 994855
};

// Per-pass multipliers. Zero until main() fills them in before the benchmark loop.
float modeDampingTermsExp[5]{};
/**
 * Benchmark loop for the "UPDATE 1" experiment.
 *
 * First fills modeDampingTermsExp[] from modeDampingTermsString[] and tIncr
 * (two variants, one commented out). Then each pass multiplies the five
 * damping[] entries by their per-entry factor, combines them with
 * cos(t * modeFrequenciesArr[i]), folds the result into curSample through the
 * 5x5 modeGainsTimesModeShapes table, and increments bufferWriteIndex. Once
 * more than 1000 whole milliseconds have elapsed since the last report, the
 * number of passes completed in that window is printed. The loop never exits.
 *
 * @param argc unused
 * @param argv unused
 */
int main(int argc, char** argv) {
    // "With exp" variant (around 7 million passes/s reported):
    /*
    for (int m = 0; m < 5; m++) {
        modeDampingTermsExp[m] = exp(modeDampingTermsString[m] * tIncr);
    }*/
    // "Without exp" variant (around 1.5 million passes/s reported):
    for (int m = 0; m < 5; m++) {
        modeDampingTermsExp[m] = modeDampingTermsString[m] * tIncr;
    }

    // Value of bufferWriteIndex when the previous rate line was printed.
    int subWriteIndex = 0;
    auto now = std::chrono::high_resolution_clock::now();
    while (true) {
        curSample = 0;
        for (int i = 0; i < 5; i++) {
            damping[i] = damping[i] * modeDampingTermsExp[i];
            float cosT = 2 * damping[i] * cos(t * modeFrequenciesArr[i]);
            for (int m = 0; m < 5; m++) {
                curSample += modeGainsTimesModeShapes[i * 5 + m] * cosT;
            }
        }
        // t is deliberately left unchanged in this version.
        //t += tIncr;
        bufferWriteIndex++;

        //measure calculations per second
        auto elapsed = std::chrono::high_resolution_clock::now() - now;
        // Whole milliseconds elapsed; report once more than 1000 have passed.
        if (std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count() > 1000) {
            now = std::chrono::high_resolution_clock::now();
            int idx = bufferWriteIndex;
            std::cout << idx - subWriteIndex << std::endl;
            subWriteIndex = idx;
        }
    }
}
Now it runs faster WITH the `exp` in the initialization?
Output with `exp`:
7632423
7516857
6855266
6251330
7040232
6784555
7169865
7638150
7403717
7626824
7408493
7722998
around 7 million/s, and without `exp`:
1229743
1193849
1069924
1426083
1472080
1484318
1503082
1462985
1433372
1357586
1483370
1491731
1526445
1516673
1517916
1522941
1523948
1506818
around 1.5 million. This is confusing to me.
ORIGINAL POST:
It seems like the way I initialize `modeDampingTermsExp` in my small example has a huge impact on the performance of my calculations, where I access it. Here is my minimal, reproducible example:
#include <iostream>
#include <chrono>
using namespace std;
// Count of completed main-loop passes; main() increments it once per pass.
int bufferWriteIndex{0};

// Running sum that main() rebuilds from scratch on every pass.
float curSample{0.0f};

// Argument scale fed to cos() in main(); main() adds tIncr to it after every pass.
float t{0.0f};

// Step added to t per pass; also multiplied into the damping terms during
// initialization in main().
float tIncr{1.0f / 48000};

// 5x5 table stored flat. main() reads entry [i * 5 + m], so each line below
// holds the five entries for one value of i.
// Note: `-0` is the integer 0 negated, so those entries are +0.0f.
float modeGainsTimesModeShapes[25] = {
    -0.144338, -1.49012e-08, -4.3016e-09, 7.45058e-09, -0,
    -0.25, -1.49012e-08, 4.77374e-16, -7.45058e-09, 0,
    -0.288675, 0, 4.3016e-09, 3.55271e-15, -0,
    -0.25, 1.49012e-08, -1.4512e-15, 7.45058e-09, 0,
    -0.144338, 1.49012e-08, -4.30159e-09, -7.45058e-09, -0
};

// One damping term per entry; input to the initialization loop in main().
float modeDampingTermsString[5] = {
    -8.03847, -30, -60, -90, -111.962
};

// Per-entry state that main() multiplies in place on every pass; starts at 1.
float damping[5] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };

// One frequency value per entry; multiplied by t inside cos() in main().
float modeFrequenciesArr[5] = {
    71419.1, 266564, 533137, 799710, 994855
};

// Per-pass multipliers. Zero until main() fills them in before the benchmark loop.
float modeDampingTermsExp[5]{};
/**
 * Benchmark loop from the original post.
 *
 * First fills modeDampingTermsExp[] from modeDampingTermsString[] and tIncr
 * (two variants, one commented out). Then each pass multiplies the five
 * damping[] entries by their per-entry factor, combines them with
 * cos(t * modeFrequenciesArr[i]), folds the result into curSample through the
 * 5x5 modeGainsTimesModeShapes table, advances t by tIncr, and increments
 * bufferWriteIndex. Once more than 1000 whole milliseconds have elapsed since
 * the last report, the number of passes completed in that window is printed.
 * The loop never exits.
 *
 * @param argc unused
 * @param argv unused
 */
int main(int argc, char** argv) {
    // "With exp" variant (around 390k passes/s reported):
    /*
    for (int m = 0; m < 5; m++) {
        modeDampingTermsExp[m] = exp(modeDampingTermsString[m] * tIncr);
    }*/
    // "Without exp" variant (around 3 - 3.5 million passes/s reported):
    for (int m = 0; m < 5; m++) {
        modeDampingTermsExp[m] = modeDampingTermsString[m] * tIncr;
    }

    // Value of bufferWriteIndex when the previous rate line was printed.
    int subWriteIndex = 0;
    auto now = std::chrono::high_resolution_clock::now();
    while (true) {
        curSample = 0;
        for (int i = 0; i < 5; i++) {
            damping[i] = damping[i] * modeDampingTermsExp[i];
            float cosT = 2 * damping[i] * cos(t * modeFrequenciesArr[i]);
            for (int m = 0; m < 5; m++) {
                curSample += modeGainsTimesModeShapes[i * 5 + m] * cosT;
            }
        }
        t += tIncr;
        bufferWriteIndex++;

        //measure calculations per second
        auto elapsed = std::chrono::high_resolution_clock::now() - now;
        // Whole milliseconds elapsed; report once more than 1000 have passed.
        if (std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count() > 1000) {
            now = std::chrono::high_resolution_clock::now();
            int idx = bufferWriteIndex;
            std::cout << idx - subWriteIndex << std::endl;
            subWriteIndex = idx;
        }
    }
}
When I initialize it like this
// Variant with exp: store exp(modeDampingTermsString[m] * tIncr) for each of the 5 entries.
for (int m = 0; m < 5; m++) {
modeDampingTermsExp[m] = exp(modeDampingTermsString[m] * tIncr);
}
using the `exp` function, performance is about 10 times slower than when I initialize it like this:
// Variant without exp: store the plain product modeDampingTermsString[m] * tIncr.
for (int m = 0; m < 5; m++) {
modeDampingTermsExp[m] = modeDampingTermsString[m] * tIncr;
}
I measure the calculations per second using chrono right below the two nested for loops in the endless `while(true)` loop (snippet of the full example above):
//measure calculations per second
// Time elapsed since the current measurement window started.
auto elapsed = std::chrono::high_resolution_clock::now() - now;
// Dividing by 1 ms gives the elapsed milliseconds; report once that exceeds 1000.
if ((elapsed / std::chrono::milliseconds(1)) > 1000) {
// Start the next measurement window.
now = std::chrono::high_resolution_clock::now();
int idx = bufferWriteIndex;
// Print how many passes completed since the previous report.
cout << idx - subWriteIndex << endl;
subWriteIndex = idx;
}
Using the `exp` function, my program gives, for example, the following output:
538108
356659
356227
383885
389902
390405
391748
391375
388910
383791
391691
it stays at around 390k.
Using the other initialization without it, I get the following output:
3145299
3049618
2519474
2755627
2846730
2824666
2893591
3119401
3492762
3366317
3470675
3505168
3492805
3523005
3432182
3561458
3580840
3576725
around 3 - 3.5 million "samples" per second.
Why does the way I initialize the `modeDampingTermsExp` array impact performance later in the code where I access it? What am I missing here?
I am using Visual Studio 2019 with the following flags: /O2 /Oi /Ot /fp:fast
Thank you very much!