Consider the difference in the work being done and the time taken between these two variants of code:
Task: A[i] *= (C[i] + D[i]) and B[i] *= (D[i] - C[i])
- Test Case 1: using vectors with 20,000,000 elements where the vector objects are on the stack
- Test Case 2: using vectors with 20,000,000 elements where the vectors are in dynamic memory
Case 1: -A single for loop performing both operations sequentially-
for (i to n):
    A[i] *= (C[i] + D[i])
    B[i] *= (D[i] - C[i])
Case 2: -Two separate for loops, each performing one operation independently-
for (i to n):
    A[i] *= (C[i] + D[i])
for (i to n):
    B[i] *= (D[i] - C[i])
Code: -Test Case 1: Vectors on Stack-
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <exception>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <limits>
#include <random>
#include <sstream>
#include <string_view>
#include <utility>
#include <vector>
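// Scope-based timer: captures the start time at construction; stop() and the
// destructor each print the time elapsed since construction (stop() only
// reports, it does not halt the clock).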
template<class Resolution = std::chrono::milliseconds>
class ExecutionTimer {
public:
using Clock = std::conditional_t<std::chrono::high_resolution_clock::is_steady,
std::chrono::high_resolution_clock,
std::chrono::steady_clock>;
private:
const Clock::time_point mStart = Clock::now();
public:
ExecutionTimer() = default;
~ExecutionTimer() {
const auto end = Clock::now();
std::ostringstream strStream;
strStream << "Destructor Elapsed: "
<< std::chrono::duration_cast<Resolution>(end - mStart).count()
<< std::endl;
std::cout << strStream.str() << std::endl;
}
inline void stop() {
const auto end = Clock::now();
std::ostringstream strStream;
strStream << "Stop Elapsed: "
<< std::chrono::duration_cast<Resolution>(end - mStart).count()
<< std::endl;
std::cout << strStream.str() << std::endl;
}
};
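// Fills vec with uniformly distributed random integers in [lower, upper] (inclusive).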
template<typename T>
void randomly_populate_vector(std::vector<T>& vec, std::mt19937& gen, T lower, T upper) {
std::uniform_int_distribution<T> dist{ lower, upper };
for (auto& e : vec) { e = dist(gen); }
}
// this function accounts for 0-based indexing and checks the upper bound...
// if the user enters 1 for lower it will index location 0 in the vector
// (the vector is passed by const reference to avoid copying it)
template<typename T>
void view_vector(const std::vector<T>& vec, std::string_view name, size_t lower = 0, size_t upper = std::numeric_limits<size_t>::max()) {
if (lower == 0) lower = 1;
if (upper > vec.size()) upper = vec.size();
std::cout << name << " { ";
for (size_t i = lower - 1; i < upper; i++) {
std::cout << std::setw(3) << vec[i] << ", ";
}
std::cout << "... }\n";
}
int main() {
try {
std::random_device rd;
std::mt19937 gen{ rd() };
const std::size_t vec_size = 20000000;
constexpr uint32_t lower = 1, upper = 100;
std::vector<uint32_t> A( vec_size, 0 );
randomly_populate_vector<uint32_t>(A, gen, lower, upper);
std::vector<uint32_t> B( vec_size, 0 );
randomly_populate_vector<uint32_t>(B, gen, lower, upper);
std::vector<uint32_t> C( vec_size, 0 );
randomly_populate_vector<uint32_t>(C, gen, lower, upper);
std::vector<uint32_t> D( vec_size, 0 );
randomly_populate_vector<uint32_t>(D, gen, lower, upper);
// Just a sanity check to make sure we have populated arrays with different values...
view_vector(A, "A", 1, 20);
view_vector(B, "B", 1, 20);
view_vector(C, "C", 1, 20);
view_vector(D, "D", 1, 20);
std::cout << std::endl;
// Test Case 1:
std::cout << "Test Case 1 Time Profile:\n";
{
// Time Profile:
ExecutionTimer<std::chrono::milliseconds> timer;
// Test Case 1:
for (uint32_t i = 0; i < vec_size; i++) {
A[i] *= (C[i] + D[i]);
B[i] *= (D[i] - C[i]);
}
timer.stop();
}
// Test Case 2:
std::cout << "Test Case 2 Time Profile:\n";
{
// Time Profile:
ExecutionTimer<std::chrono::milliseconds> timer;
for (uint32_t i = 0; i < vec_size; i++) {
A[i] *= (C[i] + D[i]);
}
for (uint32_t i = 0; i < vec_size; i++) {
B[i] *= (D[i] - C[i]);
}
timer.stop();
}
std::cout << std::endl;
view_vector(A, "A", 1, 20);
view_vector(B, "B", 2, 20);
}
catch (const std::exception& e) {
std::cerr << e.what() << std::endl;
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
Output
A { 32, 21, 90, 61, 5, 60, 49, 93, 7, 53, 57, 99, 52, 35, 65, 1, 12, 87, 62, 14, ... }
B { 1, 38, 12, 7, 4, 77, 14, 36, 93, 32, 11, 76, 54, 79, 53, 53, 8, 87, 51, 16, ... }
C { 6, 37, 97, 87, 2, 5, 38, 65, 57, 87, 36, 48, 97, 74, 35, 97, 98, 7, 65, 79, ... }
D { 54, 4, 96, 56, 24, 21, 25, 84, 93, 43, 89, 42, 64, 84, 45, 77, 60, 67, 89, 13, ... }
Test Case 1 Time Profile:
Stop Elapsed: 43014
Destructor Elapsed: 43014
Test Case 2 Time Profile:
Stop Elapsed: 43251
Destructor Elapsed: 43251
A { 115200, 35301, 3352410, 1247389, 3380, 40560, 194481, 2064693, 157500, 895700, 890625, 801900, 1347892, 873740, 416000, 30276, 299568, 476412, 1470392, 118496, ... }
B { 41382, 12, 6727, 1936, 19712, 2366, 12996, 120528, 61952, 30899, 2736, 58806, 7900, 5300, 21200, 11552, 313200, 29376, 69696, ... }
Code: -Test Case 2: Vectors in Dynamic Memory-
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <exception>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <limits>
#include <memory>
#include <random>
#include <sstream>
#include <string_view>
#include <utility>
#include <vector>
template<class Resolution = std::chrono::milliseconds>
class ExecutionTimer {
public:
using Clock = std::conditional_t<std::chrono::high_resolution_clock::is_steady,
std::chrono::high_resolution_clock,
std::chrono::steady_clock>;
private:
const Clock::time_point mStart = Clock::now();
public:
ExecutionTimer() = default;
~ExecutionTimer() {
const auto end = Clock::now();
std::ostringstream strStream;
strStream << "Destructor Elapsed: "
<< std::chrono::duration_cast<Resolution>(end - mStart).count()
<< std::endl;
std::cout << strStream.str() << std::endl;
}
inline void stop() {
const auto end = Clock::now();
std::ostringstream strStream;
strStream << "Stop Elapsed: "
<< std::chrono::duration_cast<Resolution>(end - mStart).count()
<< std::endl;
std::cout << strStream.str() << std::endl;
}
};
template<typename T>
void randomly_populate_vector(std::vector<T>& vec, std::mt19937& gen, T lower, T upper) {
std::uniform_int_distribution<T> dist{ lower, upper };
for (auto& e : vec) { e = dist(gen); }
}
// this function accounts for 0-based indexing and checks the upper bound...
// if the user enters 1 for lower it will index location 0 in the vector
// (the vector is passed by const reference to avoid copying it)
template<typename T>
void view_vector(const std::vector<T>& vec, std::string_view name, size_t lower = 0, size_t upper = std::numeric_limits<size_t>::max()) {
if (lower == 0) lower = 1;
if (upper > vec.size()) upper = vec.size();
std::cout << name << " { ";
for (size_t i = lower - 1; i < upper; i++) {
std::cout << std::setw(3) << vec[i] << ", ";
}
std::cout << "... }\n";
}
int main() {
try {
std::random_device rd;
std::mt19937 gen{ rd() };
const std::size_t vec_size = 20000000;
constexpr uint32_t lower = 1, upper = 100;
std::unique_ptr<std::vector<uint32_t>> A( new std::vector<uint32_t>( vec_size, 0 ) );
randomly_populate_vector<uint32_t>(*A.get(), gen, lower, upper);
std::unique_ptr<std::vector<uint32_t>> B( new std::vector<uint32_t>(vec_size, 0 ) );
randomly_populate_vector<uint32_t>(*B.get(), gen, lower, upper);
std::unique_ptr<std::vector<uint32_t>> C(new std::vector<uint32_t>(vec_size, 0));
randomly_populate_vector<uint32_t>(*C.get(), gen, lower, upper);
std::unique_ptr<std::vector<uint32_t>> D(new std::vector<uint32_t>(vec_size, 0));
randomly_populate_vector<uint32_t>(*D.get(), gen, lower, upper);
// Just a sanity check to make sure we have populated arrays with different values...
view_vector(*A.get(), "A", 1, 20);
view_vector(*B.get(), "B", 1, 20);
view_vector(*C.get(), "C", 1, 20);
view_vector(*D.get(), "D", 1, 20);
std::cout << std::endl;
// Test Case 1:
std::cout << "Test Case 1 Time Profile:\n";
{
// Time Profile:
ExecutionTimer<std::chrono::milliseconds> timer;
// Test Case 1:
for (uint32_t i = 0; i < vec_size; i++) {
(*A.get())[i] *= ((*C.get())[i] + (*D.get())[i]);
(*B.get())[i] *= ((*D.get())[i] - (*C.get())[i]);
}
timer.stop();
}
// Test Case 2:
std::cout << "Test Case 2 Time Profile:\n";
{
// Time Profile:
ExecutionTimer<std::chrono::milliseconds> timer;
for (uint32_t i = 0; i < vec_size; i++) {
(*A.get())[i] *= ((*C.get())[i] + (*D.get())[i]);
}
for (uint32_t i = 0; i < vec_size; i++) {
(*B.get())[i] *= ((*D.get())[i] - (*C.get())[i]);
}
timer.stop();
}
std::cout << std::endl;
view_vector(*A.get(), "A", 1, 20);
view_vector(*B.get(), "B", 2, 20);
}
catch (const std::exception& e) {
std::cerr << e.what() << std::endl;
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
Output
A { 40, 84, 42, 66, 22, 70, 32, 51, 48, 13, 59, 28, 69, 59, 13, 86, 85, 33, 59, 11, ... }
B { 80, 76, 57, 21, 24, 21, 74, 14, 53, 75, 5, 96, 4, 47, 79, 73, 92, 4, 90, 3, ... }
C { 30, 49, 96, 46, 16, 46, 22, 46, 26, 83, 83, 23, 84, 70, 63, 69, 48, 32, 92, 8, ... }
D { 59, 90, 74, 2, 77, 84, 35, 81, 58, 78, 57, 40, 54, 8, 74, 15, 11, 70, 81, 27, ... }
Test Case 1 Time Profile:
Stop Elapsed: 54735
Destructor Elapsed: 54736
Test Case 2 Time Profile:
Stop Elapsed: 54098
Destructor Elapsed: 54099
A { 316840, 1622964, 1213800, 152064, 190278, 1183000, 103968, 822579, 338688, 336973, 1156400, 111132, 1314036, 358956, 243997, 606816, 295885, 343332, 1765811, 13475, ... }
B { 127756, 27588, 40656, 89304, 30324, 12506, 17150, 54272, 1875, 3380, 27744, 3600, 180668, 9559, 212868, 125948, 5776, 10890, 1083, ... }
The output above was generated with Visual Studio 2017 (C++17), Debug x64, with no optimizations.
As we can see from both sample programs, the timings for the two cases are relatively close, even with 20 million elements and only two to three operations per element: a multiplication, an addition or subtraction, and an assignment.
On my machine, an Intel Core 2 Extreme quad-core at 3.0 GHz with 8 GB of RAM running Windows 7 64-bit, I was getting the following values (in milliseconds):
         Case 1   Case 2
Stack:    43014    43251
Heap:     54735    54098
In the first program, where the vectors were on the stack, the two separate loops were slightly slower than the single loop, but even with 20,000,000 elements and only a handful of operations per element we can consider this negligible.
Even in the second program, where the vectors themselves live on the heap (in dynamic memory), the difference between the two cases is still small; here, however, Case 2 with the separate for loops performed better.
I suspect this trend would continue as the computations, function calls, or memory manipulation performed on these objects inside the corresponding for loops become more complex.
I have yet to run both scenarios in Release mode with full optimizations on... but even in Debug mode we can see a slight performance gain by separating the work done on our objects into different for loops, as opposed to trying to do it all in a single loop.
Right now the elements within these vectors are just simple integral values. Imagine if they were complex objects or structures: they might call a function with multiple branches, query a queue, or wait on a thread. I tend to think that Case 1 will get worse the more complex the objects are, the larger the container grows, and the more objects or containers you work on within the same loop.
It may seem counterintuitive to use separate loops, but I believe this can be a performance gain. If your data sets are small and your data structures are simple, this won't necessarily apply, since the time differences are almost negligible; but if you are doing number crunching on large data sets with complex operations, such as you would see in sound processing, then this might be something to consider.
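To illustrate the kind of situation I have in mind, here is a minimal, hypothetical sketch (not the benchmark code above): process_audio_frame and update_meter are made-up stand-ins for heavier per-element work, and fused/split mirror Case 1 and Case 2 respectively.
#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical heavier per-element work, standing in for the simple
// multiply/add/subtract used in the benchmarks above.
double process_audio_frame(double sample) {
    return std::sin(sample) * 0.5 + std::cos(sample * 2.0) * 0.25;
}

double update_meter(double sample) {
    return std::sqrt(std::fabs(sample)) + 1.0;
}

// Case 1: one fused loop touching A, B, C and D every iteration.
void fused(std::vector<double>& A, std::vector<double>& B,
           const std::vector<double>& C, const std::vector<double>& D) {
    for (std::size_t i = 0; i < A.size(); ++i) {
        A[i] *= process_audio_frame(C[i] + D[i]);
        B[i] *= update_meter(D[i] - C[i]);
    }
}

// Case 2: two separate loops, each streaming through fewer containers at once.
void split(std::vector<double>& A, std::vector<double>& B,
           const std::vector<double>& C, const std::vector<double>& D) {
    for (std::size_t i = 0; i < A.size(); ++i) {
        A[i] *= process_audio_frame(C[i] + D[i]);
    }
    for (std::size_t i = 0; i < B.size(); ++i) {
        B[i] *= update_meter(D[i] - C[i]);
    }
}

int main() {
    const std::size_t n = 1000000; // smaller than the benchmarks above, purely illustrative
    std::vector<double> A(n, 1.0), B(n, 1.0), C(n, 0.25), D(n, 0.75);
    fused(A, B, C, D);
    split(A, B, C, D);
}
The same kind of timing harness used above could be wrapped around each call to compare the two forms.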
I already have an idea as to why this is, but I would like to hear the community's response, position, and understanding of it... What do you think the root cause is?
EDIT
Here are the printed results running in Release mode with /O2 optimizations on...
Case 1 (vectors on the stack)
A { 31, 57, 72, 19, 12, 3, 43, 11, 66, 46, 25, 74, 72, 87, 22, 31, 78, 80, 12, 94, ... }
B { 60, 26, 69, 58, 2, 15, 74, 100, 61, 44, 94, 94, 48, 35, 32, 6, 74, 52, 4, 40, ... }
C { 28, 81, 98, 31, 65, 39, 97, 31, 89, 64, 35, 47, 35, 37, 79, 20, 86, 70, 80, 43, ... }
D { 46, 36, 69, 35, 24, 62, 11, 41, 36, 17, 91, 34, 94, 70, 7, 66, 32, 98, 18, 21, ... }
Test Case 1 Time Profile:
Stop Elapsed: 144
Destructor Elapsed: 144
Test Case 2 Time Profile:
Stop Elapsed: 106
Destructor Elapsed: 106
A { 169756, 780273, 2008008, 82764, 95052, 30603, 501552, 57024, 1031250, 301806, 396900, 485514, 1198152, 996063, 162712, 229276, 1086072, 2257920, 115248, 385024, ... }
B { 52650, 58029, 928, 3362, 7935, 547304, 10000, 171349, 97196, 294784, 15886, 167088, 38115, 165888, 12696, 215784, 40768, 15376, 19360, ... }
Case 2 (vectors in dynamic memory)
A { 31, 68, 94, 29, 10, 32, 37, 11, 50, 6, 18, 62, 88, 36, 10, 48, 19, 38, 49, 1, ... }
B { 8, 19, 8, 3, 26, 97, 80, 82, 54, 31, 80, 95, 68, 84, 82, 1, 59, 61, 60, 35, ... }
C { 80, 69, 14, 59, 42, 85, 14, 40, 52, 35, 10, 28, 82, 62, 84, 97, 70, 1, 54, 92, ... }
D { 74, 27, 46, 15, 13, 5, 17, 60, 33, 25, 59, 84, 53, 14, 98, 55, 62, 49, 42, 45, ... }
Test Case 1 Time Profile:
Stop Elapsed: 139
Destructor Elapsed: 139
Test Case 2 Time Profile:
Stop Elapsed: 104
Destructor Elapsed: 104
A { 735196, 626688, 338400, 158804, 30250, 259200, 35557, 110000, 361250, 21600, 85698, 777728, 1603800, 207936, 331240, 1108992, 331056, 95000, 451584, 18769, ... }
B { 33516, 8192, 5808, 21866, 620800, 720, 32800, 19494, 3100, 192080, 297920, 57188, 193536, 16072, 1764, 3776, 140544, 8640, 77315, ... }
In Release mode with optimizations on, we can see the following (times in milliseconds):
         Case 1   Case 2
Stack:      144      106
Heap:       139      104
Now we can see that Case 2, with the independent loops, performs better. Again, what do you think the cause of this is, and why?