0

Consider the difference between the work being done and the time taken by these two variants of code:

Task: A[i] *= (C[i] + D[i]) AND B[i] *= (D[i] - C[i])

  • Test Case 1: using vectors with 20,000,000 elements where the vector objects are on the stack
  • Test Case 2: using vectors with 20,000,000 elements where the vectors are in dynamic memory
  • Case1: -Single for loop performing operations on two objects sequentially-

    for(i to n):
        A[i] *= (C[i] + D[i])
        B[i] *= (D[i] - C[i])
    
  • Case2: -Two separate for loops performing operations on each object independently-

    for(i to n):
        A[i] *= (C[i] + D[i])
    for(i to n):
        B[i] *= (D[i] - C[i])
    

Code: -Test Case 1: Vectors on Stack-

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <exception>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <limits>
#include <random>
#include <sstream>
#include <string_view>
#include <utility>
#include <vector>

template<class Resolution = std::chrono::milliseconds>
class ExecutionTimer {
public:
    // Prefer high_resolution_clock only when it is steady; otherwise fall
    // back to steady_clock so elapsed measurements are monotonic.
    using Clock = std::conditional_t<std::chrono::high_resolution_clock::is_steady,
        std::chrono::high_resolution_clock,
        std::chrono::steady_clock>;
private:
    // Start time is captured the moment the timer object is constructed.
    const Clock::time_point mStart = Clock::now();

    // Shared reporting helper: prints "<label><elapsed-ticks>\n\n", where the
    // elapsed time since construction is converted to Resolution ticks.
    void report(const char* label) const {
        const auto now = Clock::now();
        std::ostringstream out;
        out << label
            << std::chrono::duration_cast<Resolution>(now - mStart).count()
            << std::endl;
        std::cout << out.str() << std::endl;
    }

public:
    ExecutionTimer() = default;

    // Destructor reports once more on scope exit (in addition to any
    // explicit stop() calls made earlier).
    ~ExecutionTimer() { report("Destructor Elapsed: "); }

    // Reports the elapsed time so far; does NOT end the timer — the
    // destructor will still print its own report afterwards.
    inline void stop() { report("Stop Elapsed: "); }
};

// Fill every element of `vec` with an integer drawn uniformly at random
// from the inclusive range [lower, upper], using the supplied generator.
template<typename T>
void randomly_populate_vector(std::vector<T>& vec, std::mt19937& gen, T lower, T upper) {
    std::uniform_int_distribution<T> dist{ lower, upper };
    std::generate(vec.begin(), vec.end(), [&] { return dist(gen); });
}

// Prints elements [lower, upper] of `vec` (1-based, inclusive) in the form:
//   name {   a,   b,   c, ... }
// This function accounts for 0-based indexing and checks max bounds: if the
// user enters 1 (or 0) for lower it will index location 0 in the vector, and
// upper is clamped to vec.size(), so out-of-range requests are safe.
//
// FIX: `vec` is now taken by const reference. The original signature took it
// by value, which silently deep-copied the entire vector (20,000,000 elements
// in this program) on every call.
template<typename T>
void view_vector(const std::vector<T>& vec, std::string_view name, size_t lower = 0, size_t upper = std::numeric_limits<size_t>::max()) {
    if (lower == 0) lower = 1;                    // treat 0 as "start at the first element"
    if (upper > vec.size()) upper = vec.size();   // clamp to the actual size

    std::cout << name << " { ";
    for (size_t i = lower - 1; i < upper; i++) {
        std::cout << std::setw(3) << vec[i] << ", ";
    }
    std::cout << "... }\n";
}

// Benchmark driver: builds four 20,000,000-element vectors of random values
// in [1, 100], then times Test Case 1 (one fused loop updating A and B) vs
// Test Case 2 (two independent loops, one per destination vector).
int main() {
    try {
        std::random_device rd;
        std::mt19937 gen{ rd() };
        // FIX: index type below now matches this std::size_t (the original
        // used a uint32_t loop counter, a mixed-width comparison).
        constexpr std::size_t vec_size = 20000000;
        constexpr uint32_t lower = 1, upper = 100;

        std::vector<uint32_t> A( vec_size, 0 );
        randomly_populate_vector<uint32_t>(A, gen, lower, upper);

        std::vector<uint32_t> B( vec_size, 0 );
        randomly_populate_vector<uint32_t>(B, gen, lower, upper);

        std::vector<uint32_t> C( vec_size, 0 );
        randomly_populate_vector<uint32_t>(C, gen, lower, upper);

        std::vector<uint32_t> D( vec_size, 0 );
        randomly_populate_vector<uint32_t>(D, gen, lower, upper);

        // Just a sanity check to make sure we have populated arrays with
        // different values...
        view_vector(A, "A", 1, 20);
        view_vector(B, "B", 1, 20);
        view_vector(C, "C", 1, 20);
        view_vector(D, "D", 1, 20);

        std::cout << std::endl;

        // Test Case 1: single loop touching both destination vectors.
        // NOTE: when C[i] > D[i], (D[i] - C[i]) wraps around (unsigned
        // arithmetic), so B accumulates values modulo 2^32 by design.
        std::cout << "Test Case 1 Time Profile:\n";
        {
            // Time Profile: timer reports at stop() and again at scope exit.
            ExecutionTimer<std::chrono::milliseconds> timer;
            for (std::size_t i = 0; i < vec_size; i++) {
                A[i] *= (C[i] + D[i]);
                B[i] *= (D[i] - C[i]);
            }
            timer.stop();
        }

        // Test Case 2: two separate loops, one per destination vector.
        std::cout << "Test Case 2 Time Profile:\n";
        {
            // Time Profile:
            ExecutionTimer<std::chrono::milliseconds> timer;
            for (std::size_t i = 0; i < vec_size; i++) {
                A[i] *= (C[i] + D[i]);
            }
            for (std::size_t i = 0; i < vec_size; i++) {
                B[i] *= (D[i] - C[i]);
            }
            timer.stop();
        }

        std::cout << std::endl;
        view_vector(A, "A", 1, 20);
        // FIX: previously started at 2, inconsistent with every other call.
        view_vector(B, "B", 1, 20);

    }
    catch (const std::exception& e) {
        std::cerr << e.what() << std::endl;
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}

Output

A {  32,  21,  90,  61,   5,  60,  49,  93,   7,  53,  57,  99,  52,  35,  65,   1,  12,  87,  62,  14, ... }
B {   1,  38,  12,   7,   4,  77,  14,  36,  93,  32,  11,  76,  54,  79,  53,  53,   8,  87,  51,  16, ... }
C {   6,  37,  97,  87,   2,   5,  38,  65,  57,  87,  36,  48,  97,  74,  35,  97,  98,   7,  65,  79, ... }
D {  54,   4,  96,  56,  24,  21,  25,  84,  93,  43,  89,  42,  64,  84,  45,  77,  60,  67,  89,  13, ... }

Test Case 1 Time Profile:
Stop Elapsed: 43014

Destructor Elapsed: 43014

Test Case 2 Time Profile:
Stop Elapsed: 43251

Destructor Elapsed: 43251


A { 115200, 35301, 3352410, 1247389, 3380, 40560, 194481, 2064693, 157500, 895700, 890625, 801900, 1347892, 873740, 4160
00, 30276, 299568, 476412, 1470392, 118496, ... }
B { 41382,  12, 6727, 1936, 19712, 2366, 12996, 120528, 61952, 30899, 2736, 58806, 7900, 5300, 21200, 11552, 313200, 293
76, 69696, ... }

Code -Test Case 2: Vectors in dynamic memory-

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <exception>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <limits>
#include <memory>
#include <random>
#include <sstream>
#include <string_view>
#include <utility>
#include <vector>

template<class Resolution = std::chrono::milliseconds>
class ExecutionTimer {
public:
    // Prefer high_resolution_clock only when it is steady; otherwise fall
    // back to steady_clock so elapsed measurements are monotonic.
    using Clock = std::conditional_t<std::chrono::high_resolution_clock::is_steady,
        std::chrono::high_resolution_clock,
        std::chrono::steady_clock>;
private:
    // Start time is captured the moment the timer object is constructed.
    const Clock::time_point mStart = Clock::now();

    // Shared reporting helper: prints "<label><elapsed-ticks>\n\n", where the
    // elapsed time since construction is converted to Resolution ticks.
    void report(const char* label) const {
        const auto now = Clock::now();
        std::ostringstream out;
        out << label
            << std::chrono::duration_cast<Resolution>(now - mStart).count()
            << std::endl;
        std::cout << out.str() << std::endl;
    }

public:
    ExecutionTimer() = default;

    // Destructor reports once more on scope exit (in addition to any
    // explicit stop() calls made earlier).
    ~ExecutionTimer() { report("Destructor Elapsed: "); }

    // Reports the elapsed time so far; does NOT end the timer — the
    // destructor will still print its own report afterwards.
    inline void stop() { report("Stop Elapsed: "); }
};

// Fill every element of `vec` with an integer drawn uniformly at random
// from the inclusive range [lower, upper], using the supplied generator.
template<typename T>
void randomly_populate_vector(std::vector<T>& vec, std::mt19937& gen, T lower, T upper) {
    std::uniform_int_distribution<T> dist{ lower, upper };
    std::generate(vec.begin(), vec.end(), [&] { return dist(gen); });
}

// Prints elements [lower, upper] of `vec` (1-based, inclusive) in the form:
//   name {   a,   b,   c, ... }
// This function accounts for 0-based indexing and checks max bounds: if the
// user enters 1 (or 0) for lower it will index location 0 in the vector, and
// upper is clamped to vec.size(), so out-of-range requests are safe.
//
// FIX: `vec` is now taken by const reference. The original signature took it
// by value, which silently deep-copied the entire vector (20,000,000 elements
// in this program) on every call.
template<typename T>
void view_vector(const std::vector<T>& vec, std::string_view name, size_t lower = 0, size_t upper = std::numeric_limits<size_t>::max()) {
    if (lower == 0) lower = 1;                    // treat 0 as "start at the first element"
    if (upper > vec.size()) upper = vec.size();   // clamp to the actual size

    std::cout << name << " { ";
    for (size_t i = lower - 1; i < upper; i++) {
        std::cout << std::setw(3) << vec[i] << ", ";
    }
    std::cout << "... }\n";
}

// Benchmark driver (heap variant): the four vectors are themselves
// heap-allocated through std::unique_ptr, then the same two test cases as
// the stack variant are timed.
int main() {
    try {
        std::random_device rd;
        std::mt19937 gen{ rd() };
        // FIX: index type below now matches this std::size_t (the original
        // used a uint32_t loop counter, a mixed-width comparison).
        constexpr std::size_t vec_size = 20000000;
        constexpr uint32_t lower = 1, upper = 100;

        // FIX: std::make_unique replaces the raw `new` calls (exception-safe,
        // no naked new), and plain dereference (*A) replaces the redundant
        // and noisy *A.get() — a unique_ptr dereferences directly.
        auto A = std::make_unique<std::vector<uint32_t>>(vec_size, 0);
        randomly_populate_vector<uint32_t>(*A, gen, lower, upper);

        auto B = std::make_unique<std::vector<uint32_t>>(vec_size, 0);
        randomly_populate_vector<uint32_t>(*B, gen, lower, upper);

        auto C = std::make_unique<std::vector<uint32_t>>(vec_size, 0);
        randomly_populate_vector<uint32_t>(*C, gen, lower, upper);

        auto D = std::make_unique<std::vector<uint32_t>>(vec_size, 0);
        randomly_populate_vector<uint32_t>(*D, gen, lower, upper);

        // Just a sanity check to make sure we have populated arrays with
        // different values...
        view_vector(*A, "A", 1, 20);
        view_vector(*B, "B", 1, 20);
        view_vector(*C, "C", 1, 20);
        view_vector(*D, "D", 1, 20);

        std::cout << std::endl;

        // Test Case 1: single loop touching both destination vectors.
        // NOTE: the per-access dereference (*A)[i] is kept inside the timed
        // loops so the measurement matches the original experiment.
        std::cout << "Test Case 1 Time Profile:\n";
        {
            // Time Profile: timer reports at stop() and again at scope exit.
            ExecutionTimer<std::chrono::milliseconds> timer;
            for (std::size_t i = 0; i < vec_size; i++) {
                (*A)[i] *= ((*C)[i] + (*D)[i]);
                (*B)[i] *= ((*D)[i] - (*C)[i]);
            }
            timer.stop();
        }

        // Test Case 2: two separate loops, one per destination vector.
        std::cout << "Test Case 2 Time Profile:\n";
        {
            // Time Profile:
            ExecutionTimer<std::chrono::milliseconds> timer;
            for (std::size_t i = 0; i < vec_size; i++) {
                (*A)[i] *= ((*C)[i] + (*D)[i]);
            }
            for (std::size_t i = 0; i < vec_size; i++) {
                (*B)[i] *= ((*D)[i] - (*C)[i]);
            }
            timer.stop();
        }

        std::cout << std::endl;
        view_vector(*A, "A", 1, 20);
        // FIX: previously started at 2, inconsistent with every other call.
        view_vector(*B, "B", 1, 20);

    }
    catch (const std::exception& e) {
        std::cerr << e.what() << std::endl;
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}

Output

A {  40,  84,  42,  66,  22,  70,  32,  51,  48,  13,  59,  28,  69,  59,  13,  86,  85,  33,  59,  11, ... }
B {  80,  76,  57,  21,  24,  21,  74,  14,  53,  75,   5,  96,   4,  47,  79,  73,  92,   4,  90,   3, ... }
C {  30,  49,  96,  46,  16,  46,  22,  46,  26,  83,  83,  23,  84,  70,  63,  69,  48,  32,  92,   8, ... }
D {  59,  90,  74,   2,  77,  84,  35,  81,  58,  78,  57,  40,  54,   8,  74,  15,  11,  70,  81,  27, ... }

Test Case 1 Time Profile:
Stop Elapsed: 54735

Destructor Elapsed: 54736

Test Case 2 Time Profile:
Stop Elapsed: 54098

Destructor Elapsed: 54099


A { 316840, 1622964, 1213800, 152064, 190278, 1183000, 103968, 822579, 338688, 336973, 1156400, 111132, 1314036, 358956,
 243997, 606816, 295885, 343332, 1765811, 13475, ... }
B { 127756, 27588, 40656, 89304, 30324, 12506, 17150, 54272, 1875, 3380, 27744, 3600, 180668, 9559, 212868, 125948, 5776, 10890, 1083, ... }

The following output was generated using Visual Studio 2017 using C++17 in Debug x64 with no optimizations.

As we can see from the two sample tests, with both test cases the time variations are relatively close even with 20 million samples, while performing 2-3 operations on each of them... a multiplication, an addition or subtraction, and an assignment.

The time profiles on my Intel Core 2 Duo Quad Core Extreme with a 3.0Ghz processor with 8GB Ram on a Windows 7 - 64bit system... I was getting this range of values...

       case1   case2
Stack: 43014   43251
Heap:  54735   54098

In the first example, having multiple loops was slightly slower when the vectors were on the stack, but even with 20,000,000 elements and only a handful of operations on each element we can consider this negligible.

Even in the second example where the vectors themselves were on the heap or in dynamic memory... the difference between the two test cases is still relatively similar, however, in this case it appears that case 2 with the multiple for loops performed better.

I suspect that this trend would continue if the computations, functions calls, or memory manipulation on these objects were quite a bit more complex within the corresponding for loops.

I have yet to run both scenarios in release mode with full optimizations on... but even in debug mode we can see a slight performance gain by separating the work being done on our objects into different for loops as opposed to trying to do them all in a single loop.

Now, the actual elements within these vectors are just simple integral values... Imagine if these were complex objects or structures, where they might have been calling some function and that function might have multiple branches, or if one of those objects is querying from a queue, or is waiting on a thread... I tend to think that case 1 will become worse over time the more complex the objects are, as well as the size of the container, and how many other objects or containers you are working on within the same loop...

It may seem counterintuitive to have separate loops, but I believe this can be a performance gain. Now if your data sets are small, and your data structures are simple, then this wouldn't necessarily apply as the time differences are almost negligible, but if you are doing number crunching on large data sets with complex operations such as you would see in sound processing, then this might be something to consider!

I already have an idea as to why this is, but I would like to hear the community's response, position, and understanding in regard to this one... What do you think the root cause of this is?



EDIT

Here are the printed results running in Release mode with /O2 optimizations on...

Case 1

A {  31,  57,  72,  19,  12,   3,  43,  11,  66,  46,  25,  74,  72,  87,  22,  31,  78,  80,  12,  94, ... }
B {  60,  26,  69,  58,   2,  15,  74, 100,  61,  44,  94,  94,  48,  35,  32,   6,  74,  52,   4,  40, ... }
C {  28,  81,  98,  31,  65,  39,  97,  31,  89,  64,  35,  47,  35,  37,  79,  20,  86,  70,  80,  43, ... }
D {  46,  36,  69,  35,  24,  62,  11,  41,  36,  17,  91,  34,  94,  70,   7,  66,  32,  98,  18,  21, ... }

Test Case 1 Time Profile:
Stop Elapsed: 144

Destructor Elapsed: 144

Test Case 2 Time Profile:
Stop Elapsed: 106

Destructor Elapsed: 106


A { 169756, 780273, 2008008, 82764, 95052, 30603, 501552, 57024, 1031250, 301806, 396900, 485514, 1198152, 996063, 16271
2, 229276, 1086072, 2257920, 115248, 385024, ... }
B { 52650, 58029, 928, 3362, 7935, 547304, 10000, 171349, 97196, 294784, 15886, 167088, 38115, 165888, 12696, 215784, 40
768, 15376, 19360, ... }

Case 2

A {  31,  68,  94,  29,  10,  32,  37,  11,  50,   6,  18,  62,  88,  36,  10,  48,  19,  38,  49,   1, ... }
B {   8,  19,   8,   3,  26,  97,  80,  82,  54,  31,  80,  95,  68,  84,  82,   1,  59,  61,  60,  35, ... }
C {  80,  69,  14,  59,  42,  85,  14,  40,  52,  35,  10,  28,  82,  62,  84,  97,  70,   1,  54,  92, ... }
D {  74,  27,  46,  15,  13,   5,  17,  60,  33,  25,  59,  84,  53,  14,  98,  55,  62,  49,  42,  45, ... }

Test Case 1 Time Profile:
Stop Elapsed: 139

Destructor Elapsed: 139

Test Case 2 Time Profile:
Stop Elapsed: 104

Destructor Elapsed: 104


A { 735196, 626688, 338400, 158804, 30250, 259200, 35557, 110000, 361250, 21600, 85698, 777728, 1603800, 207936, 331240,
 1108992, 331056, 95000, 451584, 18769, ... }
B { 33516, 8192, 5808, 21866, 620800, 720, 32800, 19494, 3100, 192080, 297920, 57188, 193536, 16072, 1764, 3776, 140544,
 8640, 77315, ... }

In Release mode and with optimizations on we can see the following:

       case1   case2
Stack: 144     106
Heap:  139     104

Now we can see that Case 2, the independent loops are performing better. Again, what do you think it is and why?

Francis Cugler
  • 7,788
  • 2
  • 28
  • 59
  • Performance tests without optimizations are not meaningful. Rerun them with optimization. – mch Jul 23 '20 at 09:12
  • @mch The concept here that I'm after has an impact regardless of optimization settings. The underlying performance issue where the same work is being done, but in two different ways, it doesn't even involve software or the hardware for that matter! It's actually a mathematical and physics problem! – Francis Cugler Jul 23 '20 at 09:14
  • @mch however as your request, I added the results to the bottom of the original question under the edit section... – Francis Cugler Jul 23 '20 at 09:27
  • You first state "we can consider this negligible" about the performance difference between the two test cases, and then try to draw conclusions from that difference ? Noise is just noise - no matter how hard you try to derive meaning from it. – Sander De Dycker Jul 23 '20 at 09:31
  • Did you repeat the experiment large number of times, what is the statistical significance of the hypothesis the x is better than y? (What satistical test did you run? What is the p value? I recommend starting with wilcoxon signed ranked test unless you have good explanation why a different test would be better) – amit Jul 23 '20 at 09:32
  • @amit I already stated that I have an idea as to why. I'm asking the community why they think so! I'll give you a hint, it doesn't even involve software or hardware for that matter! It's an algorithmic problem! – Francis Cugler Jul 23 '20 at 09:48
  • @FrancisCugler What I am trying to say, before claiming (in general) X is better than Y, and backing this claim by empirical results, the empirical results should be statistical significant, otherwise - they are worseless. It does NOT mean this is not a real issue. It only means the empirical results should not be trusted without getting a significance. – amit Jul 23 '20 at 09:52
  • @Sander De Dycker the difference is so minuscule as it is in fractions of a second difference with a data set of a couple million. There are background processes running and other things happening within the OS. However, there is a difference between the two methods or variants of the for loops. But in general case 2 will perform better especially the more complex the computations are within the loops. – Francis Cugler Jul 23 '20 at 09:53
  • Have you tried switching the two loops? as caching could be an issue with so small tests examples. – Surt Jul 23 '20 at 10:01
  • @Surt, I'm not having any issues with profiling or testing... I stated I already know what the answer is or at least why I believe it is so. I was asking the community in general as to why they would think one method is faster than the other... – Francis Cugler Jul 23 '20 at 10:03
  • When I run you shown code here, Test Case 1 is always faster then Test Case 2 – t.niese Jul 23 '20 at 10:03
  • @t.niese could be your compiler or OS... but in general independent for loops should be faster than single for loops working on multiple data. However, compilers these days are very sophisticated and they might be doing tricks in generating assembly that you and I would never think of... – Francis Cugler Jul 23 '20 at 10:04
  • If you guys and gals are not sure as to what I'm getting at, you can follow the link in my provided answer below... – Francis Cugler Jul 23 '20 at 10:11
  • @FrancisCugler : You're asserting that one is faster than the other without presenting the proof for it. Furthermore, you're claiming you have an explanation for that difference without presenting it. The comments here are asking you to add the missing details in order to make this answerable. – Sander De Dycker Jul 23 '20 at 10:18
  • 1
    @FrancisCugler I tested it with gcc and clang with various versions and settings. And for none of these, I can reproduce the results you show. In Debug both are nearly identical, at one run Case 1 is faster and at the other run Case 2 is faster. In Release - for my setup - Case 1 is always faster than Case 2. Given that, the different results are less likely related to the loop, then other effects. (Changes in the ENV variables, e.g. a different path in which the app runs, will have an effect on the memory layout and can lead to such differences) – t.niese Jul 23 '20 at 10:27
  • @t.niese yes, modern pcs are amazing in what they can do, how they can optimize code, etc... compilers are better than they have ever been... and stuff just keeps on improving. However, there is a mathematical proof as to why it is faster to have multiple for loops each working on their own independent dataset than there is having a single loop working on multiple datasets. I provided this proof as an answer to another similar -related question. You can find that here: https://stackoverflow.com/a/41937746/1757805 – Francis Cugler Jul 23 '20 at 10:41
  • @FrancisCugler I won't deny that there are cases where two loops are better than one due to cache misses and other effects. But as always it depends on the exact use-case and architecture. And what is the point you want to know in that case, if it contradicts observations? You say that CPUs are getting better, fine, but if they are then able to handle case 1 better, then case 2 (and yes I also created tests in which I ensured that the loops persisted that way in the final machine code), what is the mathematical proof worth then? – t.niese Jul 23 '20 at 11:10
  • @t.niese Just something to be aware of consciously and subconsciously... If the data sets are simple and on the stack and you have great cache hits, then this won't be an issue. However, if your data sets are on the heap through dynamic memory allocations, this can become an issue. It's an interpolation bottleneck that comes from the fact of having to move from dataset `A` to `B` and back to `A` on each iteration. If they are billions of addresses apart and these containers are large, this will make an impact. So when working with heap objects, seperate your loops per dataset! – Francis Cugler Jul 23 '20 at 13:06
  • @t.niese Now this bottleneck or the time difference is minimal and the movement from `A` to `B` and back to `A` only accounts for about 20% - 30% of the overall time complexity of their respective line of instructions... This percentage will vary depending on the nature of `C` and `D` and what kind of tasks they have to perform. – Francis Cugler Jul 23 '20 at 13:09
  • @FrancisCugler Let's leave aside for a moment that the specification has neither stack nor heap, and consider only common implementations: For both of your examples `Test Case 1` and `Test Case 2` the data hold by the vectors are on the heap. Stack and heap are the same memory and a concept of the runtime environment, so the have the same problem regarding cache misses. If the vectors are on the heap and the compiler does not notice that the pointer to the data won't change then you might have cache misses due to the indirection, but the cache misses for the data is likely similar. – t.niese Jul 23 '20 at 14:32
  • @FrancisCugler depending on the CPU Architecture a single loop could for the given code have less cache misses, because `(C[i] + D[i])` and `(D[i] - C[i])` access the same area in memory for `C` and `D` so `(D[i] - C[i])` could profit from `(C[i] + D[i])`. – t.niese Jul 23 '20 at 14:41

1 Answers1

-1

People are beginning to argue with me on semantics instead of reading the question. Yes, the sample set that I provided is very small, and the differences are minute in this simple test case.

When I say simple I mean the data types within the containers are simple integers and not complex structures that may or may not be calling other functions, managing or accessing memory, sitting in a priority queue, waiting on threads, etc...

There is an underlying phenomenon here as to why independent for loops working on a single data set will perform better than a single for loop working on multiple data sets.

I know why this is and I'll give you a hint or a clue. It doesn't even involve or pertain to any hardware, or software. The problem here is an algorithmic problem. It is a mathematical - physics-based problem.

I won't go into full detail here to prove why this is as I have already stated this before as an answer to a different question here on Stackoverflow... I'll just provide the link to my answer there. You can find it here!



Edit

To give a short version of that post.

We have to consider their locations relative to each other. We have to consider the scope that they are within.

In the first case, with the single for loop performing operations on multiple data sets: A is at location x and B is at location y in memory, regardless of whether it is in the cache, RAM, or somewhere else. We know that x and y have some defined distance from each other.

Code execution within the scope has to travel first to A and has to wait on C and D to finish their jobs to return the value back to A, then execution has to travel from A to B then wait again on C and D to finish their jobs to return the value back to B. Then on the next iteration, it has to travel from B back to A. This has to be performed on each and every iteration of the for-loop.

Now let's consider the second case where there is an independent loop for each dataset...

Code execution has to travel to A for the scope of this loop, then has to wait for C and D to finish their jobs to return their value back to A. Code execution doesn't have to move from here as it is already at A. Now we just repeat this for every iteration of the loop. Once this loop goes out of scope, execution continues to the next loop. It has to travel only one time from A to B, then it waits on C and D to finish their jobs to return their value back to B, and again there is no movement here as it is already at B and we just continue with the iterations of the loop.

So when we look at these two types of looping structure:

for(i to n) {
    A[i] = C[i] op D[i]; // execution has to move from A's location 
    B[i] = C[i] op D[i]; // to B's location and back to A's on every iteration.
}

This will create a bottleneck. It is marginal and almost negligible when working with data objects and data sets that are on the stack locally and when their values are cached frequently. However, when these structures become more complex, and the memory is no longer within the cache (close and fast) and is now in the main memory where it could be several hundred addresses to billions of addresses away. We end up having an interpolation issue here from the first line of execution within this loop to the second.

The time difference here is linear in that it will grow linearly as the container grows in size, as it is proportional to the size of the container... To prevent this, just move each independent data set into its own independent loop and scope.

for ( i to n ) {
    A[i] = C[i] op D[i] // Travels to A just once, then continues iteration.
}      

for ( i to n ) {
    B[i] = C[i] op D[i] // Travels from A to B just once, then continues iteration.
}

The underlying issue here has nothing to do with hardware and software at all. Do they play a role? Of course! But this interpolation bottleneck is apparent from the mathematics of the algorithm and the physics of actual motion. However, in modern computer systems with modern operating systems and compilers... they have become so advanced that they can literally, behind the scenes, generate the appropriate "assembly" instructions for you to prevent or to minimize code structures such as this; however, that may not always be the case! I just think that this is something worth being consciously aware of when designing and implementing your algorithms, especially if it is library or API code. You don't know how your users are going to use it.

Francis Cugler
  • 7,788
  • 2
  • 28
  • 59
  • Please use the edit link on your question to add additional information. The Post Answer button should be used only for complete answers to the question. - [From Review](/review/low-quality-posts/26759016) – nvoigt Jul 23 '20 at 12:21
  • @nvoigt I did what you suggested by making this answer complete. I summarized my position from the post that I provided. It's basically the short version of that answer. – Francis Cugler Jul 23 '20 at 13:12
  • 3
    I don't really understand, is this an answer to your question or a part/explanation of your question? – nvoigt Jul 23 '20 at 14:03