Suppose that one has a function foo_with_mem_allocs
that operates by recursion over a vector of ints, such that it needs to allocate/delete a buffer array in at every recursion step (similar to what a merge_sort algorithm would do):
void foo_with_mem_allocs(vector<int>& v, int c)
{
if (c > 1)
{
c /= 2;
foo_with_mem_allocs(v, c);
int *buffer = new int[v.size()];
for (int i = 0; i < v.size(); i++)
{
buffer[i] = v[i];
}
for (int j = 0; j < c; j++)
v[j] = buffer[j] * c;
delete[] buffer;
}
}
Now suppose that one wants to avoid the cost of allocating/deleting the array buffer by passing the buffer to the function. My immediate idea was to implement a copy of that same function above, with a minor modification:
void foo_with_buffer(vector<int>& v, int c, vector<int>& buffer)
{
if (c > 1)
{
c /= 2;
foo_with_buffer(v, c, buffer);
for (int i = 0; i < v.size(); i++)
{
buffer[i] = v[i];
}
for (int j = 0; j < c; j++)
v[j] = buffer[j] * c;
}
}
I would expect foo_with_buffer
to be significantly faster than foo_with_mem_allocs
. However, when I run test cases and profile them, what I get is actually that they both take roughly the same amount of time to run for any given size of v
and any given value of c
. Quite often, the version with the buffer even runs slower:
I would really love to understand why the version without many memory allocations and deallocations is not going faster as I think it should. IF it helps, I tested that both compiling with Visual Studio 2015 (in Release mode, 64bits) under Windows 10 64bits and also compiling with G++ under Unix with g++ -o test test.cpp
(where test.cpp is obviously the name of the file I put the whole code in). The picture above with an example of results is from a run under Unix.
Below you can find the entire code, including the profiling test, in a directly reproducible format:
//////////////////////////////////////////////////////////////////////////////
#include <iostream>
#include <ctime>
#include <cstdlib>
#include <vector>
#include <algorithm>
using namespace std;
void foo_with_mem_allocs(vector<int>& v, int c) {
if (c > 1)
{
c /= 2;
foo_with_mem_allocs(v, c);
int *buffer = new int[v.size()];
for (int i = 0; i < v.size(); i++)
{
buffer[i] = v[i];
}
for (int j = 0; j < c; j++)
v[j] = buffer[j] * c;
delete[] buffer;
}
}
void foo_with_buffer(vector<int>& v, int c, vector<int>& buffer) {
if (c > 1)
{
c /= 2;
foo_with_buffer(v, c, buffer);
for (int i = 0; i < v.size(); i++)
{
buffer[i] = v[i];
}
for (int j = 0; j < c; j++)
v[j] = buffer[j] * c;
}
}
int main(int argc, char** argv) {
// setting a random seed using clock
srand(time(NULL));
// print out header of output information
cout << "Size\tMethod\t\t\tTicks\tSecs" << endl;
// iterate over possible input data sizes specified in the arguments.
int sz = 100000;
vector<int> v(sz);
vector<int> v1(sz);
vector<int> v2(sz);
vector<int> buffer(sz);
for (int i = 0; i < sz; ++i)
v[i] = v1[i] = v2[i] = rand();
// timing foo that does a bunch of memory allocations/deallocations
clock_t t = clock();
foo_with_mem_allocs(v1, sz);
t = clock() - t;
cout << sz << "\tWith mem alloc\t\t" << t << "\t"
<< (double)t / CLOCKS_PER_SEC << endl;
// timing foo that uses buffers to avoid memory allocations/deallocations
t = clock();
foo_with_buffer(v2, sz, buffer);
t = clock() - t;
cout << sz << "\tWith buffer\t\t" << t <<
"\t" << (double)t / CLOCKS_PER_SEC << endl;
bool changed = false;
for (int i = 0; i < v1.size(); i++)
if (v1[i] != v2[i])
changed = true;
if (changed)
std::cout << "Note: results from the two functions were different.\n" << std::endl;
else
std::cout << "Note: results from the two functions were identical.\n" << std::endl;
return 0;
}
///////////////////////////////////////////////////////////////////////////////