Believe it or not, it can make a huge difference (even accounting for cache-hits making access of the same element over and over very efficient).
Here's some test code:
#include <vector>
#include <cstdint>
#include <iostream>
// Read the CPU time-stamp counter (x86/x86-64 only; GCC/Clang inline asm).
// RDTSC delivers the 64-bit counter split across EDX:EAX.
// NOTE: RDTSC is not a serializing instruction, so out-of-order execution
// can skew very short measurements; for multi-second loops like the ones
// below that skew is negligible.
static inline std::uint64_t RDTSC()
{
// Use the fixed-width std:: types consistently (bare uint64_t is not
// guaranteed to be declared by <cstdint>).
std::uint32_t hi, lo;
__asm__ volatile("rdtsc" : "=a" (lo), "=d" (hi));
return (static_cast<std::uint64_t>(hi) << 32) | lo;
}
// V1 (naive): re-reads v[5] from the vector on every iteration before
// appending it to ov.
// `iterations` generalizes the original hard-coded 100,000,000 loop count;
// the default preserves the original behavior for existing callers.
void v1(const std::vector<double> & v, std::vector<double> & ov, long long iterations = 100000000)
{
for(long long i=0 ; i<iterations ; ++i)
ov.push_back(v[5]);
}
// V2: reads v[5] once into a local, then appends the local each iteration —
// the access pattern being benchmarked against V1.
// `iterations` generalizes the original hard-coded 100,000,000 loop count;
// the default preserves the original behavior for existing callers.
void v2(const std::vector<double> & v, std::vector<double> & ov, long long iterations = 100000000)
{
auto fixed_var = v[5]; // single read, hoisted out of the loop
for(long long i=0 ; i<iterations ; ++i)
ov.push_back(fixed_var);
}
// V3: identical to V2 except the hoisted local is declared const double —
// included to check whether const-ness changes codegen/timing (it doesn't,
// per the results quoted below the code).
// `iterations` generalizes the original hard-coded 100,000,000 loop count;
// the default preserves the original behavior for existing callers.
void v3(const std::vector<double> & v, std::vector<double> & ov, long long iterations = 100000000)
{
const double fixed_var = v[5]; // single read, hoisted and const
for(long long i=0 ; i<iterations ; ++i)
ov.push_back(fixed_var);
}
// Thrash the CPU caches by repeatedly overwriting a buffer far larger than
// L1/L2/L3, so the next timed run starts cold.
// `cache_size` (bytes) and `passes` generalize the original hard-coded
// 256 MiB / 48 passes; the defaults preserve the original behavior.
void flush_cache(long long cache_size = 256LL*1024*1024, int passes = 48)
{
//Flush L1 and L2 cache by thrashing it with garbage.
//std::vector replaces the original raw new[]/delete[] (RAII, exception safe).
std::vector<char> garbage(static_cast<std::vector<char>::size_type>(cache_size));
for(int i=0 ; i < passes; ++i)
{
for(long long j=0 ; j<cache_size ; j++)
// The original stored i*j, which is computed in int and overflows
// (signed overflow = UB) once i*j exceeds 2^31-1 — reached well
// within 48 passes over 256 MiB. i+j is computed in long long,
// never overflows, and serves the same "write varying garbage" role.
garbage[static_cast<std::vector<char>::size_type>(j)] = static_cast<char>(i + j);
// Read one byte back through a volatile so the compiler cannot prove
// the stores are dead and delete the whole thrashing loop.
if (!garbage.empty())
{
volatile char sink = garbage[garbage.size() / 2];
(void)sink;
}
}
std::cout << "flushed cache\n";
}
// Benchmark driver: builds a 10M-element input vector, then times V1
// (naive re-read), V2, and V3 (hoisted read) with RDTSC, flushing the
// cache between runs so each starts cold.  V2/V3 are reported as a
// fraction of V1's cycle count.
int main(void)
{
std::vector<double> v;
std::vector<double> ov;
v.reserve(10000000); // one allocation instead of repeated regrowth
for(int i=0 ; i<10000000 ; ++i)
{
// BUG FIX: the original pushed i/(i+1000000) — pure integer division,
// which is 0 for every i (since i < i+1000000), so every element was
// 0.0.  Promote to double first to get the intended fraction in [0,1).
v.push_back(static_cast<double>(i)/(i+1000000));
}
//try v1 (baseline)
auto start = RDTSC();
v1(v,ov);
auto end = RDTSC();
auto v1t = end-start; // baseline cycle count; V2/V3 are normalized by this
std::cout << "V1: 1.0x\n";
//flush and clear (clear() keeps capacity, so later runs reallocate less)
ov.clear();
flush_cache();
//try v2
start = RDTSC();
v2(v,ov);
end = RDTSC();
auto v2t = end-start;
std::cout << "V2: " << ((double)v2t)/v1t << "x\n";
//flush and clear
ov.clear();
flush_cache();
//try v3
start = RDTSC();
v3(v,ov);
end = RDTSC();
auto v3t = end-start;
std::cout << "V3: " << ((double)v3t)/v1t << "x\n";
}
We see there's actually a huge difference between the naive approach and using a variable:
V1: 1.0x
flushed cache
V2: 0.221311x
flushed cache
V3: 0.222199x
I double checked the assembly to make sure stuff wasn't being optimized away since we weren't using the result.
We're also using the RDTSC instruction so timings are measured directly in CPU cycles rather than wall-clock time.
The difference between V2 and V3 isn't significant: they trade places on any given run. But it's definitely a 70-80% speed improvement not accessing the array each time.
Note that if you don't flush the L1/L2 cache between runs and run V1 after V2 and V3 they'll all have similar timings. Thus, it's really important to flush the cache.
Some more testing reveals that g++ and c++ will not optimize this away (and just store the value in a register) but Intel C++ will. Go figure...