Here is a small C++ example; I have confirmed the same behavior in Java.
This example code reproduces the behavior when compiled with Visual Studio 2019 (Release, x64). I got:
611 ms for just incrementing the element.
631 ms for incrementing the element through the cache, i.e. an additional 20 ms of overhead.
But when I add a heavy operation before each increment (I chose random number generation), I get:
2073 ms for just incrementing the element.
1432 ms for incrementing the element using the cache.
I have an Intel 10700K CPU and 3200 MHz RAM, if it matters.
#include <iostream>
#include <random>
#include <chrono>
#include <cstdlib>
// Benchmark dimensions, expressed as typed constants instead of macros.
// The previous `#define ARR_SIZE 256 * 256 * 256` form expanded textually
// without parentheses, so expressions such as `x % ARR_SIZE` or
// `ARR_SIZE / ACCESS_SIZE` silently evaluated to the wrong value.
constexpr int ARR_SIZE = 256 * 256 * 256;  // element count of the large counter array
constexpr int ACCESS_SIZE = 256 * 256;     // number of random indices visited per pass
constexpr int CACHE_SIZE = 1024;           // capacity of the pending-index buffer
constexpr int ITERATIONS = 1000;           // passes over the access list per timed scenario
using namespace std;
using chrono::high_resolution_clock;
using chrono::duration_cast;
using chrono::milliseconds;
// Array of counters being incremented; main() allocates ARR_SIZE ints for it.
int* arr;
// Buffer of indices waiting to be applied to arr; main() allocates CACHE_SIZE ints for it.
int* cache;
// Number of indices currently stored in cache; incWithCache() advances it and flushCache() resets it to 0.
int counter = 0;
/// Applies every pending index stored in `cache` to `arr`, then empties the
/// buffer by resetting `counter` to 0.
///
/// Only the first `counter` slots hold indices recorded since the previous
/// flush, so only those are replayed. Replaying all CACHE_SIZE slots would
/// re-apply stale indices from the previous batch whenever the buffer is
/// flushed while not completely full, e.g. the flush issued at the end of
/// each pass over the access list, which would count those elements twice.
void flushCache() {
    const int pending = counter;
    for (int j = 0; j < pending; ++j) {
        ++arr[cache[j]];
    }
    counter = 0;
}
/// Defers the increment of element `i`: the index is appended to the pending
/// buffer, and the buffer is handed to flushCache() as soon as it holds
/// CACHE_SIZE entries.
void incWithCache(int i) {
    cache[counter++] = i;
    if (counter != CACHE_SIZE) {
        return;  // still room in the buffer, nothing to apply yet
    }
    flushCache();
}
/// Increments element `i` of `arr` immediately, with no buffering.
void incWithoutCache(int i) {
    arr[i] += 1;
}
/// Filler work executed between increments in the "time between" scenarios:
/// draws one value from the C library generator and reduces it modulo 107.
/// @return a value in the range [0, 106].
int heavyOp() {
    const int draw = std::rand();
    return draw % 107;
}
void main()
{
arr = new int[ARR_SIZE];
cache = new int[CACHE_SIZE];
int* access = new int[ACCESS_SIZE];
random_device rd;
mt19937 gen(rd());
for (int i = 0; i < ACCESS_SIZE; ++i) {
access[i] = gen() % (ARR_SIZE);
}
for (int i = 0; i < ARR_SIZE; ++i) {
arr[i] = 0;
}
auto t1 = high_resolution_clock::now();
for (int iter = 0; iter < ITERATIONS; ++iter) {
for (int i = 0; i < ACCESS_SIZE; ++i) {
incWithoutCache(access[i]);
}
}
auto t2 = high_resolution_clock::now();
auto ms_int = duration_cast<milliseconds>(t2 - t1);
cout << "Time without cache " << ms_int.count() << "ms\n";
t1 = high_resolution_clock::now();
for (int iter = 0; iter < ITERATIONS; ++iter) {
for (int i = 0; i < ACCESS_SIZE; ++i) {
incWithCache(access[i]);
}
flushCache();
}
t2 = high_resolution_clock::now();
ms_int = duration_cast<milliseconds>(t2 - t1);
cout << "Time with cache " << ms_int.count() << "ms\n";
t1 = high_resolution_clock::now();
for (int iter = 0; iter < ITERATIONS; ++iter) {
for (int i = 0; i < ACCESS_SIZE; ++i) {
heavyOp();
incWithoutCache(access[i]);
}
}
t2 = high_resolution_clock::now();
ms_int = duration_cast<milliseconds>(t2 - t1);
cout << "Time without cache and time between " << ms_int.count() << "ms\n";
t1 = high_resolution_clock::now();
for (int iter = 0; iter < ITERATIONS; ++iter) {
for (int i = 0; i < ACCESS_SIZE; ++i) {
heavyOp();
incWithCache(access[i]);
}
flushCache();
}
t2 = high_resolution_clock::now();
ms_int = duration_cast<milliseconds>(t2 - t1);
cout << "Time with cache and time between " << ms_int.count() << "ms\n";
}