
I was having fun with Google Benchmark with this code (compiled with GCC at -O3):

#include <iostream>
#include <thread>
#include <random>
#include <benchmark/benchmark.h>

using namespace std;


struct PointRecord{
    double latitude, longitude;
    double speed_x, speed_y, speed_z;
    double acceleration_x, acceleration_y, acceleration_z;
    long timestamp;
};


struct PointGpsRecord{
    double latitude, longitude;
};


tuple<double, double> avg_pos_obj_oriented(vector<PointRecord>& data){
    double mean_lon = 0.0;
    double mean_lat = 0.0;
    auto size = data.size();
    for(auto& point : data){
        mean_lat += point.latitude;
        mean_lon += point.longitude;
    }
    return make_tuple(mean_lat / size, mean_lon / size);
}

static void BM_object_oriented(benchmark::State &state) {
    random_device rnd_device;
    mt19937 rng{rnd_device()};
    uniform_real_distribution<double> dist{-10, 10};

    int size = state.range(0);
    vector<PointRecord> points;
    points.reserve(size);
    for(int i = 0; i < size; i++){
        auto point = PointRecord();
        point.latitude = dist(rng);
        point.longitude = dist(rng);
        point.timestamp = 12345678;
        points.push_back(point);
    }


    for (auto _: state) {
        auto p = avg_pos_obj_oriented(points);
        benchmark::DoNotOptimize(p);
    }

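    // Note: SetBytesProcessed below counts only the latitude/longitude payload
    // (2 doubles per record), while the BytesProcessed counter reports the full
    // PointRecord footprint of the data set.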
    state.SetBytesProcessed(int64_t(state.iterations()) *
                            int64_t(sizeof(double)*2*size));
    state.counters["BytesProcessed"] = benchmark::Counter(size*sizeof(PointRecord), benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024);

}


tuple<double,double> avg_vec_data_oriented(vector<PointGpsRecord>& data, unsigned size){
    double sum_lat = 0.0;
    double sum_lon = 0.0;
    for(int i = 0; i < size; i++){
        sum_lat += data[i].latitude;
        sum_lon += data[i].longitude;
    }
    return make_tuple(sum_lat/size, sum_lon/size);
}

static void BM_data_oriented(benchmark::State &state) {
    random_device rnd_device;
    mt19937 rng{rnd_device()};
    uniform_real_distribution<double> dist{-10, 10};

    int size = state.range(0);

    vector<PointGpsRecord> records;
    records.reserve(size);

    for(int i = 0; i < size; i++){
        auto point = PointGpsRecord();
        point.latitude = dist(rng);
        point.longitude = dist(rng);
        records.push_back(point);
    }


    for (auto _: state) {
        tuple<double,double> avg = avg_vec_data_oriented(records, size);
        benchmark::DoNotOptimize(avg);
    }

    int64_t iterations = state.iterations();
    int64_t bytes = sizeof(double) * 2 * size;

    state.SetBytesProcessed(iterations * bytes);
    state.counters["BytesProcessed"] = benchmark::Counter(size*sizeof(PointGpsRecord), benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024);

}


BENCHMARK(BM_object_oriented)->Unit(benchmark::kMicrosecond)->RangeMultiplier(4)->Range(64, 1 << 25);
BENCHMARK(BM_data_oriented)->Unit(benchmark::kMicrosecond)->RangeMultiplier(4)->Range(64, 1 << 25);


BENCHMARK_MAIN();

With these results:

Run on (16 X 5000 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 256 KiB (x8)
  L3 Unified 16384 KiB (x1)
Load Average: 1.29, 1.01, 0.84
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
--------------------------------------------------------------------------------------
Benchmark                            Time             CPU   Iterations UserCounters...
--------------------------------------------------------------------------------------
BM_object_oriented/64            0.039 us        0.039 us     17779654 BytesProcessed=4.5k bytes_per_second=24.5771G/s
BM_object_oriented/256           0.213 us        0.213 us      3300654 BytesProcessed=18k bytes_per_second=17.8812G/s
BM_object_oriented/1024          0.906 us        0.906 us       770645 BytesProcessed=72k bytes_per_second=16.8411G/s
BM_object_oriented/4096           3.84 us         3.84 us       180761 BytesProcessed=288k bytes_per_second=15.902G/s
BM_object_oriented/16384          18.8 us         18.8 us        37463 BytesProcessed=1.125M bytes_per_second=13.0112G/s
BM_object_oriented/65536          75.1 us         75.1 us         9406 BytesProcessed=4.5M bytes_per_second=13.008G/s
BM_object_oriented/262144          529 us          529 us         1355 BytesProcessed=18M bytes_per_second=7.37992G/s
BM_object_oriented/1048576        3072 us         3072 us          226 BytesProcessed=72M bytes_per_second=5.08572G/s
BM_object_oriented/4194304       12958 us        12957 us           55 BytesProcessed=288M bytes_per_second=4.82362G/s
BM_object_oriented/16777216      52470 us        52467 us           13 BytesProcessed=1.125G bytes_per_second=4.76492G/s
BM_object_oriented/33554432     104667 us       104662 us            7 BytesProcessed=2.25G bytes_per_second=4.77726G/s
BM_data_oriented/64              0.038 us        0.038 us     18196596 BytesProcessed=1024 bytes_per_second=25.0373G/s
BM_data_oriented/256             0.211 us        0.211 us      3312330 BytesProcessed=4k bytes_per_second=18.057G/s
BM_data_oriented/1024            0.898 us        0.898 us       776186 BytesProcessed=16k bytes_per_second=16.9891G/s
BM_data_oriented/4096             3.64 us         3.64 us       193013 BytesProcessed=64k bytes_per_second=16.7622G/s
BM_data_oriented/16384            14.6 us         14.6 us        47183 BytesProcessed=256k bytes_per_second=16.7451G/s
BM_data_oriented/65536            58.3 us         58.3 us        11894 BytesProcessed=1024k bytes_per_second=16.7614G/s
BM_data_oriented/262144            233 us          233 us         2970 BytesProcessed=4M bytes_per_second=16.7626G/s
BM_data_oriented/1048576          1131 us         1131 us          611 BytesProcessed=16M bytes_per_second=13.8116G/s
BM_data_oriented/4194304          4910 us         4910 us          145 BytesProcessed=64M bytes_per_second=12.7299G/s
BM_data_oriented/16777216        19468 us        19468 us           36 BytesProcessed=256M bytes_per_second=12.8415G/s
BM_data_oriented/33554432        39500 us        39497 us           17 BytesProcessed=512M bytes_per_second=12.6591G/s

So we can observe the data-processing rate dropping as the working set grows past each successive cache level.
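For reference, here is a quick back-of-the-envelope sketch (not part of the benchmark) that compares the working set of each layout against the cache sizes reported above, assuming sizeof(PointRecord) == 72 and sizeof(PointGpsRecord) == 16 as on a typical 64-bit GCC target:

#include <cstddef>
#include <cstdio>

int main() {
    // Record sizes on a typical LP64 GCC target: 8 doubles + 1 long = 72 B, 2 doubles = 16 B.
    const std::size_t full_record = 72;   // sizeof(PointRecord)
    const std::size_t gps_record  = 16;   // sizeof(PointGpsRecord)

    // A few of the benchmarked element counts, to compare against the
    // 32 KiB L1d, 256 KiB L2 and 16 MiB L3 reported in the output above.
    const std::size_t counts[] = {64, 1024, 16384, 262144, 1048576, 33554432};

    for (std::size_t n : counts) {
        std::printf("%10zu records: full structs %12.1f KiB, GPS-only %12.1f KiB\n",
                    n, n * full_record / 1024.0, n * gps_record / 1024.0);
    }
}

At 262144 records the 72-byte records already occupy about 18 MiB (past the 16 MiB L3), which lines up with the throughput drop of BM_object_oriented at that size, while the 16-byte records stay L3-resident up to roughly a million elements.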

I thought that, thanks to the cache hierarchy, the CPU could sustain a better rate at larger sizes by prefetching the data. So I have some questions: is RAM definitely the bottleneck here?

Is there any way to speed up the data rate (still for large inputs)?

And would some kind of CPU parallelization then be useless, since the cores still could not be fed fast enough?
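(For concreteness, the kind of CPU parallelization meant here is a chunked reduction with one partial sum per thread. Below is a minimal, hypothetical std::thread sketch of that idea; the names avg_pos_threads and num_threads are illustrative, and it is not the code that was benchmarked; the EDIT below uses OpenMP instead.)

#include <algorithm>
#include <thread>
#include <tuple>
#include <vector>

std::tuple<double, double> avg_pos_threads(const std::vector<PointGpsRecord>& data,
                                           unsigned num_threads /* assumed >= 1 */) {
    std::vector<double> lat(num_threads, 0.0), lon(num_threads, 0.0);
    std::vector<std::thread> workers;
    const std::size_t chunk = (data.size() + num_threads - 1) / num_threads;

    for (unsigned t = 0; t < num_threads; ++t) {
        workers.emplace_back([&, t] {
            double local_lat = 0.0, local_lon = 0.0;
            const std::size_t begin = t * chunk;
            const std::size_t end = std::min(data.size(), begin + chunk);
            for (std::size_t i = begin; i < end; ++i) {  // each thread scans its own contiguous chunk
                local_lat += data[i].latitude;
                local_lon += data[i].longitude;
            }
            lat[t] = local_lat;  // single write per thread keeps false sharing out of the hot loop
            lon[t] = local_lon;
        });
    }
    for (auto& w : workers) w.join();

    double sum_lat = 0.0, sum_lon = 0.0;
    for (unsigned t = 0; t < num_threads; ++t) {
        sum_lat += lat[t];
        sum_lon += lon[t];
    }
    return {sum_lat / data.size(), sum_lon / data.size()};
}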

EDIT: here is an OpenMP parallel version and its results:

tuple<double,double> avg_vec_data_oriented_parallel(vector<PointGpsRecord>& data, unsigned size){
    double sum_lat = 0.0;
    double sum_lon = 0.0;

#pragma omp parallel for reduction(+:sum_lat, sum_lon)
    for(auto& p: data){
        sum_lat += p.latitude;
        sum_lon += p.longitude;
    }
    return make_tuple(sum_lat/size, sum_lon/size);
}

static void BM_data_oriented_parallel(benchmark::State &state) {
    random_device rnd_device;
    mt19937 rng{rnd_device()};
    uniform_real_distribution<double> dist{-10, 10};

    int size = state.range(0);

    vector<PointGpsRecord> records;
    records.reserve(size);

    for(int i = 0; i < size; i++){
        auto point = PointGpsRecord();
        point.latitude = dist(rng);
        point.longitude = dist(rng);
        records.push_back(point);
    }


    for (auto _: state) {
        tuple<double,double> avg = avg_vec_data_oriented_parallel(records, size);
        benchmark::DoNotOptimize(avg);
    }

    int64_t iterations = state.iterations();
    int64_t bytes = sizeof(PointGpsRecord) * size;

    state.SetBytesProcessed(iterations * bytes);
    state.counters["BytesProcessed"] = benchmark::Counter(bytes, benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024);

}



-----------------------------------------------------------------------------------------------
Benchmark                                    Time             CPU   Iterations UserCounters...
-----------------------------------------------------------------------------------------------
BM_data_oriented_parallel/64             5.39 us         5.38 us       148050 BytesProcessed=1024 bytes_per_second=181.657M/s
BM_data_oriented_parallel/256            4.25 us         4.24 us       154501 BytesProcessed=4k bytes_per_second=920.244M/s
BM_data_oriented_parallel/1024           4.43 us         4.42 us       161006 BytesProcessed=16k bytes_per_second=3.44957G/s
BM_data_oriented_parallel/4096           5.60 us         5.38 us       100000 BytesProcessed=64k bytes_per_second=11.3523G/s
BM_data_oriented_parallel/16384          5.97 us         5.76 us       104429 BytesProcessed=256k bytes_per_second=42.3808G/s
BM_data_oriented_parallel/65536          11.2 us         11.0 us        65653 BytesProcessed=1024k bytes_per_second=89.0536G/s
BM_data_oriented_parallel/262144         31.9 us         31.7 us        18759 BytesProcessed=4M bytes_per_second=123.122G/s
BM_data_oriented_parallel/1048576         113 us          112 us         5247 BytesProcessed=16M bytes_per_second=139.253G/s
BM_data_oriented_parallel/4194304        1671 us         1661 us          433 BytesProcessed=64M bytes_per_second=37.6235G/s
BM_data_oriented_parallel/16777216       7411 us         7406 us           92 BytesProcessed=256M bytes_per_second=33.7556G/s
BM_data_oriented_parallel/33554432      15007 us        15005 us           47 BytesProcessed=512M bytes_per_second=33.3221G/s

Note that throughput peaks at 1048576 records, i.e. exactly when the 16 MiB working set fills the L3 cache, and drops sharply once it no longer fits.

  • What kind of 16-core machine do you have? If it's a modern Xeon, then yes, memory bandwidth available to a single thread is quite low, even if the other cores are idle. Like in [Why is Skylake so much better than Broadwell-E for single-threaded memory throughput?](https://stackoverflow.com/q/39260020) but worse for Skylake-X (mesh instead of ring bus). It scales well with # of cores to a high aggregate bandwidth, but a single thread running alone has significantly worse mem bandwidth than on a "client" desktop/laptop chip. (If it's an AMD CPU, then IDK; edit your Q with CPU model.) – Peter Cordes Feb 28 '22 at 03:02
  • @PeterCordes It is an i7-10870H (shouldn't that be Comet Lake?). If I recall correctly it is not so different from the Skylake architecture. – Federico Vaccaro Feb 28 '22 at 09:14
  • @PeterCordes Using an OpenMP parallel reduction, the results aren't much better once the data exceeds the L3 cache. (I'm going to edit the main post to add the new version/results.) – Federico Vaccaro Feb 28 '22 at 09:20
  • Ok, yeah, 8c16t Skylake-family client chip. So one core should be able to come close to maxing out DRAM bandwidth. (Especially if energy_performance_preference is set to `performance` instead of `balance-power` or something, which can lead it to lower the CPU clock when running mostly memory-bound code.) I'll read the question details when I have some time now that I know what kind of system to think about. – Peter Cordes Feb 28 '22 at 09:32
