I'm adding a profiler to an existing evaluator. The evaluator is implemented in C++ (C++17, x86, MSVC), and I need to store 3 int32, 1 uint16 and 1 uint8 values to represent the context of the execution frame. As this is interpreted code, I cannot write a kernel-level driver to take snapshots, so the bookkeeping has to live in the eval loop. As with all profiling, we want to avoid slowing down the actual computation. So much for the motivation/context of this question.
At a high level, the behavior is as follows: before every instruction gets executed by the evaluator, it calls a small function ("hey, I'm here right now"); the sampling profiler is only interested in that frame position once every X ms (or μs, but that is most likely pushing it). So we need two threads (let's ignore the serialization for now) that want to share data: a very frequent writer (one single thread) and an infrequent reader (a different single thread). We want to minimize the performance penalty on the write side. Note that sometimes the writer becomes slow, so it might be stuck on a frame for a few seconds, and we would like to be able to observe this.
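One note on size before the benchmark: the record described above (3 × int32 + 1 × uint16 + 1 × uint8) pads out to 16 bytes, so the cheapest conceivable scheme, a single lock-free std::atomic store of the whole frame, does not apply here; as far as I know, MSVC's std::atomic is only lock-free up to 8 bytes on 32-bit x86. A quick check (the fullFrame layout here is hypothetical, mirroring the description above):

#include <atomic>
#include <cstdint>

struct fullFrame {   // 3x int32 + 1x uint16 + 1x uint8, padded to 16 bytes
    int32_t a, b, e;
    uint16_t c;
    uint8_t d;
};
static_assert(sizeof(fullFrame) == 16, "expected padding to 16 bytes");
// Expect this to report false on x86 MSVC, i.e. the atomic would itself
// take a lock on every store:
std::atomic<fullFrame> probe;
static const bool kLockFree = probe.is_lock_free();

So whatever we do, some explicit synchronization around plain stores is needed.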
To help answer this question, I've written a small benchmark setup:
#include <memory>
#include <chrono>
#include <string>
#include <iostream>
#include <thread>
#include <immintrin.h>
#include <atomic>
#include <cstring>
#include <cstdint>
using namespace std;
struct frame {
    int32_t a;
    int32_t b;
    uint16_t c;
    uint8_t d;
};
class ProfilerBase {
public:
    virtual void EnterFrame(int32_t a, int32_t b, uint16_t c, uint8_t d) = 0;
    virtual void Stop() = 0;
    virtual ~ProfilerBase() {}
    virtual string Name() = 0;
};
class NoOp : public ProfilerBase {
public:
    void EnterFrame(int32_t a, int32_t b, uint16_t c, uint8_t d) override {}
    void Stop() override {}
    string Name() override { return "NoOp"; }
};
class JustStore : public ProfilerBase {
private:
    frame _current = {};
public:
    string Name() override { return "OnlyStoreInMember"; }
    void EnterFrame(int32_t a, int32_t b, uint16_t c, uint8_t d) override {
        _current.a = a;
        _current.b = b;
        _current.c = c;
        _current.d = d;
    }
    void Stop() override {
        // Read the stored values back so the optimizer cannot elide the stores.
        if ((_current.a + _current.b + _current.c + _current.d) == _current.a) {
            cout << "Make sure optimizer keeps the record around";
        }
    }
};
class WithSampler : public ProfilerBase {
private:
    unique_ptr<thread> _sampling;
    atomic<bool> _keepSampling = true;
protected:
    const chrono::milliseconds _sampleEvery;
    virtual void _snap() = 0;
    virtual string _subname() = 0;
    // Called from the derived class's constructor, not from ours: the sampling
    // thread calls the virtual _snap(), which must not happen before the
    // most-derived object is fully constructed.
    void _startSampler() {
        _sampling = make_unique<thread>(&WithSampler::_sampler, this);
    }
public:
    WithSampler(chrono::milliseconds sampleEvery) : _sampleEvery(sampleEvery) {}
    void Stop() override {
        _keepSampling = false;
        _sampling->join();
    }
    string Name() override {
        return _subname() + to_string(_sampleEvery.count()) + "ms";
    }
private:
    void _sampler() {
        auto nextTick = chrono::steady_clock::now();
        while (_keepSampling) {
            const auto sleepTime = nextTick - chrono::steady_clock::now();
            if (sleepTime > chrono::milliseconds(0)) {
                this_thread::sleep_for(sleepTime);
            }
            _snap();
            nextTick += _sampleEvery;
        }
    }
};
struct checkedFrame {
    frame actual;
    int32_t check;
};
// https://rigtorp.se/spinlock/
struct spinlock {
    std::atomic<bool> lock_ = { false };
    void lock() noexcept {
        for (;;) {
            // Optimistically assume the lock is free on the first try
            if (!lock_.exchange(true, std::memory_order_acquire)) {
                return;
            }
            // Wait for lock to be released without generating cache misses
            while (lock_.load(std::memory_order_relaxed)) {
                // Issue X86 PAUSE or ARM YIELD instruction to reduce
                // contention between hyper-threads
                _mm_pause();
            }
        }
    }
    void unlock() noexcept {
        lock_.store(false, std::memory_order_release);
    }
};
class Spinlock : public WithSampler {
private:
    spinlock _loc;
    checkedFrame _current;
public:
    Spinlock(chrono::milliseconds sampleEvery) : WithSampler(sampleEvery) {
        _startSampler(); // safe to start sampling: the object is fully built
    }
    string _subname() override { return "Spinlock"; }
    void EnterFrame(int32_t a, int32_t b, uint16_t c, uint8_t d) override {
        _loc.lock();
        _current.actual.a = a;
        _current.actual.b = b;
        _current.actual.c = c;
        _current.actual.d = d;
        _current.check = a + b + c + d;
        _loc.unlock();
    }
protected:
    void _snap() override {
        _loc.lock();
        auto snap = _current;
        _loc.unlock();
        // The check field lets us detect a torn read of the frame.
        if ((snap.actual.a + snap.actual.b + snap.actual.c + snap.actual.d) != snap.check) {
            cout << "Corrupted snap!!\n";
        }
    }
};
static constexpr int32_t LOOP_MAX = 1000 * 1000 * 1000;

int measure(unique_ptr<ProfilerBase> profiler) {
    cout << "Running profiler: " << profiler->Name() << "\n";
    cout << "\tProgress: ";
    auto start_time = std::chrono::steady_clock::now();
    int r = 0;
    for (int32_t x = 0; x < LOOP_MAX; x++) {
        profiler->EnterFrame(x, x + x, x & 0xFFFF, x & 0xFF);
        r += x;
        if (x % (LOOP_MAX / 1000) == 0) {
            this_thread::sleep_for(chrono::nanoseconds(10)); // simulate that sometimes we do other work besides storing
        }
        if (x % (LOOP_MAX / 10) == 0) {
            cout << static_cast<int>((static_cast<double>(x) / LOOP_MAX) * 10);
        }
        if (x % 1000 == 0) {
            _mm_pause(); // give the other threads some time
        }
        if (x == (LOOP_MAX / 2)) {
            // The first half of the loop is treated as warmup,
            // so restart the clock here for the actual measurement.
            start_time = std::chrono::steady_clock::now();
        }
    }
    cout << "\n";
    const auto done_calc = std::chrono::steady_clock::now();
    profiler->Stop();
    const auto done_writing = std::chrono::steady_clock::now();
    cout << "\tcalc: " << chrono::duration_cast<chrono::milliseconds>(done_calc - start_time).count() << "ms\n";
    cout << "\tflush: " << chrono::duration_cast<chrono::milliseconds>(done_writing - done_calc).count() << "ms\n";
    return r;
}
int main() {
    measure(make_unique<NoOp>());
    measure(make_unique<JustStore>());
    measure(make_unique<Spinlock>(chrono::milliseconds(1)));
    measure(make_unique<Spinlock>(chrono::milliseconds(10)));
    return 0;
}
Compiling this with /O2 in x86 mode on my machine gives this output:
Running profiler: NoOp
    Progress: 0123456789
    calc: 1410ms
    flush: 0ms
Running profiler: OnlyStoreInMember
    Progress: 0123456789
    calc: 1368ms
    flush: 0ms
Running profiler: Spinlock1ms
    Progress: 0123456789
    calc: 3952ms
    flush: 4ms
Running profiler: Spinlock10ms
    Progress: 0123456789
    calc: 3985ms
    flush: 11ms
(While this was compiled with MSVC in VS2022, I think g++ --std=c++17 -O2 -m32 -pthread -o testing small-test-case.cpp should come close enough.)
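(For the MSVC side, the equivalent should be something like cl /std:c++17 /O2 /EHsc small-test-case.cpp from an x86 Native Tools prompt; the flags are from memory, so adjust as needed.)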
Here we see that the spinlock-based sampler adds roughly 2.5x overhead compared to the one without any sampling. I've profiled it, and as expected, a lot of time is spent taking the lock, even though in most cases there was no contention and thus no actual need for it.
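Given that last observation (the lock is almost never contended), one direction that seems to fit this single-writer/single-reader shape is a seqlock: the writer never blocks and never executes an atomic RMW; it just bumps a sequence counter before and after its plain stores, and the infrequent reader retries whenever it sees an odd or changed counter. A sketch of what I mean, modeled on https://github.com/rigtorp/Seqlock by the same author as the spinlock above, reusing the frame struct and includes from the benchmark (caveats: this is untested; the plain _data = f is formally a data race that the pattern tolerates in practice, and the fences lean on x86's strong store ordering; see the linked repo for the portability discussion):

// Single-writer/single-reader seqlock sketch around the frame struct above.
struct SeqlockFrame {
    std::atomic<uint32_t> _seq{ 0 };
    frame _data = {};

    // Writer side: no RMW instruction, no waiting; just two ordered counter
    // stores around the plain payload stores.
    void write(const frame& f) noexcept {
        const uint32_t s = _seq.load(std::memory_order_relaxed);
        _seq.store(s + 1, std::memory_order_relaxed); // odd: write in progress
        std::atomic_thread_fence(std::memory_order_release);
        _data = f;                                    // formally racy, see caveat above
        _seq.store(s + 2, std::memory_order_release); // even: stable again
    }

    // Reader side: retry until the same even sequence number is seen before
    // and after the copy, i.e. no write overlapped the read.
    frame read() const noexcept {
        for (;;) {
            const uint32_t s1 = _seq.load(std::memory_order_acquire);
            frame copy = _data;
            std::atomic_thread_fence(std::memory_order_acquire);
            const uint32_t s2 = _seq.load(std::memory_order_relaxed);
            if ((s1 & 1u) == 0 && s1 == s2) {
                return copy;
            }
            _mm_pause(); // writer is mid-update; spin briefly and retry
        }
    }
};

Since there is only one writer, the counter updates can be plain loads and stores rather than a fetch_add, which should keep the hot write path at a handful of ordinary MOVs on x86. It would also preserve the "stuck frame" observability: the reader always gets the last fully written frame, no matter how long the writer sits on it.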