They perform about the same, because std::remove_if is the only call that actually moves elements around; any difference has to come from the surrounding calls. My first assumption was that std::vector::resize reallocates to fit the new size (std::distance just returns the new length, so it's negligible), while std::vector::erase only moves the end of the container, making it slightly faster.
However, as pointed out by @Peter Cordes, "It's actually guaranteed not to reallocate": std::vector::resize does not reallocate when the new size is smaller, so the difference should come from an extra move (vs, clang) that erase (vs, clang) does and resize (vs, clang) doesn't.
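A quick standalone check (not part of the benchmark) makes this easy to see: after shrinking with either resize or erase, the capacity, and therefore the data pointer, is unchanged:
#include <cassert>
#include <vector>

int main() {
    std::vector<int> v(1000, 1);
    const int* data_before = v.data();

    v.resize(500);                      // shrink: no reallocation, capacity is kept
    assert(v.data() == data_before && v.capacity() >= 1000);

    v.erase(v.begin() + 250, v.end());  // erase: no reallocation either
    assert(v.data() == data_before && v.capacity() >= 1000);
    return 0;
}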
To be able to measure the differences, generate_input() needs to return the same vector for all of the tests; in your implementation every call returns a new random vector, which makes it impossible to tell run-to-run variance apart from the input vectors changing.
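A minimal sketch of the fix (shared_input() is a hypothetical name; the benchmark below bakes the same caching into its generate_input()): build the random data once from a fixed seed and hand out copies, so every variant runs on identical input:
#include <cstdint>
#include <random>
#include <vector>

std::vector<uint32_t> shared_input() {
    // Generated once with a fixed seed; every call returns a copy of the same data.
    static const std::vector<uint32_t> cached = [] {
        std::mt19937 gen(8888);
        std::uniform_int_distribution<uint32_t> dist(0, INT32_MAX);
        std::vector<uint32_t> v(500);
        for (auto& n : v) n = dist(gen);
        return v;
    }();
    return cached;
}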
With that fixed, @463035818_is_not_a_number makes an interesting point, but for the same reason as before there is no difference between those functions. That doesn't mean every case behaves the same: for a struct, the penalty of a mispredicted branch is much greater, which makes std::remove_if a prime target for optimization, as the Windows figures below show.
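To see where that branch lives: std::remove_if typically looks roughly like the possible implementation on cppreference (a sketch, not the exact code of any particular standard library), and the predicate check inside the shift loop is the data-dependent branch that gets mispredicted on random input:
#include <algorithm>
#include <iterator>
#include <utility>

template <class ForwardIt, class UnaryPred>
ForwardIt remove_if_sketch(ForwardIt first, ForwardIt last, UnaryPred pred) {
    first = std::find_if(first, last, pred);   // first element to drop
    if (first != last)
        for (ForwardIt i = std::next(first); i != last; ++i)
            if (!pred(*i))                     // unpredictable branch on random data
                *first++ = std::move(*i);      // shifting kept elements is the costly part for a struct
    return first;
}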
All of the figures are from 50 runs each: green is the range between the best run and the average, red is the range between the average and the worst run, so the run-to-run variance is visible.
i5-1155G7 + 3200MHz RAM on manjaro with clang-13 and -O3
1600X + 1067MHz RAM on windows 10 (22H2) with vs2022 and /O2
1600X + 1067MHz RAM on windows 10 (22H2) with clang-16 and -O3 -pthread
1600X + 1067MHz RAM on ubuntu 22 with clang-13 and -O3
With VS it seems that there is an optimization bug (?) that makes simple_remove underperform with u32 and u64.
#include <cstdint>
#include <random>
#include <cstring>
#include <string>
#include <vector>
#include <tuple>
#include <iterator>  // std::distance
#include <algorithm> // std::remove_if
#include <typeinfo>  // typeid (required at least with clang/gcc)
#include <benchmark/benchmark.h>
#pragma warning(disable:4267) // Conversion errors
#pragma warning(disable:4244) // Conversion errors
constexpr int N_ELEMS = 500;
constexpr int RAND_SEED = 8888;
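// Filler<T> simulates a "heavier" element: the value lives behind a heap
// allocation and carries an int64_t padding array, so copying/assigning one
// costs far more than copying a plain integer.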
template <typename T>
struct Filler {
T * ptr = nullptr;
int64_t padd [sizeof(T)];
Filler() {
ptr = new T(0);
memset(padd, 0, sizeof(T) * 8);
}
Filler(const T num){
ptr = new T(num);
for (int64_t& e : padd){
e = num;
}
}
Filler(const Filler& other){
ptr = new T(*other.ptr);
memcpy(padd, other.padd, sizeof(T) * 8);
}
~Filler() {
delete ptr;
}
Filler& operator=(Filler const& other){
memcpy(padd, other.padd, sizeof(T) * 8);
*ptr = *other.ptr;
return *this;
}
inline bool operator < (const Filler& other) { return *ptr < *other.ptr; }
inline bool operator <= (const Filler& other) { return *ptr <= *other.ptr; }
inline bool operator > (const Filler& other) { return *ptr > *other.ptr; }
inline bool operator >= (const Filler& other) { return *ptr >= *other.ptr; }
inline bool operator == (const Filler& other) { return *ptr == *other.ptr; }
inline bool operator != (const Filler& other) { return *ptr != *other.ptr; }
inline bool operator < (const T other) { return *ptr < other; }
inline bool operator <= (const T other) { return *ptr <= other; }
inline bool operator > (const T other) { return *ptr > other; }
inline bool operator >= (const T other) { return *ptr >= other; }
inline bool operator == (const T other) { return *ptr == other; }
inline bool operator != (const T other) { return *ptr != other; }
};
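// Removal threshold shared by all benchmark variants; generate_input() sets it to max / 2.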
static size_t THRESH;
template <typename T>
struct Foo {
static std::vector<T> generate_input(size_t max = 0) {
static size_t dist_max = 0;
static std::vector<T> nums;
if (nums.empty() || max){
if (max) {
THRESH = max / 2;
dist_max = max;
}
std::mt19937 gen(RAND_SEED);
std::uniform_int_distribution<uint64_t> dist(0, dist_max);
for (auto& n : nums = std::vector<T>(N_ELEMS)){
n = T(dist(gen));
}
}
return nums;
}
static void just_remove(benchmark::State &state) {
for (auto _ : state) {
state.PauseTiming();
std::vector<T> nums = generate_input();
state.ResumeTiming();
std::ignore = std::remove_if(
nums.begin(), nums.end(),
[](T x) { return x < THRESH; }
);
benchmark::DoNotOptimize(nums);
}
}
static void erase_remove(benchmark::State &state) {
for (auto _ : state) {
state.PauseTiming();
std::vector<T> nums = generate_input();
state.ResumeTiming();
nums.erase(
std::remove_if(
nums.begin(), nums.end(),
[](T x) { return x < THRESH; }
),
nums.end()
);
benchmark::DoNotOptimize(nums);
}
}
static void resize_remove(benchmark::State &state) {
for (auto _ : state) {
state.PauseTiming();
std::vector<T> nums = generate_input();
state.ResumeTiming();
nums.resize(
std::distance(
nums.begin(),
std::remove_if(
nums.begin(), nums.end(),
[](T x) { return x < THRESH; }
)
)
);
benchmark::DoNotOptimize(nums);
}
}
static void simple_remove(benchmark::State &state) {
for (auto _ : state) {
state.PauseTiming();
std::vector<T> nums = generate_input();
state.ResumeTiming();
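// Hand-rolled compaction: m is the read cursor, n the write cursor;
// elements >= thresh are kept, then the vector is shrunk to the kept count.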
T * n = &nums.front();
T * m = &nums.front();
const T thresh = T(THRESH);
const T * back = &nums.back();
do {
if (*m >= thresh){
*(n++) = std::move(*m);
}
} while (m++ < back);
nums.resize(n - &nums.front());
benchmark::DoNotOptimize(nums);
}
}
static void simple_remove_unroll(benchmark::State &state) {
for (auto _ : state) {
state.PauseTiming();
std::vector<T> nums = generate_input();
state.ResumeTiming();
T * n = &nums.front();
T * m = &nums.front();
const T thresh = T(THRESH);
const T * back = &nums.back();
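// Handle the size % 4 leftover elements first (the fallthrough is intentional),
// then process the rest of the vector in blocks of four.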
switch (nums.size() % 4) {
case 3:
if (*m >= thresh){
*(n++) = std::move(*(m++));
} else {
m++;
}
case 2:
if (*m >= thresh){
*(n++) = std::move(*(m++));
} else {
m++;
}
case 1:
if (*m >= thresh){
*(n++) = std::move(*(m++));
} else {
m++;
}
}
do {
if (*(m + 0) >= thresh){ *(n++) = std::move(*(m + 0)); }
if (*(m + 1) >= thresh){ *(n++) = std::move(*(m + 1)); }
if (*(m + 2) >= thresh){ *(n++) = std::move(*(m + 2)); }
if (*(m + 3) >= thresh){ *(n++) = std::move(*(m + 3)); }
m += 4;
} while (m < back);
nums.resize(n - &nums.front());
benchmark::DoNotOptimize(nums);
}
}
};
template<typename T>
void benchmark_batch(size_t max_num) {
std::string type = typeid(T).name();
Foo<T>::generate_input(max_num);
benchmark::RegisterBenchmark(
std::string("just_remove/") + type,
Foo<T>::just_remove
);
benchmark::RegisterBenchmark(
std::string("erase_remove/") + type,
Foo<T>::erase_remove
);
benchmark::RegisterBenchmark(
std::string("resize_remove/") + type,
Foo<T>::resize_remove
);
benchmark::RegisterBenchmark(
std::string("simple_remove/") + type,
Foo<T>::simple_remove
);
benchmark::RegisterBenchmark(
std::string("simple_remove_unroll/") + type,
Foo<T>::simple_remove_unroll
);
}
int main(int argc, char** argv) {
benchmark_batch<uint8_t>(INT8_MAX);
benchmark_batch<uint32_t>(INT32_MAX);
benchmark_batch<uint64_t>(INT64_MAX);
benchmark_batch<Filler<uint8_t>>(INT8_MAX);
benchmark_batch<Filler<uint32_t>>(INT32_MAX);
benchmark_batch<Filler<uint64_t>>(INT64_MAX);
benchmark::Initialize(&argc, argv);
benchmark::RunSpecifiedBenchmarks();
benchmark::Shutdown();
return 0;
}
And the scripts to recreate the plots: a PowerShell loop that collects 50 runs as CSV, and the Python script that aggregates and plots them.
0..49 | % { .\Release\test_cxx.exe --benchmark_min_warmup_time=0.1 --benchmark_format=csv > "./data/run_$_.csv" }
Get-ChildItem ./data | Select-Object -ExpandProperty FullName | Import-Csv | Export-Csv .\benchmark.csv -NoTypeInformation -Append
import matplotlib.ticker as tck
import matplotlib.pyplot as plt
import csv
def read_data():
data = {}
test_len = 0
with open('./build/benchmark.csv') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader :
test_len += 1
name = row['name'];
if not name in data:
data[name] = {
'min': {
'iterations': float(row['iterations']),
'real_time': float(row['real_time']),
'cpu_time': float(row['cpu_time']),
},
'max': {
'iterations': float(row['iterations']),
'real_time': float(row['real_time']),
'cpu_time': float(row['cpu_time']),
},
'avg': {
'iterations': float(row['iterations']),
'real_time': float(row['real_time']),
'cpu_time': float(row['cpu_time']),
},
}
else:
for k in ['iterations', 'real_time', 'cpu_time']:
data[name]['avg'][k] += float(row[k])
if float(row[k]) < float(data[name]['min'][k]):
data[name]['min'][k] = float(row[k])
if float(row[k]) > float(data[name]['max'][k]):
data[name]['max'][k] = float(row[k])
test_len /= len(data.keys())
for k in data:
for kk in data[k]['avg']:
data[k]['avg'][kk] /= test_len
return data
def plot_data(data, key):
labels = []
values = {
'max': [], 'avg': [], 'min': [],
}
labels_struct = []
values_struct = {
'max': [], 'avg': [], 'min': [],
}
for k in list(data.keys()):
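# Filler<T> entries are split out by their typeid name: MSVC reports
# "struct Filler<...>", while gcc/clang (Itanium ABI) mangle the class name
# as "6Filler...", hence the check for '6'.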
if 'struct' in k or '6' in k:
labels_struct.append(k.replace('/', '\n').replace('struct ', ''))
values_struct['min'].append(data[k]['min'][key])
values_struct['max'].append(data[k]['max'][key])
values_struct['avg'].append(data[k]['avg'][key])
else:
labels.append(k.replace('/', '\n'))
values['min'].append(data[k]['min'][key])
values['max'].append(data[k]['max'][key])
values['avg'].append(data[k]['avg'][key])
return labels, values, labels_struct, values_struct
thickness = 0.8
benckmark_value = 'iterations'
colors = ['#1dad2b', '#af1c23', '#e0e0e0']
if __name__ == '__main__':
data = read_data()
labels, values, labels_struct, values_struct = plot_data(data, benckmark_value)
fig = plt.figure(layout="constrained")
spec = fig.add_gridspec(ncols=2, nrows=1)
int_formater = lambda x, p: format(int(x), ',')
ax0 = fig.add_subplot(spec[0, 0])
ax0.set_ylabel(benckmark_value)
ax0.set_title('std::vector<T>')
ax0.set_xticklabels(labels, rotation=90)
ax0.get_yaxis().set_major_formatter(tck.FuncFormatter(int_formater))
ax1 = fig.add_subplot(spec[0, 1])
ax1.set_ylabel(benckmark_value)
ax1.set_title('std::vector<Filler<T>>')
ax1.set_xticklabels(labels_struct, rotation=90)
ax1.get_yaxis().set_major_formatter(tck.FuncFormatter(int_formater))
for i, (k, v) in enumerate(values.items()):
ax0.bar(labels, v, thickness, color=colors[i])
for i, (k, v) in enumerate(values_struct.items()):
ax1.bar(labels_struct, v, thickness, color=colors[i])
plt.show()