I have a collection called Dataframe
that is essentially an std::vector<char>
. It contains records
of a specific length record_size
and it may be composed by 1 or multiple fields of different type. The definition of the Dataframe
is not negotiable because it is tightly coupled with the rest of the codebase. In general, the structure could contain a huge number of records, in the order of millions and a huge number of columns, in the order of hundreds.
Based on what I've said above, there could be just a single column or multiple ones. The single case column, is what I call the trivial case because the vector of chars can be cast to the actual type and the sorting is extremely efficient.
Where I have the problem is that with multiple columns, I need to have a vector of pointers to the records, apply a comparator function that jumps into memory to access the real data, then swap the pointers, and when the sorting procedure ends, apply to the real data the permutation identified by the pointers. This is slower, especially when the Dataframe
contains real world data.
Below an extract of code that better explains the situation. Is there any alternative approach to solve the problem?
#include <iostream>
#include <vector>
#include <cstring>
#include <algorithm>
#include <numeric>
#include <iomanip>
#include <functional>
class Dataframe {
public:
enum class Base : char
{
SIGNED = 'S',
UNSIGNED = 'U',
CHAR = 'A',
// and other types like floats, date, timestamp, etc.
};
class Dtype
{
public:
Dtype(Base base_dtype, std::size_t size) : m_base_dtype(base_dtype), m_size(size) {}
auto base_dtype() const { return m_base_dtype; }
auto size() const { return m_size; }
private:
Base m_base_dtype;
std::size_t m_size;
};
class Comparator {
public:
using comparator_fn = std::function<std::int8_t(const char*, const char*)>;
static comparator_fn make_comparator(Dataframe::Base base_dtype, std::size_t size, std::size_t offset) {
switch (base_dtype) {
case Dataframe::Base::CHAR:
return [offset](const char *a, const char *b) {
return std::strcmp(a + offset, b + offset);
};
case Dataframe::Base::SIGNED:
return [offset](const char *a, const char *b) {
const auto v1 = *reinterpret_cast<const int*>(a + offset);
const auto v2 = *reinterpret_cast<const int*>(b + offset);
return (v1 < v2) ? -1 : (v1 > v2) ? 1 : 0;
};
case Dataframe::Base::UNSIGNED:
return [offset](const char *a, const char *b) {
const auto v1 = *reinterpret_cast<const unsigned int*>(a + offset);
const auto v2 = *reinterpret_cast<const unsigned int*>(b + offset);
return (v1 < v2) ? -1 : (v1 > v2) ? 1 : 0;
};
default:
throw std::runtime_error("Unknown base dtype");
}
}
static comparator_fn make_composite_comparator(const std::vector<Dataframe::Dtype> &dtypes) {
std::vector<comparator_fn> F;
std::size_t offset = 0;
for (auto &dtype : dtypes) {
F.push_back(make_comparator(dtype.base_dtype(), dtype.size(), offset));
offset += dtype.size();
}
return [F](const char *a, const char *b) {
for (auto &f : F) {
if (const int8_t result = f(a, b); result != 0)
return result < 0;
}
return false;
};
}
};
Dataframe(const std::vector<char> &raw_data, const std::vector<Dtype> &dtypes) : m_raw_data(raw_data), m_dtypes(dtypes)
{
cols_count = m_dtypes.size();
m_record_size = std::accumulate(m_dtypes.begin(), m_dtypes.end(), 0, [](std::size_t acc, const Dtype &dtype) {
return acc + dtype.size();
});
m_records_count = m_raw_data.size() / m_record_size;
std::cout << "Info:" << std::endl;
std::cout << " - cols count: " << cols_count << std::endl;
std::cout << " - record size: " << m_record_size << std::endl;
std::cout << " - records count: " << m_records_count << std::endl;
std::cout << std::endl;
}
void print() {
// cycle over the records
std::cout << "Dataframe:" << std::endl;
for (std::size_t i = 0; i < m_records_count; i++) {
// cycle over the dtypes (e.g. columns)
auto current_field = m_raw_data.data() + i * m_record_size;
for (std::size_t j = 0; j < m_dtypes.size(); j++) {
auto dtype = m_dtypes[j];
auto base_dtype = dtype.base_dtype();
auto size = dtype.size();
current_field += j > 0 ? m_dtypes[j - 1].size() : 0;
// print the data
switch (base_dtype) {
case Base::CHAR:
std::cout << std::setw(1) << *current_field << ";";
break;
case Base::SIGNED:
std::cout << std::setw(4) << *reinterpret_cast<int*>(current_field) << ";";
break;
case Base::UNSIGNED:
std::cout << std::setw(4) << *reinterpret_cast<unsigned int*>(current_field) << ";";
break;
}
}
std::cout << std::endl;
}
std::cout << std::endl;
}
void sort() {
if (cols_count == 1) {
auto dtype = m_dtypes[0];
auto base_dtype = dtype.base_dtype();
auto size = dtype.size();
auto data = m_raw_data.data();
// print the data
switch (base_dtype) {
case Base::CHAR:
std::sort(data, data + m_raw_data.size());
break;
case Base::SIGNED:
std::sort((int*)data, (int*)data + m_raw_data.size() / size);
break;
case Base::UNSIGNED:
std::sort((unsigned int*)data, (unsigned int*)data + m_raw_data.size() / size);
break;
}
} else {
// This is a naive implementation, it works but it's not efficient
// use the ptr, compare and then swap the records
auto comparator = Comparator::make_composite_comparator(m_dtypes);
// create a set of pointers to the records
std::vector<char*> ptrs;
for (std::size_t i = 0; i < m_records_count; i++) {
ptrs.push_back(m_raw_data.data() + i * m_record_size);
}
// sort the pointers
std::sort(ptrs.begin(), ptrs.end(), comparator);
// apply pointers permutation
std::vector<char> tmp(m_raw_data.size());
for (std::size_t i = 0; i < m_records_count; i++) {
std::copy(ptrs[i], ptrs[i] + m_record_size, tmp.data() + i * m_record_size);
}
m_raw_data = tmp;
}
}
private:
std::vector<char> m_raw_data;
std::vector<Dtype> m_dtypes;
std::size_t m_records_count;
std::size_t m_record_size;
std::size_t cols_count;
};
void trivial_case_with_1_column() {
std::vector<char> raw;
// Example, trivial case with 1 column into the dataframe
int data[] = {8, 30, 8, 19, 7, 6, 10, 5, 3, 2, 1, 5};
raw.insert(raw.end(), (char*)data, (char*)data + sizeof(data));
Dataframe df(raw, {Dataframe::Dtype(Dataframe::Base::SIGNED, sizeof(int))});
df.print();
df.sort();
std::cout << "AFTER SORTING" << std::endl;
df.print();
}
void problematic_case_with_2_or_more_columns() {
std::vector<char> raw;
int column1[] = {8, 30, 8, 19, 7, 6, 10, 5, 3, 2, 1, 5};
char column2[] = {'z', 'a', 'a', 'b', 'c', 'd', 'e', 'x', 'g', 'h', 'i', 'a'};
for (std::size_t i = 0; i < sizeof(column1) / sizeof(int); i++) {
raw.insert(raw.end(), (char*)&column1[i], (char*)&column1[i] + sizeof(int));
raw.insert(raw.end(), (char*)&column2[i], (char*)&column2[i] + sizeof(char));
}
const auto dtypes = {Dataframe::Dtype(Dataframe::Base::SIGNED, sizeof(int)), Dataframe::Dtype(Dataframe::Base::CHAR, sizeof(char))};
Dataframe df(raw, dtypes);
df.print();
df.sort();
std::cout << "AFTER SORTING" << std::endl;
df.print();
}
int main(int argc, char const *argv[])
{
std::cout << "This case can be sorted by using a std::sort that casts the pointer to the correct type and it's efficient" << std::endl;
trivial_case_with_1_column();
std::cout << "----------------------------------------------------------------\n" << std::endl;
std::cout << "I don't know how to manage effectively this case" << std::endl;
problematic_case_with_2_or_more_columns();
return 0;
}
Online version: https://onlinegdb.com/gP5-8LOmv