I thought I would answer this question as I am looking at a similar problem. The name for this idea of hashes of similar objects having a high chance of clashing is 'locality sensitive hashing'. There is a lot of literature on the subject, but here is a simple example:
Imagine we have a binary vector {1,0} of a fixed length. We can select a random subset of indices on which to calculate the hash using builtin stl and boost algorithms:
#include <unordered_map>
#include <unordered_set>
#include <random>
#include <algorithm>
#include <boost/iterator/filter_iterator.hpp>
#include <boost/functional/hash.hpp>
template<class It, class pred>
std::size_t hash_filtered_range(It first, It last, pred f){
return boost::hash_range(boost::make_filter_iterator(f, first, last),
boost::make_filter_iterator(f, last, last));
}
template<class iter>
struct IterableHash{
IterableHash(const iter indices_begin, const iter indices_end): _inc_indices(indices_begin, indices_end){
}
template <class obj_type>
std::size_t operator()(const obj_type& type)const{
int _ix = 0;
return hash_filtered_range(std::begin(type), std::end(type), [this, &_ix](const auto& t){
return (this->_inc_indices.find(_ix++) != this->_inc_indices.end());
});
}
private:
std::unordered_set<int> _inc_indices;
};
template<class hasher>
struct ApproxEqual{
ApproxEqual(const hasher& hash):hash(hash) {}
template<class obj_type>
bool operator() (const obj_type& o1, const obj_type& o2)const{
return hash(o1) == hash(o2);
}
private:
hasher hash;
};
Then, iterable objects have the same hash and equal values if they are equal only at these indices:
i.e on my computer
std::vector<int> hash_vec{0,2,3};
using it = std::vector<int>::iterator;
IterableHash<it> hasher(hash_vec.begin(),
hash_vec.end());
ApproxEqual<IterableHash<it>> cmp(hasher);
std::unordered_map<std::vector<char>, int, IterableHash<it>, ApproxEqual<IterableHash<it>> > map( 0, hasher,
cmp);
std::vector<char> vec {1,0,1,0,1};
map[vec] = 33;
std::cout << hasher(vec)<< "\n";
std::vector<char> fuzzy_vec {1,0,1,0,0};
std::cout << hasher(fuzzy_vec)<< "\n";
std::cout << (map.find(fuzzy_vec)->second);
produces
11093822460655
11093822460655
33
i.e we recover the value for a different vector res when we query with fuzzy_res;