In array of numbers, each number appears even number of times, and only one number appears odd number of times. We need to find that number (question was previously discussed on Stack Overflow).
Here is a solution that solves the question with 3 different methods — two methods are O(N) (hash_set and hash_map), while one is O(NlogN) (sorting). However, profiling for arbitrarily large input shows that sorting is faster, and gets more and more faster (in comparison) as input increases.
What is wrong with implementation or complexity analysis, why is O(NlogN) method faster?
#include <algorithm>
#include <chrono>
#include <cmath>
#include <iostream>
#include <functional>
#include <string>
#include <vector>
#include <unordered_set>
#include <unordered_map>
using std::cout;
using std::chrono::high_resolution_clock;
using std::chrono::milliseconds;
using std::endl;
using std::string;
using std::vector;
using std::unordered_map;
using std::unordered_set;
class ScopedTimer {
public:
ScopedTimer(const string& name)
: name_(name), start_time_(high_resolution_clock::now()) {}
~ScopedTimer() {
cout << name_ << " took "
<< std::chrono::duration_cast<milliseconds>(
high_resolution_clock::now() - start_time_).count()
<< " milliseconds" << endl;
}
private:
const string name_;
const high_resolution_clock::time_point start_time_;
};
int find_using_hash(const vector<int>& input_data) {
unordered_set<int> numbers(input_data.size());
for(const auto& value : input_data) {
auto res = numbers.insert(value);
if(!res.second) {
numbers.erase(res.first);
}
}
return numbers.size() == 1 ? *numbers.begin() : -1;
}
int find_using_hashmap(const vector<int>& input_data) {
unordered_map<int,int> counter_map;
for(const auto& value : input_data) {
++counter_map[value];
}
for(const auto& map_entry : counter_map) {
if(map_entry.second % 2 == 1) {
return map_entry.first;
}
}
return -1;
}
int find_using_sort_and_count(const vector<int>& input_data) {
vector<int> local_copy(input_data);
std::sort(local_copy.begin(), local_copy.end());
int prev_value = local_copy.front();
int counter = 0;
for(const auto& value : local_copy) {
if(prev_value == value) {
++counter;
continue;
}
if(counter % 2 == 1) {
return prev_value;
}
prev_value = value;
counter = 1;
}
return counter == 1 ? prev_value : -1;
}
void execute_and_time(const string& method_name, std::function<int()> method) {
ScopedTimer timer(method_name);
cout << method_name << " returns " << method() << endl;
}
int main()
{
vector<int> input_size_vec({1<<18,1<<20,1<<22,1<<24,1<<28});
for(const auto& input_size : input_size_vec) {
// Prepare input data
std::vector<int> input_data;
const int magic_number = 123454321;
for(int i=0;i<input_size;++i) {
input_data.push_back(i);
input_data.push_back(i);
}
input_data.push_back(magic_number);
std::random_shuffle(input_data.begin(), input_data.end());
cout << "For input_size " << input_size << ":" << endl;
execute_and_time("hash-set:",std::bind(find_using_hash, input_data));
execute_and_time("sort-and-count:",std::bind(find_using_sort_and_count, input_data));
execute_and_time("hash-map:",std::bind(find_using_hashmap, input_data));
cout << "--------------------------" << endl;
}
return 0;
}
Profiling results:
sh$ g++ -O3 -std=c++11 -o main *.cc
sh$ ./main
For input_size 262144:
hash-set: returns 123454321
hash-set: took 107 milliseconds
sort-and-count: returns 123454321
sort-and-count: took 37 milliseconds
hash-map: returns 123454321
hash-map: took 109 milliseconds
--------------------------
For input_size 1048576:
hash-set: returns 123454321
hash-set: took 641 milliseconds
sort-and-count: returns 123454321
sort-and-count: took 173 milliseconds
hash-map: returns 123454321
hash-map: took 731 milliseconds
--------------------------
For input_size 4194304:
hash-set: returns 123454321
hash-set: took 3250 milliseconds
sort-and-count: returns 123454321
sort-and-count: took 745 milliseconds
hash-map: returns 123454321
hash-map: took 3631 milliseconds
--------------------------
For input_size 16777216:
hash-set: returns 123454321
hash-set: took 14528 milliseconds
sort-and-count: returns 123454321
sort-and-count: took 3238 milliseconds
hash-map: returns 123454321
hash-map: took 16483 milliseconds
--------------------------
For input_size 268435456:
hash-set: returns 123454321
hash-set: took 350305 milliseconds
sort-and-count: returns 123454321
sort-and-count: took 60396 milliseconds
hash-map: returns 123454321
hash-map: took 427841 milliseconds
--------------------------
Addition
Fast solution with xor suggested by @Matt is of course out of contest — under 1 sec for worst case in example:
int find_using_xor(const vector<int>& input_data) {
int output = 0;
for(const int& value : input_data) {
output = output^value;
}
return output;
}
For input_size 268435456:
xor: returns 123454321
xor: took 264 milliseconds
but the question still stands — why is hash so inefficient compared to sorting in practice despite theoretical algorithmic complexity advantage?