If you want to print or store something like this: {'ABC': 3, 'BCD': 1, 'CDE': 1, 'DEF': 1, ...and so on}, you need O(n * k) memory and time. Even if you split the string into chunks, run them in parallel, and handle the corner cases around the split points, you still have to merge the resulting dictionaries. I don't know how to parallelize that merge, so it takes at least O(n), and the whole thing can't be O(n / threads).
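To make that merge step concrete, here is a minimal sketch of what I mean (the function name and the string-keyed map are mine, for illustration). Even if every chunk is counted in parallel, this fold stays sequential and costs time proportional to the total number of entries:

#include <string>
#include <unordered_map>
#include <vector>

// Fold per-chunk dictionaries into one; O(total number of entries).
// This is the step I don't know how to parallelize.
std::unordered_map<std::string, int>
mergeCounts(const std::vector<std::unordered_map<std::string, int>> &parts)
{
    std::unordered_map<std::string, int> merged;
    for (const auto &part : parts)
        for (const auto &kv : part)
            merged[kv.first] += kv.second; // missing keys start at 0
    return merged;
}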
Anyway, that's just my opinion. The best I can offer you is this:
#include <algorithm> // std::copy, std::min
#include <cassert>
#include <fstream>
#include <iostream>
#include <iterator> // std::back_inserter
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>
const long long MOD = 1e9 + 7; // prime number
const long long P = 269; // polynomial hash base
int calculateCustomHash(std::string::const_iterator beg, int k) // O(k)
{
long long res = 0;
for (int i = 0; i < k; ++i, ++beg)
{
res *= P;
res += *beg;
res %= MOD;
}
res += (res < 0) * MOD; // if char is signed, non-ASCII bytes can make res negative
return res;
}
void format(std::string::const_iterator beg, int k, int value, std::string &to_add) // appends "'SUB': value, " to to_add
{
to_add.reserve(to_add.size() + k + 12);
to_add.push_back('\'');
std::copy(beg, beg + k, std::back_inserter(to_add));
to_add += "': ";
to_add += std::to_string(value);
to_add += ", ";
}
struct SubstringCount // mem : O(s.size()) time : O(s.size()), does not depend on k
{
std::string _s;
int _k;
std::unordered_map<int, int> _res{}; // hash -> count (distinct substrings that collide under (P, MOD) would be merged)
std::vector<int> hashes{}; // hashes[i] is the hash of the window starting at index i
SubstringCount(std::string::const_iterator beg = {}, std::string::const_iterator end = {}, int k = 0) : _k(k) // O(end - beg)
{
if (k == 0)
return;
_s.reserve(end - beg);
std::copy(beg, end, std::back_inserter(_s));
if (end - beg < k)
{
return;
}
hashes.reserve(end - beg - k + 1);
// hash of the first k-window and P^k, computed in one pass
long long p_pow_k = 1;
long long current_hash = 0;
std::string::const_iterator it = beg;
for (int i = 0; i < k; ++i, ++it)
{
current_hash *= P;
current_hash += *it;
current_hash %= MOD;
p_pow_k *= P;
p_pow_k %= MOD;
}
current_hash += (current_hash < 0) * MOD;
hashes.push_back(current_hash);
++_res[current_hash];
// roll the window one step: append *it, drop *beg
for (; it != end; ++it, ++beg)
{
current_hash *= P;
current_hash += *it;
current_hash -= p_pow_k * *beg;
current_hash %= MOD;
current_hash += (current_hash < 0) * MOD;
hashes.push_back(current_hash);
++_res[current_hash];
}
}
std::string decode() // O((end - beg) * k)
{
if ((int)_s.size() < _k)
{
return {};
}
std::string res = "{";
std::unordered_set<int> printed; // hashes already emitted
// hashes[i] already holds the hash of the window starting at i,
// so there is no need to recompute the rolling hash here
for (int i = 0; i < (int)hashes.size(); ++i)
{
if (printed.insert(hashes[i]).second)
{
format(_s.begin() + i, _k, _res.at(hashes[i]), res);
}
}
res += "}";
return res;
}
int getCount(const std::string &s) // O(k)
{
assert((int)s.size() == _k);
int hash = calculateCustomHash(s.begin(), _k);
if (_res.count(hash))
{
return _res.at(hash);
}
return 0;
}
int getCount(int ind) // O(1), ind is the start index of the window
{
if (ind < 0)
return 0;
if (ind >= (int)hashes.size())
return 0;
return _res.at(hashes[ind]);
}
};
int main()
{
int k = 3;
std::string s = "ABCDEFGHIJKABCLMNOPQRSTUVABCSDLSFIJKKJJKLOP";
SubstringCount subscnt(s.begin(), s.end(), k); // mem : O(s.size()) time : O(s.size()), does not depend on k
std::cout << subscnt.decode() << "\n"; // O(s.size() * k)
// accessing any substring's count costs O(k) each
std::string ijk;
format(std::string("IJK").begin(),
k,
subscnt.getCount("IJK"), // O(k)
ijk);
std::cout << ijk << "\n";
ijk.clear();
format(s.begin() + 8,
k,
subscnt.getCount(8), // O(1)
ijk);
std::cout << ijk << "\n";
// multi-threading ??? note: substrings crossing piece boundaries are not counted (the corner cases mentioned above)
const int threads_num = 8;
int piece_size = s.size() / threads_num + 1;
SubstringCount subscnts[threads_num];
std::thread threads[threads_num];
for (int i = 0; i < threads_num; ++i)
{
threads[i] = std::thread([piece_size, i, &s, &subscnts, k]() {
// clamp the start position so substr never throws on short strings
size_t from = std::min<size_t>((size_t)piece_size * i, s.size());
std::string piece = s.substr(from, piece_size);
subscnts[i] = SubstringCount(piece.begin(), piece.end(), k); // O(piece_size)
});
}
for (int i = 0; i < threads_num; ++i){
threads[i].join();
}
std::cout << "\n";
// you can't write from different threads (or you get symbols overlapping each other), so you must join them all first and then print:
for (int i = 0 ; i < threads_num; ++i){
std::cout << subscnts[i].decode() << "\n"; // O(piece_size * k)
}
}
Not multi-threaded, but it works in O(n) time and O(n) memory (I don't think it can be better even with multi-threading).
- I want to use this for very large files only (a few GBs). Is there any way?
Do you have enough RAM to store this string? If not, you have bigger problems than multi-threading.
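If it doesn't fit, a minimal sketch under that assumption: read the file in chunks with std::ifstream and let consecutive chunks overlap by k - 1 characters, so windows crossing a chunk boundary are still seen exactly once (the chunk size and the helper name are mine, for illustration; assumes k >= 1):

#include <fstream>
#include <functional>
#include <string>

// Stream a big file piece by piece; each piece keeps the last k - 1
// chars of the previous one, so no k-window is lost at a boundary.
void forEachChunk(const std::string &path, int k,
                  const std::function<void(const std::string &)> &process)
{
    const size_t CHUNK = 64 * 1024 * 1024; // 64 MB, tune to your RAM
    std::ifstream in(path, std::ios::binary);
    std::string buf(CHUNK, '\0');
    std::string carry; // last k - 1 chars of the previous piece
    while (in.read(&buf[0], CHUNK) || in.gcount() > 0)
    {
        std::string piece = carry + buf.substr(0, in.gcount());
        process(piece); // e.g. feed it into SubstringCount
        if ((int)piece.size() >= k - 1)
            carry = piece.substr(piece.size() - (k - 1));
        else
            carry = piece;
    }
}

Each piece still produces its own dictionary, so you still pay the sequential merge at the end.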
- It is true some may not work, but I'm choosing to ignore those cases... do you see a solution without counting that?
If you ignore those cases, just split the string, process each piece in its own thread, and don't merge the resulting dictionaries; you can do it like the code above. Or you can use a shared unordered_map protected by a mutex and update it from multiple threads in batches of about 64 records per lock.
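A minimal sketch of that second option, reusing calculateCustomHash from the code above (the global names and the batch size of 64 are illustrative, not measured):

#include <mutex>

std::unordered_map<int, int> shared_counts; // hash -> count, shared by all threads
std::mutex counts_mutex;

// Count the windows of one piece, flushing into the shared map in
// batches so one lock is paid per ~64 records instead of per record.
void countPiece(const std::string &piece, int k)
{
    std::vector<int> batch;
    batch.reserve(64);
    auto flush = [&batch]() {
        std::lock_guard<std::mutex> lock(counts_mutex);
        for (int h : batch)
            ++shared_counts[h];
        batch.clear();
    };
    for (int i = 0; i + k <= (int)piece.size(); ++i)
    {
        batch.push_back(calculateCustomHash(piece.begin() + i, k)); // O(k) here; a rolling hash makes it O(1)
        if ((int)batch.size() == 64)
            flush();
    }
    flush(); // whatever is left in the last batch
}

Bigger batches mean fewer lock acquisitions but longer waits before counts become visible; 64 is just the figure I suggested, so measure before settling on it.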