I want to find ANY repeating combinations of k taken from n across all vector of vectors of 20 int values each. That vector of vectors is all about 50.000+ in size.
For example this is the SQLite piece of code that finds all repeating combos of 5 across all rows:
DROP TABLE IF EXISTS unpivot;
CREATE TABLE IF NOT EXISTS unpivot AS
SELECT *
FROM (
SELECT DrawId, N1 as n_value FROM draws union all
SELECT DrawId, N2 as n_value FROM draws union all
SELECT DrawId, N3 as n_value FROM draws union all
SELECT DrawId, N4 as n_value FROM draws union all
SELECT DrawId, N5 as n_value FROM draws union all
SELECT DrawId, N6 as n_value FROM draws union all
SELECT DrawId, N7 as n_value FROM draws union all
SELECT DrawId, N8 as n_value FROM draws union all
SELECT DrawId, N9 as n_value FROM draws union all
SELECT DrawId, N10 as n_value FROM draws union all
SELECT DrawId, N11 as n_value FROM draws union all
SELECT DrawId, N12 as n_value FROM draws union all
SELECT DrawId, N13 as n_value FROM draws union all
SELECT DrawId, N14 as n_value FROM draws union all
SELECT DrawId, N15 as n_value FROM draws union all
SELECT DrawId, N16 as n_value FROM draws union all
SELECT DrawId, N17 as n_value FROM draws union all
SELECT DrawId, N18 as n_value FROM draws union all
SELECT DrawId, N19 as n_value FROM draws union all
SELECT DrawId, N20 as n_value FROM draws
) as T;
CREATE TABLE IF NOT EXISTS COMB5 AS SELECT n1, n2, n3, n4, n5, count(*) as total
FROM
(
SELECT up1.n_value as n1, up2.n_value as n2, up3.n_value as n3, up4.n_value as n4, up5.n_value as n5
FROM unpivot up1
JOIN unpivot up2
ON up1.DrawId = up2.DrawId
AND up1.n_value < up2.n_value
JOIN unpivot up3
ON up2.DrawId = up3.DrawId
AND up2.n_value < up3.n_value
JOIN unpivot up4
ON up3.DrawId = up4.DrawId
AND up3.n_value < up4.n_value
JOIN unpivot up5
ON up4.DrawId = up5.DrawId
AND up4.n_value < up5.n_value
) T
GROUP BY n1, n2, n3, n4, n5
ORDER BY total desc;
Input sample:
Date,Hour,N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,N11,N12,N13,N14,N15,N16,N17,N18,N19,N20
2020-08-16, 15:00,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
2020-08-15, 22:40,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40
2020-08-15, 15:00,18,19,20,23,24,25,26,27,49,50,51,52,53,54,55,56,57,58,59,60
2020-08-14, 22:40,1,2,23,24,25,26,27,47,69,70,71,72,73,74,75,76,77,78,79,80
2020-08-14, 15:00,23,24,25,26,27,140,144,145,146,147,148,92,93,94,95,96,97,98,99,100
Output sample:
n1, n2, n3, n4, n5, total
23, 24, 25, 26, 27, 4
Which means (for this example) that the only combination of 5 numbers that is repeating in two or more rows is: 23,24,25,26,27 and it is found 4 times in total across all table (and this is shown by total column with the value of 4 in this case). It is count only if all those numbers are found all together.
Image showing this more clearly:
I can modify that SQLite working code above to find all repeating combinations of 6 numbers, 7 numbers and so on. And it works fine for each combination size I want, even for those with 15 or 18 numbers on each of them.
The only big problem is that this is very time consuming and it takes a lot of time to complete its job.
That is why I am trying to create a program that is doing the same thing but more faster than these SQLite commands.
So, I did this one below:
#include <fstream>
#include <sstream>
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
#include <cstring>
#include <unordered_map>
#include <map>
#include <tuple>
#include <set>
using namespace std;
vector<vector<int>> comb(vector<int> &myVec, int K) {
/* Returning all combinations from a vector as a vector of vectors. */
vector<vector<int>> all_combos; //vector to store all combos
vector<int> cmb; //vector to store one single combo!
std::string bitmask(K, 1); // K leading 1's
bitmask.resize((int) myVec.size(), 0); // N-K trailing 0's
// print integers and permute bitmask
do {
for (int i = 0; i < (int) myVec.size(); ++i) // [0..N-1] integers
{
if (bitmask[i]) {
cmb.push_back(myVec[i]);
}
}//cmb finished to be generated!
//store cmb.
all_combos.push_back(cmb);
cmb.clear();
} while (std::prev_permutation(bitmask.begin(), bitmask.end()));
return all_combos;
}
std::string join(const std::vector<int> & sequence, const std::string & separator)
{
std::string result;
for(size_t i = 0; i < sequence.size(); ++i)
result += to_string(sequence[i]) + ((i != sequence.size()-1) ? separator : "");
return result;
}
vector<vector<int>> extrageri;
int main() {
std::ifstream inFile("./DrawsDB.csv");
if (inFile.is_open())
{
std::string line;
while( std::getline(inFile,line) )
{
std::stringstream ss(line);
std::string Date, Hour, N1, N2, N3, N4, N5, N6, N7, N8, N9, N10, N11, N12, N13, N14, N15, N16, N17, N18, N19, N20;
std::getline(ss,Date,',');
std::getline(ss,Hour,',');
std::getline(ss,N1,',');
std::getline(ss,N2,',');
std::getline(ss,N3,',');
std::getline(ss,N4,',');
std::getline(ss,N5,',');
std::getline(ss,N6,',');
std::getline(ss,N7,',');
std::getline(ss,N8,',');
std::getline(ss,N9,',');
std::getline(ss,N10,',');
std::getline(ss,N11,',');
std::getline(ss,N12,',');
std::getline(ss,N13,',');
std::getline(ss,N14,',');
std::getline(ss,N15,',');
std::getline(ss,N16,',');
std::getline(ss,N17,',');
std::getline(ss,N18,',');
std::getline(ss,N19,',');
std::getline(ss,N20,',');
vector<int> draw;
draw.push_back(stoi(N1));
draw.push_back(stoi(N2));
draw.push_back(stoi(N3));
draw.push_back(stoi(N4));
draw.push_back(stoi(N5));
draw.push_back(stoi(N6));
draw.push_back(stoi(N7));
draw.push_back(stoi(N8));
draw.push_back(stoi(N9));
draw.push_back(stoi(N10));
draw.push_back(stoi(N11));
draw.push_back(stoi(N12));
draw.push_back(stoi(N13));
draw.push_back(stoi(N14));
draw.push_back(stoi(N15));
draw.push_back(stoi(N16));
draw.push_back(stoi(N17));
draw.push_back(stoi(N18));
draw.push_back(stoi(N19));
draw.push_back(stoi(N20));
extrageri.push_back(draw);
}
}
inFile.close();
for(int cmb=1; cmb<21; cmb++){
std::unordered_map<string, set<int>> combos;
ofstream MyFile("REPEATING_COMBOS_OF_"+ to_string(cmb) +".csv");
for(int i=0; i<extrageri.size(); i++){
vector<vector<int>> temp_draws = std::vector<vector<int>>(extrageri.begin() + i+1, extrageri.end());
for(auto & temp_draw : temp_draws){
std::vector<int> v_intersection;
std::set_intersection(extrageri[i].begin(), extrageri[i].end(),
temp_draw.begin(), temp_draw.end(),
std::back_inserter(v_intersection));
if(v_intersection.size()==cmb){
string stringval = join(v_intersection, ",");
combos[stringval].insert(i);
}else if(v_intersection.size()>cmb){
vector<vector<int>> groups = comb(v_intersection, cmb);
for(auto & group : groups){
string stringval = join(group, ",");
combos[stringval].insert(i);
}
}
}
}
for (const auto& keyvaluepair : combos)
{
// .first to access key, .second to access value
MyFile << keyvaluepair.first <<","<<keyvaluepair.second.size()+1<<endl;
}
MyFile.close();
}
return 0;
}
Which is working very well until it reaches Combinations of 6. Then it crashes. Also, during this time and until it get crashed it uses all the available RAM. The SQLite command works fine on 8GB RAM PC and retrieves even repeating Combinations of 15 or even 18 while this program crashes because not enough RAM memory available even when running it on 16GB RAM PC I have.
When the full amount of RAM is used then it crashes and this error is thrown:
terminate called after throwing an instance of 'std::bad_alloc'
what(): std::bad_alloc
Abort
I found more info about this bad_alloc: here, here, here, here and here and all of them are saying that this is caused by not enough memory available and not by the program logic itself.
My question is:
What is the problem with my C++ program and why it crashes and why SQLite commands runs very well and not get into "not enough memory" problems while this C++ program does?
Thank you very much in advance!
P.S. For the C++ sample input remove the first row (Date,Hour,N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,N11,N12,N13,N14,N15,N16,N17,N18,N19,N20) which is the .csv header I posted in order to make it clear which column is which.
Update:
Maybe it is possible to do it using python and pandas?
I tried to do it here:
The very fast way to find repeating combinations in Python using pandas?
//Tags added too.