0

I want to find ANY repeating combinations of k taken from n across all vector of vectors of 20 int values each. That vector of vectors is all about 50.000+ in size.

For example this is the SQLite piece of code that finds all repeating combos of 5 across all rows:

    DROP TABLE IF EXISTS unpivot;
CREATE TABLE IF NOT EXISTS unpivot AS
SELECT *
FROM (
        SELECT DrawId, N1 as n_value FROM draws union all
        SELECT DrawId, N2 as n_value FROM draws union all
        SELECT DrawId, N3 as n_value FROM draws union all
        SELECT DrawId, N4 as n_value FROM draws union all
        SELECT DrawId, N5 as n_value FROM draws union all
        SELECT DrawId, N6 as n_value FROM draws union all
        SELECT DrawId, N7 as n_value FROM draws union all
        SELECT DrawId, N8 as n_value FROM draws union all
        SELECT DrawId, N9 as n_value FROM draws union all
        SELECT DrawId, N10 as n_value FROM draws union all
        SELECT DrawId, N11 as n_value FROM draws union all
        SELECT DrawId, N12 as n_value FROM draws union all
        SELECT DrawId, N13 as n_value FROM draws union all
        SELECT DrawId, N14 as n_value FROM draws union all
        SELECT DrawId, N15 as n_value FROM draws union all
        SELECT DrawId, N16 as n_value FROM draws union all
        SELECT DrawId, N17 as n_value FROM draws union all
        SELECT DrawId, N18 as n_value FROM draws union all
        SELECT DrawId, N19 as n_value FROM draws union all
        SELECT DrawId, N20 as n_value FROM draws
     ) as T;
     CREATE TABLE IF NOT EXISTS COMB5 AS SELECT n1, n2, n3, n4, n5, count(*) as total
FROM
    (
    SELECT up1.n_value as n1, up2.n_value as n2, up3.n_value as n3, up4.n_value as n4, up5.n_value as n5
    FROM unpivot up1
    JOIN unpivot up2
      ON up1.DrawId = up2.DrawId
     AND up1.n_value < up2.n_value
     JOIN unpivot up3
      ON up2.DrawId = up3.DrawId
     AND up2.n_value < up3.n_value
    JOIN unpivot up4
        ON up3.DrawId = up4.DrawId
    AND up3.n_value < up4.n_value
    JOIN unpivot up5
        ON up4.DrawId = up5.DrawId
    AND up4.n_value < up5.n_value
   ) T
GROUP BY n1, n2, n3, n4, n5
ORDER BY total desc;

Input sample:

Date,Hour,N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,N11,N12,N13,N14,N15,N16,N17,N18,N19,N20
2020-08-16, 15:00,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
2020-08-15, 22:40,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40
2020-08-15, 15:00,18,19,20,23,24,25,26,27,49,50,51,52,53,54,55,56,57,58,59,60
2020-08-14, 22:40,1,2,23,24,25,26,27,47,69,70,71,72,73,74,75,76,77,78,79,80
2020-08-14, 15:00,23,24,25,26,27,140,144,145,146,147,148,92,93,94,95,96,97,98,99,100

Output sample:

n1, n2, n3, n4, n5, total
23, 24, 25, 26, 27, 4

Which means (for this example) that the only combination of 5 numbers that is repeating in two or more rows is: 23,24,25,26,27 and it is found 4 times in total across all table (and this is shown by total column with the value of 4 in this case). It is count only if all those numbers are found all together.

Image showing this more clearly:

enter image description here

I can modify that SQLite working code above to find all repeating combinations of 6 numbers, 7 numbers and so on. And it works fine for each combination size I want, even for those with 15 or 18 numbers on each of them.

The only big problem is that this is very time consuming and it takes a lot of time to complete its job.

That is why I am trying to create a program that is doing the same thing but more faster than these SQLite commands.

So, I did this one below:

#include <fstream>
#include <sstream>
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
#include <cstring>
#include <unordered_map>
#include <map>
#include <tuple>
#include <set>


using namespace std;


vector<vector<int>> comb(vector<int> &myVec, int K) {
    /* Returning all combinations from a vector as a vector of vectors. */
    vector<vector<int>> all_combos; //vector to store all combos
    vector<int> cmb; //vector to store one single combo!
    std::string bitmask(K, 1); // K leading 1's
    bitmask.resize((int) myVec.size(), 0); // N-K trailing 0's

    // print integers and permute bitmask
    do {
        for (int i = 0; i < (int) myVec.size(); ++i) // [0..N-1] integers
        {
            if (bitmask[i]) {
                cmb.push_back(myVec[i]);
            }
        }//cmb finished to be generated!
        //store cmb.
        all_combos.push_back(cmb);
        cmb.clear();
    } while (std::prev_permutation(bitmask.begin(), bitmask.end()));
    return all_combos;
}

std::string join(const std::vector<int> & sequence, const std::string & separator)
{
    std::string result;
    for(size_t i = 0; i < sequence.size(); ++i)
        result += to_string(sequence[i]) + ((i != sequence.size()-1) ? separator : "");
    return result;
}

vector<vector<int>> extrageri;

int main() {
    std::ifstream inFile("./DrawsDB.csv");
    if (inFile.is_open())
    {
        std::string line;
        while( std::getline(inFile,line) )
        {
            std::stringstream ss(line);

            std::string Date, Hour, N1, N2, N3, N4, N5, N6, N7, N8, N9, N10, N11, N12, N13, N14, N15, N16, N17, N18, N19, N20;
            std::getline(ss,Date,',');
            std::getline(ss,Hour,',');
            std::getline(ss,N1,',');
            std::getline(ss,N2,',');
            std::getline(ss,N3,',');
            std::getline(ss,N4,',');
            std::getline(ss,N5,',');
            std::getline(ss,N6,',');
            std::getline(ss,N7,',');
            std::getline(ss,N8,',');
            std::getline(ss,N9,',');
            std::getline(ss,N10,',');
            std::getline(ss,N11,',');
            std::getline(ss,N12,',');
            std::getline(ss,N13,',');
            std::getline(ss,N14,',');
            std::getline(ss,N15,',');
            std::getline(ss,N16,',');
            std::getline(ss,N17,',');
            std::getline(ss,N18,',');
            std::getline(ss,N19,',');
            std::getline(ss,N20,',');


            vector<int> draw;
            draw.push_back(stoi(N1));
            draw.push_back(stoi(N2));
            draw.push_back(stoi(N3));
            draw.push_back(stoi(N4));
            draw.push_back(stoi(N5));
            draw.push_back(stoi(N6));
            draw.push_back(stoi(N7));
            draw.push_back(stoi(N8));
            draw.push_back(stoi(N9));
            draw.push_back(stoi(N10));
            draw.push_back(stoi(N11));
            draw.push_back(stoi(N12));
            draw.push_back(stoi(N13));
            draw.push_back(stoi(N14));
            draw.push_back(stoi(N15));
            draw.push_back(stoi(N16));
            draw.push_back(stoi(N17));
            draw.push_back(stoi(N18));
            draw.push_back(stoi(N19));
            draw.push_back(stoi(N20));

            extrageri.push_back(draw);

        }
    }
    inFile.close();

    for(int cmb=1; cmb<21; cmb++){
        std::unordered_map<string, set<int>> combos;
        ofstream MyFile("REPEATING_COMBOS_OF_"+ to_string(cmb)  +".csv");
        for(int i=0; i<extrageri.size(); i++){
            vector<vector<int>> temp_draws = std::vector<vector<int>>(extrageri.begin() + i+1, extrageri.end());
            for(auto & temp_draw : temp_draws){
                std::vector<int> v_intersection;
                std::set_intersection(extrageri[i].begin(), extrageri[i].end(),
                                      temp_draw.begin(), temp_draw.end(),
                                      std::back_inserter(v_intersection));

                if(v_intersection.size()==cmb){
                    string stringval = join(v_intersection, ",");
                    combos[stringval].insert(i);
                }else if(v_intersection.size()>cmb){
                    vector<vector<int>> groups = comb(v_intersection, cmb);
                    for(auto & group : groups){
                        string stringval = join(group, ",");
                        combos[stringval].insert(i);
                    }
                }
            }
        }


        for (const auto& keyvaluepair : combos)
        {
            // .first to access key, .second to access value
            MyFile << keyvaluepair.first <<","<<keyvaluepair.second.size()+1<<endl;

        }

        MyFile.close();
    }


    return 0;
}

Which is working very well until it reaches Combinations of 6. Then it crashes. Also, during this time and until it get crashed it uses all the available RAM. The SQLite command works fine on 8GB RAM PC and retrieves even repeating Combinations of 15 or even 18 while this program crashes because not enough RAM memory available even when running it on 16GB RAM PC I have.

When the full amount of RAM is used then it crashes and this error is thrown:

terminate called after throwing an instance of 'std::bad_alloc'
  what():  std::bad_alloc
Abort

I found more info about this bad_alloc: here, here, here, here and here and all of them are saying that this is caused by not enough memory available and not by the program logic itself.

My question is:

What is the problem with my C++ program and why it crashes and why SQLite commands runs very well and not get into "not enough memory" problems while this C++ program does?

Thank you very much in advance!

P.S. For the C++ sample input remove the first row (Date,Hour,N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,N11,N12,N13,N14,N15,N16,N17,N18,N19,N20) which is the .csv header I posted in order to make it clear which column is which.

Update:

Maybe it is possible to do it using python and pandas?

I tried to do it here:

The very fast way to find repeating combinations in Python using pandas?

//Tags added too.

YoYoYo
  • 439
  • 2
  • 11
  • "What is the problem with my C++ program and why it crashes and why SQLite commands runs very well and not get into "not enough memory" problems while this C++ program does?" Well, how much memory do you expect the C++ program to use? See if you can try to reason it out, by thinking about how many elements will be in the vectors. – Karl Knechtel Jun 01 '22 at 10:40
  • 1
    @KarlKnechtel Why SQLite commands runs well with the same parameters while this program crashes because not enough memory available? – YoYoYo Jun 01 '22 at 10:41
  • 1
    I 'm also using std::unordered_map> combos; inside the for loop which is reset every time the loop starts again and also the value for the unordered_map is set for memory optimizations and I am not using a vector in order to avoid the usage of too much memory. – YoYoYo Jun 01 '22 at 10:44
  • 1
    Presumably because SQLite uses a better algorithm, and does not need to build a comparably exhaustive data structure in order to compute the result. – Karl Knechtel Jun 01 '22 at 10:47
  • 1
    @KarlKnechtel Probably you are right. I don't know how SQLite works but the problem with SQLite is (like I said above) that it is very time consuming and it takes a lot to complete its job which (almost) takes for ever. That is why I want to accomplish this into C++. Do you have any idea how to avoid these memory problems, please? Thank you in advance! – YoYoYo Jun 01 '22 at 10:48
  • 1
    Building all combinations of taking N values from a vector grows exponentially and most of those won't occur multiple times. You should build up slowly. First find all vectors of length 1 that occur 2 or more times. Then use those to construct vectors of length 2 and check which occur 2 or more times. Use those to construct length 3 and so on. Most combinations will be eliminated early that way. – Goswin von Brederlow Jun 01 '22 at 11:42
  • 1
    @GoswinvonBrederlow If you take a look at my code and also at the SQLite command, you will find out that it is not building all combinations of taking N values exactly for the reason you are talking about. It just compares a row against the other row and take the common numbers (only if they match the needed length) and store it into that unordered map. Then compares the next row and so on. – YoYoYo Jun 01 '22 at 13:37

0 Answers0