1

I have a big txt file (100MB with 23 milion lines) and i want to open it line by line and shuffle it like the GNU shuf command from linux. I work on a windows platform, i installed Visual Studio 2015 and start programing in c++. I tried the first time using an old c++ code of mine but it was too slow and i switched to boost libraries. I have to admit, it is really fast, but i don't know ho to put results into an array and shuffle them (array must hold up to 100.000.000 indexs).

This is what i try

#include <boost/iostreams/device/mapped_file.hpp> // for mmap
#include <algorithm>  // for std::find
#include <iostream>   // for std::cout
#include <cstring>

#include <fstream>
#include <sstream>
#include <string>

int main()
{
    boost::iostreams::mapped_file mmap("input.txt", boost::iostreams::mapped_file::readonly);
    auto f = mmap.const_data();
    auto l = f + mmap.size();

    uintmax_t m_numLines = 0;
    int inc1 = 0;

    char ** ip = NULL;

    boost::array<char, sizeof(int)> send_buf; <-- error here
    /*
    Severity    Code    Description Project File    Line    Suppression State
    Error (active)      namespace "boost" has no member "array" hshuffle    c:\path_to_the\main.cpp 21  
    Severity    Code    Description Project File    Line    Suppression State
    Error (active)      type name is not allowed    hshuffle    c:\path_to_the\main.cpp 21  
    Severity    Code    Description Project File    Line    Suppression State
    Error (active)      identifier "send_buf" is undefined  hshuffle    c:\path_to_the\main.cpp 21  
    Severity    Code    Description Project File    Line    Suppression State
    Error (active)      a value of type "const char *" cannot be assigned to an entity of type "char *" hshuffle    c:\path_to_the\main.cpp 29  
    */

    while (f && f != l)
    {
        if ((f = static_cast<const char*>(memchr(f, '\n', l - f))))
        {
            if ((m_numLines % 1000000) == 0)
            {
                ip[m_numLines] = l;
                std::cout << m_numLines << "\n";
            }


            m_numLines++, f++;
        }
    }

    std::cout << "m_numLines = " << m_numLines << "\n";




    printf("endfille\n");

    char a;
    std::cin >> a;
}

OLD C++ program

puts("reading ips file [./i]");

if((fp=fopen("i","r")) == NULL)
{ 
   printf("FATAL: Cant find i\n");
   return -1;
}

int increment_ips = 0;
indIP = 0;
while (fgets(nutt,2024,fp))
{
    while (t = strchr (nutt,'\n'))
        *t = ' ';

    temp = strtok (nutt, " ");

    if (temp != NULL) {
        string = strdup (temp);
        indIP++;

        while (temp = strtok (NULL, " "))
        {
            indIP++;
        }
    }

    increment_ips++;
}
fclose(fp);




if((fp=fopen("i","r")) == NULL)
{ 
   printf("FATAL: Cant find i\n");
   return -1;
}

increment_ips = 0;
ip = new char*[indIP];
indIP = 0;

while (fgets(nutt,2024,fp))
{
    while (t = strchr (nutt,'\n'))
        *t = ' ';

    temp = strtok (nutt, " ");

    if (temp != NULL) {
        string = strdup (temp);     
        ip[indIP++]=string;

        while (temp = strtok (NULL, " "))
        {
            string = strdup (temp);

            ip[indIP++]=string;
        }
    }

    increment_ips++;
}
fclose(fp);

// shuffle
printf("Loaded [%d] ips\n",increment_ips);

puts("Shuffeling ips");
srand(time(NULL));
for(int i = 0; i <= increment_ips; i++)
{
    int randnum = rand() % increment_ips + 1;
    char* tempval;
    tempval = ip[i];

    ip[i] = ip[randnum];
    ip[randnum] = tempval;
}
puts("Shuffeled");

Any solutions? I preffer boost thus it is really fast.

Thanks.

Danio Varacovic
  • 59
  • 1
  • 1
  • 6
  • Do you just want to know how to do a random sort on an array?(not bogo, really make it random) – turoni Jun 09 '16 at 10:06
  • shuffle a big text file, right now i don't know how to define an array and store there variables, the array must hold 100milion+ lines – Danio Varacovic Jun 09 '16 at 10:10
  • I have never done this myself but I think you'd be best using a [memory based B+ tree](http://stackoverflow.com/questions/1720738/looking-for-a-disk-based-b-tree-implementation-in-c-or-c) to hold that much indexes. – turoni Jun 09 '16 at 10:20
  • http://abiusx.com/me/code/wb/ -> i need it for windows, not linux – Danio Varacovic Jun 09 '16 at 10:40

1 Answers1

1

The "old" program reads the input file twice, the first time to count the space separeted words (not the lines, it seems) the second one to actually store the data in an array. Using a std::vector of std::string there's no need to know in advance the exact number of elements, one can reserve some space and let the memory managment to the standard library.

Since C++11 it's also possible to use std::shuffle to do what OP need. However, it's hard to imagine a cache friendly implementation of a Fisher–Yates (or Knuth) shuffle algorithm for such big an array (millions of elements).

I don't know how to put results into an array and shuffle them

A possible solution (without Boost) may be:

#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <algorithm>
#include <random>

using std::string;
using std::vector;
using std::cout;

int main() {
    // initialize random number generator
    std::random_device rd;
    std::mt19937 g(rd());

    // open input file  
    string file_name{"input.txt"};
    std::ifstream in_file{file_name};
    if ( !in_file ) {
        std::cerr << "Error: Failed to open file \"" << file_name << "\"\n";
        return -1;
    }

    vector<string> words;
    // if you want to avoid too many reallocations:
    const int expected = 100000000;
    words.reserve(expected);

    string word;
    while ( in_file >> word ) {
        words.push_back(word);
    }

    std::cout << "Number of elements read: " << words.size() << '\n';
    std::cout << "Beginning shuffle..." << std::endl;

    std::shuffle(words.begin(),words.end(),g);

    std::cout << "Shuffle done." << std::endl;

    // do whatever you need to do with the shuffled vector...

    return 0;
}
Bob__
  • 12,361
  • 3
  • 28
  • 42