fast file reading in C++, comparison of different strategies with mmap() and std::cin() performance results interpretation

Question

After advises on a question I made to close mmapped files (c++ close a open() file read with mmap) I made some comparisons and I noticed that, as suggested by some users, std::cin buffer approach is performing similar to my mmapped approach.

I decided to make a performance comparison: every script opens a file that contains other files path (about 3500), read the file and takes 10 random paths and opening all these 10 files (of 500 lines every, about 700 characters each line) and read the number of newlines in it, for 1000 times randomly.

My original version was a mapped approach (but it did not close the file at the end and it gives you an error after 300-400 file openings, not reached in this case (c++ close a open() file read with mmap)).

MMAP.OPEN (v0):

#include <algorithm>
#include <iostream>
#include <cstring>
#include <vector>
#include <set>
#include <typeinfo>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <fstream>
#include <sstream>
#include <unistd.h>

const char* map_file(const char* fname, size_t& length);
void  gnomadIndex( std::string &nomeFileIndex,
                    std::vector<std::string> &vSubfileNames,
                    std::vector<int> &vSubfileStarts,
                    std::vector<int> &vSubfileStops,
                    std::vector<std::vector<int>> &vSubfilePosizioniVector);
void gnomadSubfileAnalysis( std::string &nomeFile );
void populateVector( std::vector<int> &vec, int n );
int generateRandInt( int l0, int u0 );

int main() {

      //this take the INDEX file (with the paths to the athor) and populate these vectors
      std::string gnomadSplitDir = "/Volumes/enrico/gnomad/exomes/splitted_5c/";
      std::string nomeChr = "1";
      std::stringstream streamNomeGnomadChrDir;
      streamNomeGnomadChrDir << gnomadSplitDir << "ex_" << nomeChr << "_5c/" << "chr_" << nomeChr << "/";
      std::string pathGnomadChrDir = streamNomeGnomadChrDir.str();
      std::stringstream streamNomeGnomadChrIndex;
      streamNomeGnomadChrIndex << pathGnomadChrDir << "chr_" << nomeChr << ".txt";
      std::string pathGnomadChrIndex = streamNomeGnomadChrIndex.str();
      std::vector<std::string> vSubfileNames;
      std::vector<int> vSubfileStarts;
      std::vector<int> vSubfileStops;
      std::vector<std::vector<int>> vSubfilePosizioniVector;
      gnomadIndex(pathGnomadChrIndex,
                  vSubfileNames,
                  vSubfileStarts,
                  vSubfileStops,
                  vSubfilePosizioniVector
                );
      std::vector<std::string> vGnomadSubfilePaths;
      srand((unsigned)time(NULL)); //seeds the pseudo random number generator that rand() uses (http://www.cplusplus.com/forum/beginner/29699/)
      int size0 = 10;
      std::vector<int> v0;
      populateVector(v0, size0);

      //the vector with the file names is converted in file paths and then opened and line counted for each file
       std::vector<std::string> vSubfileNames2;
       for (auto si : v0) vSubfileNames2.push_back(vSubfileNames[si]);
       for ( int subCount = 0; subCount < vSubfileNames2.size(); subCount++ ) {
        std::stringstream streamNomeSubfileGnomad;
        streamNomeSubfileGnomad << pathGnomadChrDir << vSubfileNames2[subCount];
        std::string pathGnomadSubfile = streamNomeSubfileGnomad.str();
        gnomadSubfileAnalysis(pathGnomadSubfile);
      }

}




void gnomadSubfileAnalysis( std::string &nomeFile ) {
  size_t length;
  auto f = map_file(nomeFile.c_str(), length);
  auto l = f + length;
  std::vector<int> v0;
  for (int i=0; i<length; i++) if (f[i] == '\n') v0.push_back(i);
  std::cout << "subfile: " << nomeFile << ", has: " << v0.size() << " rows in: " << length << " bytes." << '\n';
}

void  gnomadIndex( std::string &nomeFileIndex,
                    std::vector<std::string> &vSubfileNames,
                    std::vector<int> &vSubfileStarts,
                    std::vector<int> &vSubfileStops,
                    std::vector<std::vector<int>> &vSubfilePosizioniVector
                  ) {
  size_t length;
  auto f = map_file(nomeFileIndex.c_str(), length);
  auto l = f + length;

  std::vector<int> v0;

  v0.push_back(0); 
  for (int i=0; i<length; i++) {
    if (f[i] == '\n') v0.push_back(i+1);
  }
  v0.pop_back();
  for (int nl = 0; nl < v0.size(); nl++) {
    int ncol = 0;
    std::vector<char> vSubfile;
    std::vector<char> vSubStart;
    std::vector<char> vSubStop;
    std::vector<char> vSubPos;
    std::vector<int> vSubfilePosVector;

    for( int start=v0[nl]; start<v0[nl+1]; start++  ) {
      if (f[start] == ',') ncol++;

      if (ncol == 0) {
        if ( f[start] != ',' ) {
          vSubfile.push_back(f[start]);
        }
      }

      if (ncol == 1) {
        if ( f[start] != ',' ) {
          vSubStart.push_back(f[start]);
        }
      }

      if (ncol == 2) {
        if ( f[start] != ',' ) {
          vSubStop.push_back(f[start]);
        }
      }

      if (ncol == 3) {
        if ( f[start] != ',' ) {
          if ( f[start] != ':' ) {
            vSubPos.push_back(f[start]);
          }
          if ( f[start] == ':' || f[start] == '\n' ) {

              std::string subfilePosValue ( vSubPos.begin(), vSubPos.end() );
              vSubPos.clear();

              vSubfilePosVector.push_back( stoi(subfilePosValue) );
              subfilePosValue.erase();
            }
          }
        }
      }

  std::string subFileValue ( vSubfile.begin(), vSubfile.end() );
  vSubfile.clear();
  std::string subfileStartValue ( vSubStart.begin(), vSubStart.end() );
  vSubStart.clear();
  std::string subfileStopValue ( vSubStop.begin(), vSubStop.end() );
  vSubStop.clear();

  vSubfileNames.push_back( subFileValue );
  vSubfileStarts.push_back( stoi( subfileStartValue ) );
  vSubfileStops.push_back( stoi(subfileStopValue) );
  vSubfilePosizioniVector.push_back( vSubfilePosVector );
  }
}

void handle_error(const char* msg) {
    perror(msg);
    exit(255);
}

const char* map_file(const char* fname, size_t& length) {

    int fd = open(fname, O_RDONLY);

    if (fd == -1)
        handle_error("open");

    struct stat sb;

    if (fstat(fd, &sb) == -1)
        handle_error("fstat");

    length = sb.st_size;

    const char* addr = static_cast<const char*>(mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, 0u));
    if (addr == MAP_FAILED)
        handle_error("mmap");

    return addr;
}

void populateVector( std::vector<int> &vec, int n ) {
  for( int i = 0 ; i < n ; ++i ) vec.push_back( 0 + rand() % (3000 - 1 + 1)); //gli ultimi due num sono upper e lower range limits
  sort( vec.begin(), vec.end() );
}

int generateRandInt( int l0, int u0 ) {
  int n0 = l0 + rand() % (u0 - l0 + 1);
  return n0;
}

As suggested, I then tried with CIN (v1), here the code:

#include <iostream>
#include <fstream>
#include <string>
#include <cstring>
#include <vector>
#include <sstream>


void read_file_cin( std::string &nomeFile );
void read_gnomad_index_cin( std::string &nomeFile, char &separator, std::vector<std::string> &vectorName );
void populateVector( std::vector<int> &vec, int n );
int generateRandInt( int l0, int u0 );
std::vector<int> sovrapposizioniVectorsInt( std::vector<int> &v0, std::vector<int> &v1 );

int main() {

  std::string pathGnomadChrIndex = "/Volumes/enrico/gnomad/exomes/splitted_5c/ex_1_5c/chr_1/chr_1.txt";
  char separator = ',';
  std::vector<std::string> vSubfileNames;

  read_gnomad_index_cin(pathGnomadChrIndex, separator, vSubfileNames);

  srand((unsigned)time(NULL)); //seeds the pseudo random number generator that rand() uses (http://www.cplusplus.com/forum/beginner/29699/)
  int size0 = 10;
  std::vector<int> v0;
  populateVector(v0, size0);

   std::vector<std::string> vSubfileNames2;
   for (auto si : v0) vSubfileNames2.push_back(vSubfileNames[si]);

   std::string gnomadSplitDir = "/Volumes/enrico/gnomad/exomes/splitted_5c/ex_1_5c/chr_1/";
   for ( auto vi : vSubfileNames2 ) {
     std::cout << vi << '\n';
     std::stringstream streamNomeGnomadChrDir;
     streamNomeGnomadChrDir << gnomadSplitDir << vi;
     std::string pathSubfile = streamNomeGnomadChrDir.str();
     read_file_cin( pathSubfile );
   }
}

void first_column_cin( char &separator, std::vector<std::string> &vectorName) {
    std::string line;
    int lineCount = 0;
    while(std::getline(std::cin, line)) {
      std::string filename;
      std::istringstream iss(line); 
      getline(iss, filename, separator); 
      vectorName.push_back(filename);
      lineCount++;
    }
}

void read_gnomad_index_cin( std::string &nomeFile, char &separator, std::vector<std::string> &vectorName ) {
    std::ifstream in(nomeFile.c_str());
    std::streambuf *cinbuf = std::cin.rdbuf(); 
    std::cin.rdbuf(in.rdbuf());

    first_column_cin( separator, vectorName );

    std::cin.rdbuf(cinbuf); 
}

void file_countline_cin() {
    std::string line;
    int lineCount = 0;
    while(std::getline(std::cin, line)) lineCount++;
    std::cout << "file has: " << lineCount << " rows." << '\n';
}
void read_file_cin( std::string &nomeFile ) {
    std::ifstream in(nomeFile.c_str());
    std::streambuf *cinbuf = std::cin.rdbuf(); //save old buf
    std::cin.rdbuf(in.rdbuf()); //redirect std::cin to in.txt!
    file_countline_cin(); //call function
    std::cin.rdbuf(cinbuf);   //reset to standard input again
}

void populateVector( std::vector<int> &vec, int n ) {
  for( int i = 0 ; i < n ; ++i ) vec.push_back( 0 + rand() % (3000 - 1 + 1)); //gli ultimi due num sono upper e lower range limits
  sort( vec.begin(), vec.end() );
}

int generateRandInt( int l0, int u0 ) {
  int n0 = l0 + rand() % (u0 - l0 + 1);
  return n0;
}

I then tried to speed up CIN (https://www.geeksforgeeks.org/fast-io-for-competitive-programming/) with:

std::ios_base::sync_with_stdio(false);
std::cin.tie(NULL);

here the CIN.SPEED (v2) version:

void read_file_cin( std::string &nomeFile );
void read_gnomad_index_cin( std::string &nomeFile, char &separator, std::vector<std::string> &vectorName );


void populateVector( std::vector<int> &vec, int n );
int generateRandInt( int l0, int u0 );

std::vector<int> sovrapposizioniVectorsInt( std::vector<int> &v0, std::vector<int> &v1 );

int main() {

  std::string pathGnomadChrIndex = "/Volumes/enrico/gnomad/exomes/splitted_5c/ex_1_5c/chr_1/chr_1.txt";
  char separator = ',';
  std::vector<std::string> vSubfileNames;

  read_gnomad_index_cin(pathGnomadChrIndex, separator, vSubfileNames);

  srand((unsigned)time(NULL));
  int size0 = 10;
  std::vector<int> v0;
  populateVector(v0, size0);

   std::vector<std::string> vSubfileNames2;
   for (auto si : v0) vSubfileNames2.push_back(vSubfileNames[si]);

   std::string gnomadSplitDir = "/Volumes/enrico/gnomad/exomes/splitted_5c/ex_1_5c/chr_1/";
   for ( auto vi : vSubfileNames2 ) {
     std::cout << vi << '\n';
     std::stringstream streamNomeGnomadChrDir;
     streamNomeGnomadChrDir << gnomadSplitDir << vi;
     std::string pathSubfile = streamNomeGnomadChrDir.str();
     read_file_cin( pathSubfile );
   }
}

void first_column_cin( char &separator, std::vector<std::string> &vectorName) {
  /* SPEEDUP */
  std::ios_base::sync_with_stdio(false);
  std::cin.tie(NULL);
  /* SPEEDUP */
  std::string line;
    int lineCount = 0;
    while(std::getline(std::cin, line)) {
      std::string filename;
      std::istringstream iss(line); // string stream
      getline(iss, filename, separator); // read first part up to comma, ignore the comma
      vectorName.push_back(filename);
      lineCount++;
    }
}

void read_gnomad_index_cin( std::string &nomeFile, char &separator, std::vector<std::string> &vectorName ) {
  /* SPEEDUP */
  std::ios_base::sync_with_stdio(false);
  std::cin.tie(NULL);
  /* SPEEDUP */
    std::ifstream in(nomeFile.c_str());
    std::streambuf *cinbuf = std::cin.rdbuf(); //save old buf
    std::cin.rdbuf(in.rdbuf()); //redirect std::cin to in.txt!

    first_column_cin( separator, vectorName ); //call function

    std::cin.rdbuf(cinbuf);   //reset to standard input again
}

void file_countline_cin() {
  /* SPEEDUP */
  std::ios_base::sync_with_stdio(false);
  std::cin.tie(NULL);
  /* SPEEDUP */
    std::string line;
    int lineCount = 0;
    while(std::getline(std::cin, line)) lineCount++;
    std::cout << "file has: " << lineCount << " rows." << '\n';
}


void read_file_cin( std::string &nomeFile ) {
  /* SPEEDUP */
  std::ios_base::sync_with_stdio(false);
  std::cin.tie(NULL);
  /* SPEEDUP */
    std::ifstream in(nomeFile.c_str());
    std::streambuf *cinbuf = std::cin.rdbuf(); //save old buf
    std::cin.rdbuf(in.rdbuf()); //redirect std::cin to in.txt!

    file_countline_cin(); //call function
    std::cin.rdbuf(cinbuf);   //reset to standard input again
}

I then tried with a version of mmap that munmapped() and close() the file that pushed all file content in a vector and then return the vector to analyze file (just counting newline in this test) content while it was munmap and closing the mmap and file.

Here MMAP.VECTOR (v3):

const char* map_file(const char* fname, size_t& length);

void mmap_file( std::string &filename, std::vector<char> &vFile);

void  gnomadIndex( std::string &nomeFileIndex,
                    std::vector<std::string> &vSubfileNames,
                    std::vector<int> &vSubfileStarts,
                    std::vector<int> &vSubfileStops,
                    std::vector<std::vector<int>> &vSubfilePosizioniVector
                  );
void gnomadSubfileAnalysis( std::string &nomeFile );
void populateVector( std::vector<int> &vec, int n );
int generateRandInt( int l0, int u0 );

int main() {

      std::string gnomadSplitDir = "/Volumes/enrico/gnomad/exomes/splitted_5c/";
      std::string nomeChr = "1";

      std::stringstream streamNomeGnomadChrDir;
      streamNomeGnomadChrDir << gnomadSplitDir << "ex_" << nomeChr << "_5c/" << "chr_" << nomeChr << "/";
      std::string pathGnomadChrDir = streamNomeGnomadChrDir.str();
      std::stringstream streamNomeGnomadChrIndex;
      streamNomeGnomadChrIndex << pathGnomadChrDir << "chr_" << nomeChr << ".txt";
      std::string pathGnomadChrIndex = streamNomeGnomadChrIndex.str();

      std::vector<std::string> vSubfileNames;
      std::vector<int> vSubfileStarts;
      std::vector<int> vSubfileStops;
      std::vector<std::vector<int>> vSubfilePosizioniVector;

      gnomadIndex(pathGnomadChrIndex,
                  vSubfileNames,
                  vSubfileStarts,
                  vSubfileStops,
                  vSubfilePosizioniVector
                );

      std::vector<std::string> vGnomadSubfilePaths;


     srand((unsigned)time(NULL));
      int size0 = 10;
      std::vector<int> v0;
      populateVector(v0, size0);

       std::vector<std::string> vSubfileNames2;
       for (auto si : v0) vSubfileNames2.push_back(vSubfileNames[si]);

      for ( int subCount = 0; subCount < vSubfileNames2.size(); subCount++ ) {

        std::stringstream streamNomeSubfileGnomad;
        streamNomeSubfileGnomad << pathGnomadChrDir << vSubfileNames2[subCount];
        std::string pathGnomadSubfile = streamNomeSubfileGnomad.str();

        gnomadSubfileAnalysis(pathGnomadSubfile);
      }

}

void gnomadSubfileAnalysis( std::string &nomeFile ) {

  std::vector<char> f;
  mmap_file(nomeFile, f);

  std::vector<int> v0;

  for (int i=0; i<f.size(); i++) if (f[i] == '\n') v0.push_back(i);

  std::cout << "subfile: " << nomeFile << ", has: " << v0.size() << " rows in: " << f.size() << " bytes." << '\n';
}

void  gnomadIndex( std::string &nomeFileIndex,
                    std::vector<std::string> &vSubfileNames,
                    std::vector<int> &vSubfileStarts,
                    std::vector<int> &vSubfileStops,
                    std::vector<std::vector<int>> &vSubfilePosizioniVector
                  ) {


  std::vector<char> f;
  mmap_file(nomeFileIndex, f);
  std::vector<int> v0;

  v0.push_back(0); //la prima riga mi serve qui
  for (int i=0; i<f.size(); i++) {
    if (f[i] == '\n') v0.push_back(i+1);
  }
  v0.pop_back();

  for (int nl = 0; nl < v0.size(); nl++) {

    int ncol = 0;

    std::vector<char> vSubfile;
    std::vector<char> vSubStart;
    std::vector<char> vSubStop;
    std::vector<char> vSubPos;
    std::vector<int> vSubfilePosVector;

    for( int start=v0[nl]; start<v0[nl+1]; start++  ) {

      if (f[start] == ',') ncol++;

      if (ncol == 0) {
        if ( f[start] != ',' ) {
          vSubfile.push_back(f[start]);
        }
      }

      if (ncol == 1) {
        if ( f[start] != ',' ) {
          vSubStart.push_back(f[start]);
        }
      }

      if (ncol == 2) {
        if ( f[start] != ',' ) {
          vSubStop.push_back(f[start]);
        }
      }

      if (ncol == 3) {
        if ( f[start] != ',' ) {
          if ( f[start] != ':' ) {
            vSubPos.push_back(f[start]);
          }
          if ( f[start] == ':' || f[start] == '\n' ) {

              std::string subfilePosValue ( vSubPos.begin(), vSubPos.end() );
              vSubPos.clear();

              vSubfilePosVector.push_back( stoi(subfilePosValue) );
              subfilePosValue.erase();
            }
          }
        }
      }

  std::string subFileValue ( vSubfile.begin(), vSubfile.end() );
  vSubfile.clear();
  std::string subfileStartValue ( vSubStart.begin(), vSubStart.end() );
  vSubStart.clear();
  std::string subfileStopValue ( vSubStop.begin(), vSubStop.end() );
  vSubStop.clear();

  vSubfileNames.push_back( subFileValue );
  vSubfileStarts.push_back( stoi( subfileStartValue ) );
  vSubfileStops.push_back( stoi(subfileStopValue) );
  vSubfilePosizioniVector.push_back( vSubfilePosVector );
  }
}


void handle_error(const char* msg) {
    perror(msg);
    exit(255);
}

const char* map_file(const char* fname, size_t& length) {

    int fd = open(fname, O_RDONLY);

    if (fd == -1)
        handle_error("open");

    struct stat sb;

    if (fstat(fd, &sb) == -1)
        handle_error("fstat");

    length = sb.st_size;

    const char* addr = static_cast<const char*>(mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, 0u));
    if (addr == MAP_FAILED)
        handle_error("mmap");

    return addr;
}



void populateVector( std::vector<int> &vec, int n ) {
  for( int i = 0 ; i < n ; ++i ) vec.push_back( 0 + rand() % (3000 - 1 + 1)); //gli ultimi due num sono upper e lower range limits
  sort( vec.begin(), vec.end() );
}

int generateRandInt( int l0, int u0 ) {
  int n0 = l0 + rand() % (u0 - l0 + 1);
  return n0;
}


size_t getFilesize(const char* filename) {
    struct stat st;
    stat(filename, &st);
    return st.st_size;
}


void mmap_file( std::string &filename, std::vector<char> &vFile) {
    size_t filesize = getFilesize(filename.c_str());
    int fd = open(filename.c_str(), O_RDONLY, 0);
    assert(fd != -1);
    void* mmappedData = mmap(NULL, filesize, PROT_READ, MAP_PRIVATE, fd, 0);
    assert(mmappedData != MAP_FAILED);

    const char* f = static_cast<const char*>(mmappedData);

    for ( int i = 0; i < filesize; i++ ) vFile.push_back(f[i]);

    int rc = munmap(mmappedData, filesize);
    assert(rc == 0);
    close(fd);

}

But I immediatedly saw it was slower, I thought it could be due to the further passage of filling the vector so I tried a version that counted newlines while mmap was open and then closed it.

MMAP.CLOSE (v4):

 const char* map_file(const char* fname, size_t& length);

void mmap_file( std::string &filename, std::vector<char> &vFile);
int mmap_file_nlines( std::string &filename );
void  gnomadIndex( std::string &nomeFileIndex,
                    std::vector<std::string> &vSubfileNames,
                    std::vector<int> &vSubfileStarts,
                    std::vector<int> &vSubfileStops,
                    std::vector<std::vector<int>> &vSubfilePosizioniVector
                  );
void gnomadSubfileAnalysis( std::string &nomeFile );
void populateVector( std::vector<int> &vec, int n );
int generateRandInt( int l0, int u0 );

int main() {

      std::string gnomadSplitDir = "/Volumes/enrico/gnomad/exomes/splitted_5c_text/";
      std::string nomeChr = "1";

      std::stringstream streamNomeGnomadChrDir;
      streamNomeGnomadChrDir << gnomadSplitDir << "chr_" << nomeChr << "/";
      std::string pathGnomadChrDir = streamNomeGnomadChrDir.str();
      std::stringstream streamNomeGnomadChrIndex;
      streamNomeGnomadChrIndex << pathGnomadChrDir << "chr_" << nomeChr << ".txt";
      std::string pathGnomadChrIndex = streamNomeGnomadChrIndex.str();

      std::vector<std::string> vSubfileNames;
      std::vector<int> vSubfileStarts;
      std::vector<int> vSubfileStops;
      std::vector<std::vector<int>> vSubfilePosizioniVector;

      gnomadIndex(pathGnomadChrIndex,
                  vSubfileNames,
                  vSubfileStarts,
                  vSubfileStops,
                  vSubfilePosizioniVector
                );

      std::vector<std::string> vGnomadSubfilePaths;


      srand((unsigned)time(NULL));
      int size0 = 10;
      std::vector<int> v0;
      populateVector(v0, size0);
       std::vector<std::string> vSubfileNames2;
       for (auto si : v0) vSubfileNames2.push_back(vSubfileNames[si]);

      for ( int subCount = 0; subCount < vSubfileNames2.size(); subCount++ ) {
        std::stringstream streamNomeSubfileGnomad;
        streamNomeSubfileGnomad << pathGnomadChrDir << vSubfileNames2[subCount];
        std::string pathGnomadSubfile = streamNomeSubfileGnomad.str();
        gnomadSubfileAnalysis(pathGnomadSubfile);
      }

}

void gnomadSubfileAnalysis( std::string &nomeFile ) {
  int nLinee = mmap_file_nlines(nomeFile);

  std::cout << "subfile: " << nomeFile << ", has: " << nLinee << " rows" << '\n';
}

void  gnomadIndex( std::string &nomeFileIndex,
                    std::vector<std::string> &vSubfileNames,
                    std::vector<int> &vSubfileStarts,
                    std::vector<int> &vSubfileStops,
                    std::vector<std::vector<int>> &vSubfilePosizioniVector
                  ) {

  size_t length;
  auto f = map_file(nomeFileIndex.c_str(), length);
  auto l = f + length;
  std::vector<int> v0;
  v0.push_back(0); //la prima riga mi serve qui
  for (int i=0; i<length; i++) {
    if (f[i] == '\n') v0.push_back(i+1);
  }
  v0.pop_back();

  for (int nl = 0; nl < v0.size(); nl++) {

    int ncol = 0;
    std::vector<char> vSubfile;
    std::vector<char> vSubStart;
    std::vector<char> vSubStop;
    std::vector<char> vSubPos;
    std::vector<int> vSubfilePosVector;

    for( int start=v0[nl]; start<v0[nl+1]; start++  ) {
      if (f[start] == ',') ncol++;
      if (ncol == 0) {
        if ( f[start] != ',' ) {
          vSubfile.push_back(f[start]);
        }
      }
      if (ncol == 1) {
        if ( f[start] != ',' ) {
          vSubStart.push_back(f[start]);
        }
      }
      if (ncol == 2) {
        if ( f[start] != ',' ) {
          vSubStop.push_back(f[start]);
        }
      }
      if (ncol == 3) {
        if ( f[start] != ',' ) {
          if ( f[start] != ':' ) {
            vSubPos.push_back(f[start]);
          }
          if ( f[start] == ':' || f[start] == '\n' ) {
              std::string subfilePosValue ( vSubPos.begin(), vSubPos.end() );
              vSubPos.clear();
              vSubfilePosVector.push_back( stoi(subfilePosValue) );
              subfilePosValue.erase();
            }
          }
        }
      }

  std::string subFileValue ( vSubfile.begin(), vSubfile.end() );
  vSubfile.clear();
  std::string subfileStartValue ( vSubStart.begin(), vSubStart.end() );
  vSubStart.clear();
  std::string subfileStopValue ( vSubStop.begin(), vSubStop.end() );
  vSubStop.clear();
  vSubfileNames.push_back( subFileValue );
  vSubfileStarts.push_back( stoi( subfileStartValue ) );
  vSubfileStops.push_back( stoi(subfileStopValue) );
  vSubfilePosizioniVector.push_back( vSubfilePosVector );
  }
}

void handle_error(const char* msg) {
    perror(msg);
    exit(255);
}
const char* map_file(const char* fname, size_t& length) {

    int fd = open(fname, O_RDONLY);
    if (fd == -1)
        handle_error("open");
    struct stat sb;
    if (fstat(fd, &sb) == -1)
        handle_error("fstat");
    length = sb.st_size;

    const char* addr = static_cast<const char*>(mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, 0u));
    if (addr == MAP_FAILED)
        handle_error("mmap");
    return addr;
}

void populateVector( std::vector<int> &vec, int n ) {
  for( int i = 0 ; i < n ; ++i ) vec.push_back( 0 + rand() % (3000 - 1 + 1)); //gli ultimi due num sono upper e lower range limits
  sort( vec.begin(), vec.end() );
}

int generateRandInt( int l0, int u0 ) {
  int n0 = l0 + rand() % (u0 - l0 + 1);
  return n0;
}

size_t getFilesize(const char* filename) {
    struct stat st;
    stat(filename, &st);
    return st.st_size;
}

void mmap_file( std::string &filename, std::vector<char> &vFile) {
    size_t filesize = getFilesize(filename.c_str());
    int fd = open(filename.c_str(), O_RDONLY, 0);
    assert(fd != -1);
    void* mmappedData = mmap(NULL, filesize, PROT_READ, MAP_PRIVATE, fd, 0);
    assert(mmappedData != MAP_FAILED);

    const char* f = static_cast<const char*>(mmappedData);

    for ( int i = 0; i < filesize; i++ ) vFile.push_back(f[i]);
    int rc = munmap(mmappedData, filesize);
    assert(rc == 0);
    close(fd);
}

int mmap_file_nlines( std::string &filename ) {
    size_t filesize = getFilesize(filename.c_str());
    int fd = open(filename.c_str(), O_RDONLY, 0);
    assert(fd != -1);
    void* mmappedData = mmap(NULL, filesize, PROT_READ, MAP_PRIVATE, fd, 0);
    assert(mmappedData != MAP_FAILED);

    const char* f = static_cast<const char*>(mmappedData);
    int lineCount =0 ;
    for ( int i = 0; i < filesize; i++ ) if( f[i] == '\n' ) lineCount++;
    int rc = munmap(mmappedData, filesize);
    assert(rc == 0);
    close(fd);

    return lineCount;

}

Here the means in seconds for 1000 tries with each approach:

          cin 0.983380
    cin.speed 0.989011
   mmap.close 2.863860
    mmap.open 0.915395
  mmap.vector 4.683976

I was someway surprised that the "speed-up" for CIN changed nothing, (maybe I got something wrong?) and that closing the file was so time consuming for mmap!

Here the plot of the results.

Any comments by more experts people are really welcomed!

PS: my machine is a iMac (Retina 5K, 27-inch, Late 2014), 4 GHz Intel Core i7, 16 GB 1600 MHz DDR3

Please reformat your code with logical indentation. In its current shape it's mostly unreadable. — Sam Varshavchik, Mar 27 '19 at 14:37
Sorry, it's passage through bash that removes indentation, I'm editing — cccnrc, Mar 27 '19 at 14:38
Try this to format: http://format.krzaq.cc/ then paste the result back select it an hit the {} button (which will format it in a code block). — drescherjm, Mar 27 '19 at 14:40
to print the 4 spaces, if I do with atom (my editor) it pass the code further away. I' editing anyway — cccnrc, Mar 27 '19 at 14:40

fast file reading in C++, comparison of different strategies with mmap() and std::cin() performance results interpretation

0 Answers0

Linked