0

I am quite new to C++ so sorry if I ask something silly, but I found no answer online (only a post that refers to python (Can mmap and gzip collaborate?)), trying to see if is possible to read a .GZ file through mmap() function (following: Fast textfile reading in c++) in order to operate some operations on the file and write it down on another file. I need to retain only some part of the original rows and columns based on some columns/fields values to later retrieve them and compare with other similar files but from different subjects, in order to extract similarities/differences. The files are very big (up to 10GB .GZ) so I am trying to use fast comparison methods for GZIP files. It is more a "performance comparison" with other methods. Here is it the code (sorry, it is long and I think awful):

#include <algorithm>
#include <iostream>
#include <cstring>
#include <vector>
#include <typeinfo>

// for mmap:
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>

//for writefile
#include <fstream>

template <int N>
void emptyArray( char (&arrayName) [N] ) {
  std::fill( std::begin( arrayName ), std::end( arrayName ), 0 );
}

const char* map_file(const char* fname, size_t& length);

int main() {

//prende la dimensione del file da aprire
size_t length;
auto f = map_file("myfile.vcf", length);
auto l = f + length;


uintmax_t m_numLines = 0;

std::vector<int> v0;
std::vector<int> v1;
std::vector<int> v2;

for (int i=1; i<length; i++) {
  //vettore di posizioni # in prima posizione di una linea
  if (f[i] == '#' && f[i-1] == '\n') v0.push_back(i);
  //vettore di nuove linee
  if (f[i] == '\n') v1.push_back(i+1);
}

std::vector<int> inter;
set_intersection(v0.begin(), v0.end(),
                  v1.begin(), v1.end(),
                  back_inserter(inter));

v1.erase(set_difference(v1.begin(), v1.end(),
                        inter.begin(), inter.end(),
                        v1.begin()), v1.end());

v1.pop_back();


char chromArray[3];
char posArray[10];
char refArray[50];
char altArray[50];
char qualityArray[10];
char gtArray[4];
char gqxArray[5];
char dpArray[5];
char adArray[10];

//LOOP per NUM RIGA
//apro loop su vettore NL (non #)
for (int nl =0; nl<v1.size(); nl++) {

  //CONTATORI

  int ncol = 0;
  int chri = 0;
  int posi = 0;
  int refi = 0;
  int alti = 0;

  int qi = 0;
  int formatHeaderCount = 0;
  int formatLastCount = 0;
  int numGT = 0;
  int gti = 0;
  int numGQX = 0;
  int gqxi = 0;
  int numDP = 0;
  int dpi = 0;
  int numAD = 0;
  int adi = 0;

  std::string chromValue;
  emptyArray(chromArray);
  std::string posValue;
  emptyArray(posArray);
  std::string refValue;
  emptyArray(refArray);
  std::string altValue;
  emptyArray(altArray);
  std::string quality;
  emptyArray(qualityArray);
  std::string gtValue;
  emptyArray(gtArray);
  std::string gqxValue;
  emptyArray(gqxArray);
  std::string dpValue;
  emptyArray(dpArray);
  std::string adValue;
  emptyArray(adArray);

  for( int start=v1[nl]; start<v1[nl+1]; start++  ) {
    if (f[start] == '\t') ncol++;
    if (ncol == 0) {
      if ( f[start] != '\t' && f[start] != 'c' && f[start] != 'h' && f[start] != 'r' ) {
        chromArray[chri] = f[start];
        chri++;
      }
    }

    if (ncol == 1) {
      if ( f[start] != '\t' ) {
        posArray[posi] = f[start];
        posi++;
      }
    }

    if (ncol == 3) {
      if ( f[start] != '\t' ) {
        refArray[refi] = f[start];
        refi++;
      }
    }

    if (ncol == 4) {
      if ( f[start] != '\t' ) {
        altArray[alti] = f[start];
        alti++;
      }
    }

    if (ncol == 5) {
      if ( f[start] != '\t' ) {
        qualityArray[qi] = f[start];
        qi++;
      }
    }

    if (ncol == 8) {
      if ( f[start] != '\t' ) {
        if (f[start] == ':') formatHeaderCount++;
        if (f[start] == 'G' && f[start+1] == 'T' && f[start+2] == ':' ) {
          numGT = formatHeaderCount;
        }
        if (f[start] == ':' && f[start+1] == 'G' && f[start+2] == 'Q' &&  f[start+3] == 'X' && f[start+4] == ':') {
          numGQX = formatHeaderCount;
        }

        if (f[start] == ':' && f[start+1] == 'D' && f[start+2] == 'P' && ( f[start+3] == ':' || ( f[start+3] == 'I' && f[start+4] == ':') )) {
          numDP = formatHeaderCount;
        }

        if (f[start] == ':' && f[start+1] == 'A' && f[start+2] == 'D' && f[start+3] == ':' ) {
          numAD = formatHeaderCount;
        }

      }
    }


    if (ncol == 9) {
      if ( f[start] != '\t' ) {
        if (f[start] == ':') formatLastCount++;
        if (formatLastCount == numGT) {
          if ( f[start] != ':' ) {
            gtArray[gti] = f[start];
            gti++;
          }
        }

        if (formatLastCount == numGQX) {
          if ( f[start] != ':' ) {
            gqxArray[gqxi] = f[start];
            gqxi++;
          }
        }

        if (formatLastCount == numDP) {
          if ( f[start] != ':' ) {
            dpArray[dpi] = f[start];
            dpi++;
          }
        }

        if (formatLastCount == numAD) {
          if ( f[start] != ':' ) {
            adArray[adi] = f[start];
            adi++;
          }
        }

      }
    }


  }

  chromValue.append(chromArray);
  posValue.append(posArray);
  refValue.append(refArray);
  altValue.append(altArray);
  quality.append(qualityArray);
  gtValue.append(gtArray);
  gqxValue.append(gqxArray);
  dpValue.append(dpArray);
  adValue.append(adArray);

  if (gqxi < 2 || dpi < 2 || qi < 2) continue;
  if (stoi(gqxValue) < 30) continue;

  std::ofstream myfile ("myRes.txt", std::ios_base::app);
  if (myfile.is_open()) {
    myfile <<
            nl << "\t" <<
            chromValue << "-" << posValue << "-" << refValue << "-" << altValue << "\t" <<
            gtValue << "\t" <<
            gqxValue << "\t" <<
            quality << "\t" <<
            dpValue << "\t" <<
            adValue <<
            "\n";
    myfile.close();
  } else {
    std::cout << "Unable to open file" << '\n';
  }
}

}

void handle_error(const char* msg) {
perror(msg);
exit(255);
}

const char* map_file(const char* fname, size_t& length) {

int fd = open(fname, O_RDONLY);

if (fd == -1)
    handle_error("open");

struct stat sb;
if (fstat(fd, &sb) == -1)
    handle_error("fstat");
length = sb.st_size;

const char* addr = static_cast<const char*>(mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, 0u));
if (addr == MAP_FAILED)
    handle_error("mmap");

return addr;
}

Now, I know I can open GZIP file with something like:

#include <fstream>
#include <iostream>
#include <sstream>
#include <boost/iostreams/filtering_streambuf.hpp>
#include <boost/iostreams/copy.hpp>
#include <boost/iostreams/filter/gzip.hpp>

//NB: devo linkare a libreria boost zlib in comando: c++ --std=c++11 -L/opt/X11/lib -lboost_iostreams -lz gzread.cpp -o gzread

using namespace std;
using namespace boost::iostreams;

int main()
{
ifstream file("myfile.gz", ios_base::in | ios_base::binary);
filtering_streambuf<input> inbuf; //iniziallizzo filtering_streambuf inbuf
inbuf.push(gzip_decompressor()); //ci metto dentro decompressore GZIP (se file GZIP)
inbuf.push(file); //ci metto dentro file

//Convert streambuf to istream
std::istream instream(&inbuf);
//Iterate lines
std::string line;

string chr;

while(std::getline(instream, line)) {
istringstream iss(line); // string stream della linea
int i = 0;
while (getline(iss, line, t)) { // read first part up to comma, ignore the comma (il terzo arfomento di getline gli indica dove fermarsi, se assente si ferma a newline)
if (i == 2) cout << line << "n";
++i;
}
}
// copy(inbuf, cout); //copio in stdout
}

here an example of a file row:

chr1 1246301 . A C 4 OffTarget;LowGQX SNVSB=0.0;SNVHPOL=2;phyloP=1.096;CSQT=1|ACAP3|NM_030649.2|upstream_gene_variant,1|PUSL1|NM_153339.1|missense_variant,1|CPSF3L|NM_001256456.1|downstream_gene_variant GT:GQ:GQX:DP:DPF:AD:PL 0/1:3:0:1:0:0,1:37,3,0

Is there a way to combine them? Or even other approaches if they can be more "performant".

Thanks a lot for any suggestion!

cccnrc
  • 1,195
  • 11
  • 27
  • "performant" is *not* a word. Anyway, what are you trying to achieve? – Jesper Juhl Mar 21 '19 at 20:06
  • I need to retain only some part of the original rows based on the value of some columns/fields (ex. if (gqxi < 2 || dpi < 2 || qi < 2) continue; if (stoi(gqxValue) < 30) continue;) to later pass them and compare with other programs filtering of similar file but from different subjects, in order to extract similarities/differences. The other files are very big (up to 10GB .GZ) so I am trying to use fast comparison methods. – cccnrc Mar 21 '19 at 20:08
  • Why is that info not part of the question? – Jesper Juhl Mar 21 '19 at 20:10
  • completely right, edited – cccnrc Mar 21 '19 at 20:13

1 Answers1

1

You can read a memory-mapped gzip file with zlib's inflate() functions. (Read the documentation in zlib.h.)

However regardless of whether you read from the file or read from the memory map, you cannot jump around the uncompressed data. The uncompressed data must be processed sequentially, or saved sequentially for later random access processing.

Mark Adler
  • 101,978
  • 13
  • 118
  • 158
  • Thank you for your reply, I tried the library but it seems that a real time decompression-reading with buffer is faster... – cccnrc Mar 23 '19 at 03:00