27

I have a 3 terabyte .gz file and want to read its uncompressed content line-by-line in a C++ program. As the file is huge, I want to avoid loading it completely into memory.

Can anyone post a simple example of doing it?

Philipp Claßen
  • 41,306
  • 31
  • 146
  • 239
Shihab
  • 271
  • 1
  • 3
  • 3

7 Answers

16

You will most probably have to use zlib's inflate (the decompression counterpart of deflate); an example is available on their site.

Alternatively, you may have a look at the Boost C++ wrapper.

The example from BOOST page (decompresses data from a file and writes it to standard output)

#include <fstream>
#include <iostream>
#include <boost/iostreams/filtering_streambuf.hpp>
#include <boost/iostreams/copy.hpp>
#include <boost/iostreams/filter/zlib.hpp>

int main() 
{
    using namespace std;

    ifstream file("hello.z", ios_base::in | ios_base::binary);
    filtering_streambuf<input> in;
    in.push(zlib_decompressor());
    in.push(file);
    boost::iostreams::copy(in, cout);
}
bobah
  • 18,364
  • 2
  • 37
  • 70
  • 3
    Here there is a better example of how to read line-by-line using boost: https://techoverflow.net/2013/11/03/c-iterating-lines-in-a-gz-file-using-boostiostreams/ – Lluís Padró Jun 13 '18 at 09:20
15

For something that is going to be used regularly, you probably want to use one of the previous suggestions. Alternatively, you can do

gzcat file.gz | yourprogram

and have yourprogram read from cin. This will decompress parts of the file in memory as it is needed, and send the uncompressed output to yourprogram.

KeithB
  • 16,577
  • 3
  • 41
  • 45
5

Using zlib, I'm doing something along these lines:

// Reads one line from the gzip stream `f` and returns it in a
// std::vector< char > without the trailing "\n" or "\r\n".
//
// The buffer starts at 256 bytes and is doubled whenever gzgets() fills it
// without reaching a newline, so lines of arbitrary length are supported
// (up to the int length that gzgets() accepts per call).
//
// An empty vector is returned both for an empty line and at end-of-file or
// on error; use gzeof()/gzerror() on `f` to tell these apart.
std::vector< char > readline( gzFile f ) {
    typedef std::vector< char >::size_type size_type;

    std::vector< char > v( 256 );
    size_type pos = 0;  // number of bytes of the current line already in v
    for ( ;; ) {
        if ( gzgets( f, &v[ pos ], static_cast< int >( v.size() - pos ) ) == 0 ) {
            // end-of-file or error
            int err;
            const char *msg = gzerror( f, &err );
            if ( err != Z_OK ) {
                // handle error
            }
            (void)msg;  // only needed once error reporting is filled in above
            break;
        }
        const size_type read = strlen( &v[ pos ] );
        const size_type end = pos + read;
        // Guard with read > 0: without it, read == 0 at pos == 0 would index
        // v[ -1 ], which wraps to a huge unsigned index (out-of-bounds read).
        if ( read > 0 && v[ end - 1 ] == '\n' ) {
            // Strip "\n", or "\r\n" when a carriage return precedes it.
            pos = ( end >= 2 && v[ end - 2 ] == '\r' ) ? end - 2 : end - 1;
            break;
        }
        if ( read == 0 || end < v.size() - 1 ) {
            // Buffer was not filled and no newline was seen: last line of
            // the stream without a terminating newline.
            pos = end;
            break;
        }
        // Buffer is full and the line continues: the next gzgets() call
        // overwrites the terminating NUL at v.size() - 1.
        pos = v.size() - 1;
        v.resize( v.size() * 2 );
    }
    v.resize( pos );
    return v;
}

EDIT: Removed two mis-copied * in the example above. EDIT: Corrected out of bounds read on v[pos + read - 2]

pawel_j
  • 419
  • 4
  • 14
mkluwe
  • 3,823
  • 2
  • 28
  • 45
  • 1
    Hmm, this has been downvoted at least twice. It's a working example of how to use "basic" zlib IMHO, so please consider commenting when voting this down to make an improvement possible. – mkluwe Jan 02 '14 at 11:26
  • 1
    This had a major bug on line with v[pos+read-2] when line was only one character '\n' (as empty line in linux) you have a beyond array read (-1 is converted to a large unsigned memory address). I've added a pre-check. – pawel_j Nov 28 '18 at 14:50
2

Here is some code with which you can read normal and zipped files line by line:

// Example driver: prints every non-empty line of `file`, which may be a plain
// file or a gzip file (detected by its ".gz" suffix).
char line[0x10000];
FILE *infile=open_file(file);
bool gzipped=endsWith(file, ".gz");
if(gzipped) 
    init_gzip_stream(infile,&line[0]);
while (readLine(infile,line,gzipped)) {
    if(line[0]==0)continue;// skip empty results (e.g. a gzip block boundary)
    // The line comes from the file, so it must never be used as the format
    // string itself: a '%' in the data would be interpreted as a conversion.
    printf("%s", line);
}


#include <zlib.h>
#define CHUNK 0x100
#define OUT_CHUNK CHUNK*100
unsigned char gzip_in[CHUNK];
unsigned char gzip_out[OUT_CHUNK];
///* These are parameters to inflateInit2. See http://zlib.net/manual.html for the exact meanings. */
#define windowBits 15
#define ENABLE_ZLIB_GZIP 32
z_stream strm = {0};
z_stream init_gzip_stream(FILE* file,char* out){// unsigned     
        strm.zalloc = Z_NULL;
        strm.zfree = Z_NULL;
        strm.opaque = Z_NULL;
        strm.next_in = gzip_in;
        strm.avail_in = 0;
        strm.next_out = gzip_out;
        inflateInit2 (& strm, windowBits | ENABLE_ZLIB_GZIP);
    return strm;
}

bool inflate_gzip(FILE* file, z_stream strm,size_t bytes_read){
            strm.avail_in = (int)bytes_read;
            do {
                strm.avail_out = OUT_CHUNK;
                inflate (& strm, Z_NO_FLUSH);
//              printf ("%s",gzip_out);
            }while (strm.avail_out == 0);
            if (feof (file)) {
                inflateEnd (& strm);
                return false;
            }
    return true;// all OK
}


char* first_line=(char*)&gzip_out[0];
char* current_line=first_line;
char* next_line=first_line;
char hangover[1000];
bool readLine(FILE* infile,char* line,bool gzipped){
    if(!gzipped)
        return fgets(line, sizeof(line), infile) != NULL;
    else{
        bool ok=true;
        current_line=next_line;
        if(!current_line || strlen(current_line)==0 || next_line-current_line>OUT_CHUNK){
            current_line=first_line;
            size_t bytes_read = fread (gzip_in, sizeof (char), CHUNK, infile);
            ok=inflate_gzip(infile,strm,bytes_read);
            strcpy(line,hangover);
        }
        if(ok){
            next_line=strstr(current_line,"\n");
            if(next_line){
                next_line[0]=0;
                next_line++;
                strcpy(line+strlen(hangover),current_line);
                hangover[0]=0;
            }else{
                strcpy(hangover,current_line);
                line[0]=0;// skip that one!!
            }
        }
        return ok;
    }
}
Anona112
  • 3,724
  • 4
  • 21
  • 30
2

The zlib library supports decompressing files in memory in blocks, so you don't have to decompress the entire file in order to process it.

Amnon
  • 7,652
  • 2
  • 26
  • 34
1

You can't do that, because *.gz doesn't have "lines".

If compressed data has newlines, you'll have to decompress it. You don't have to decompress all data at once, you know, you can do it in chunks, and send strings back to main program when you encounter newline characters. *.gz can be decompressed using zlib.

SigTerm
  • 26,089
  • 6
  • 66
  • 115
-1

Chilkat (http://www.chilkatsoft.com/) has libraries to read compressed files from a C++, .Net, VB, ... application.

Patrick
  • 23,217
  • 12
  • 67
  • 130