
I want to write a JavaScript routine that works like Java's GZIPInputStream, to read a very large gzip file on the client side without using Node.js.

This is my code (not working yet), inspired by this:

function ab2string(buf) {
   var str = "";
   var ab = new Uint16Array(buf);
   var abLen = ab.length;
   var CHUNK_SIZE = Math.pow(2, 16);
   var offset, len, subab;
   for (offset = 0; offset < abLen; offset += CHUNK_SIZE) {
      len = Math.min(CHUNK_SIZE, abLen-offset);
      subab = ab.subarray(offset, offset+len);
      str += String.fromCharCode.apply(null, subab);
   }
   return str;
}
function string2ab(str) {
  var buf = new ArrayBuffer(str.length*2); // 2 bytes for each char
  var bufView = new Uint16Array(buf);
  for (var i=0, strLen=str.length; i<strLen; i++) {
    bufView[i] = str.charCodeAt(i);
  }
  return buf;
}
function FileGzipStreamer() {
    var loopholeReader = new FileReader();
    var chunkReader = new FileReader(); 
    var delimiter = "\n".charCodeAt(0); 

    var expectedChunkSize = 500000; // Slice size to read
    var loopholeSize = 500;         // Slice size to search for line end

    var file = null;
    var fileSize;
    var loopholeStart;
    var loopholeEnd;
    var chunkStart;
    var chunkEnd;
    var allString;
    var lines;
    var thisForClosure = this;
    var handler;
    var fulltext=[];
    var fulltext2=[];
    var fextra=false;
    var fname=false;
    var fcomment=false;
    var fhcrc=false;
    var counter=0;
    var counter2=0;
    var binString=[];


    // Reading of loophole ended
    loopholeReader.onloadend = function(evt) {
        // Read error
        if (evt.target.readyState != FileReader.DONE) {
            handler(null, new Error("Not able to read loophole (start: )"));
            return;
        }
        binString=[];
        binString=evt.target.result.split('').map(function(e){return e.charCodeAt(0);});
        fulltext=fulltext.concat(binString);
        var len=fulltext.length;
        $("#conclusion").append("\n"+"Length="+len+"\n");
        var start=0;
        if (fulltext[0]==31 && fulltext[1]==139) {
            if (fulltext[2]==8) {
                start=10;
                // RFC 1952 FLG byte: bit 1 = FHCRC, bit 2 = FEXTRA, bit 3 = FNAME, bit 4 = FCOMMENT; bits 5-7 are reserved
                if ((fulltext[3]&224)==0) {
                    if ((fulltext[3]&4)==4) {
                        fextra=true;
                    }
                    if ((fulltext[3]&8)==8) {
                        fname=true;
                    }
                    if ((fulltext[3]&16)==16) {
                        fcomment=true;
                    }
                    if ((fulltext[3]&2)==2) {
                        fhcrc=true;
                    }
                }
                else {
                    $("#conclusion").append("Gzip file is invalid");
                }
                if (fextra==true) {
                    var incrementor=fulltext[start]+256*fulltext[start+1];
                    start+=incrementor+2; // 2 for xlen
                }
                if (fname==true) {
                    start+=1;
                    while(fulltext[start-1]!=0)
                        start+=1;
                }
                if (fcomment==true) {
                    start+=1;
                    while(fulltext[start-1]!=0)
                        start+=1;
                }
                if (fhcrc==true) {
                    start+=2;
                }
                var uncompressed=zip_inflate(ab2string(fulltext.slice(start,len)));
                var splitline=uncompressed.split("\n");
                //$("#conclusion").append(splitline.length+"\n");
                var temp=counter;
                $("#conclusion").append("\n"+"Counter="+counter+", Splitlinelength="+splitline.length+"\n");
                var uncompressed2="";
                //var test=Math.random();
                //$("#conclusion").append(uncompressed);
                for (var i=temp;i<splitline.length-5; i++) {
                    counter+=1;
                    uncompressed2+=splitline[i]+"\n";
                    //if (splitline[i].indexOf("\n")!=-1)
                    //$("#conclusion").append(i+"start"+splitline[i]+"end\n");
                    $("#conclusion").append(splitline[i]);
                    $("#conclusion").append("\n");
                }
                var view = new DataView(string2ab(uncompressed2));
                var realLoopholeSize = loopholeEnd - loopholeStart;
                //$("#conclusion").append("1"+uncompressed+"\n\n\n");
                //$("#conclusion").append(realLoopholeSize+'--'+fulltext.length+'x');
                for(var i = realLoopholeSize - 1; i >= 0; i--) {
                    if (view.getInt8(i) == delimiter) {
                        chunkEnd = loopholeStart + i + 1;
                        var blob = file.slice(chunkStart, chunkEnd);
                        $("#conclusion").append(chunkStart+'xxz'+chunkEnd+'y');
                        chunkReader.readAsBinaryString(blob);
                        return;
                    }
                }

                // No delimiter found, looking in the next loophole
                $("#conclusion").append("test");
                loopholeStart = loopholeEnd;
                loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);
                thisForClosure.getNextLine();
                //$("#conclusion").append(zip_inflate(String.fromCharCode.apply(null,fulltext.slice(start,len))));
            }
            else {
                $("#conclusion").append("Unknown compression method!");
            }
        }
        else{
            $("#conclusion").append("Not a gzipped file!");
        }
        //$("#conclusion").append(zip_inflate(String.fromCharCode.apply(null,fulltext)));
        //fulltext=fulltext.concat(arr2);
        //var theText=zip_inflate(String.fromCharCode.apply(null,fulltext.slice(start,len)));
        //$("#conclusion").append("yy"+loopholeEnd+'--'+loopholeStart);
        // No delimiter found, looking in the next loophole
        //loopholeStart = loopholeEnd;
        //loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);

        //thisForClosure.getNextLine();
    };

    // Reading of chunk ended
    chunkReader.onloadend = function(evt) {
        // Read error
        if (evt.target.readyState != FileReader.DONE) {
            handler(null, new Error("Not able to read loophole"));
            return;
        }
        var binString2=evt.target.result.split('').map(function(e){return e.charCodeAt(0);});
        $("#conclusion").append("text2="+binString+"\n");
        fulltext2=fulltext2.concat(binString2);
        var len2=fulltext2.length;
        var start2=0;
        if (fulltext2[0]==31 && fulltext2[1]==139) {
            if (fulltext2[2]==8) {
                start2=10;
                // Same RFC 1952 flag bits as above
                if ((fulltext2[3]&224)==0) {
                    if ((fulltext2[3]&4)==4) {
                        fextra=true;
                    }
                    if ((fulltext2[3]&8)==8) {
                        fname=true;
                    }
                    if ((fulltext2[3]&16)==16) {
                        fcomment=true;
                    }
                    if ((fulltext2[3]&2)==2) {
                        fhcrc=true;
                    }
                }
                else {
                    $("#conclusion").append("Gzip file is invalid");
                }
                if (fextra==true) {
                    var incrementor=fulltext2[start2]+256*fulltext2[start2+1];
                    start2+=incrementor+2; // 2 for xlen
                }
                if (fname==true) {
                    start2+=1;
                    while(fulltext2[start2-1]!=0)
                        start2+=1;
                }
                if (fcomment==true) {
                    start2+=1;
                    while(fulltext2[start2-1]!=0)
                        start2+=1;
                }
                if (fhcrc==true) {
                    start2+=2;
                }
            }
        }
        //$("#conclusion").append(zip_inflate(String.fromCharCode.apply(null,binString)));
        //binString=binString.concat(arr2);
        var theText=zip_inflate(ab2string(fulltext2.slice(start2,len2)));
        //var temp=counter;
        //var splitline2=theText.split(/\r?\n/);
        //var uncompressed3="";
        //var test=Math.random();
        //for (var i=0;i<splitline2.length; i++) {
            //uncompressed3+=splitline2[i]+"\n";
            //$("#conclusion").append(splitline2[i]);
        //}

        //$("#conclusion").append("3"+theText+"\n\n\n");
        lines = theText.split(/\r?\n/);
        // Remove the trailing empty line at the end of the chunk
        if (lines.length > 0 && lines[lines.length - 1] == "") {
            lines.pop();
        }
        var temp=0;
        for (var i=temp;i<lines.length; i++) {
            //counter+=1;
            //uncompressed2+=splitline[i]+"\n";
            //if (splitline[i].indexOf("\n")!=-1)
            //$("#conclusion").append(i+"start"+splitline[i]+"end\n");
            $("#conclusion").append(lines[i]);
            $("#conclusion").append("\n");
        }
        chunkStart = chunkEnd;
        chunkEnd = Math.min(chunkStart, fileSize);
        loopholeStart = Math.min(chunkEnd, fileSize);
        loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize); 
        thisForClosure.getNextLine();
    };


    // Public: open file for reading
    this.open = function (fileToOpen, linesProcessed) {
        file = fileToOpen;
        fileSize = file.size;
        loopholeStart = 0;
        loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);
        chunkStart = 0;
        chunkEnd = 0;
        lines = null;
        handler = linesProcessed;
    };

    // Public: start getting new line async
    this.getNextLine = function() {
        // File wasn't open
        if (file == null) {     
            handler(null, new Error("You must open a file first"));
            return;
        }
        // Some lines available
        if (lines != null) {
            var linesForClosure = lines;
            setTimeout(function() { handler(linesForClosure, null) }, 0);
            lines = null;
            return;
        }
        // End of File
        if (chunkStart == fileSize) {
            handler(null, null);
            return;
        }
        // File part bigger than expectedChunkSize is left
        if (loopholeStart < fileSize) {
            var blob = file.slice(loopholeStart, loopholeEnd);
            loopholeReader.readAsBinaryString(blob);
        }
        // All file can be read at once
        else {
            chunkEnd = fileSize;
            var blob = file.slice(chunkStart, fileSize);
            chunkReader.readAsBinaryString(blob);
        }
    };
};

The algorithm here looks simple: skip the gzip header, then call an inflate() routine on the compressed blocks. But since the gzip file is very large (tens or hundreds of GB), I need to inflate the compressed blocks piece by piece.
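
For reference, the header-skipping step is defined by RFC 1952; a standalone sketch of it (the function name gzipDataOffset is just for illustration, and the flag bits are taken from the RFC) looks roughly like this:

// Sketch: compute the offset of the deflate data inside a gzip member (RFC 1952).
// `bytes` is a plain array or Uint8Array holding the start of the file.
function gzipDataOffset(bytes) {
    if (bytes[0] != 31 || bytes[1] != 139) throw new Error("Not a gzip file");
    if (bytes[2] != 8) throw new Error("Unknown compression method");
    var flg = bytes[3];
    var offset = 10;                 // fixed-size part of the header
    if (flg & 4) {                   // FEXTRA: 2-byte little-endian length, then the extra field
        var xlen = bytes[offset] + 256 * bytes[offset + 1];
        offset += 2 + xlen;
    }
    if (flg & 8) {                   // FNAME: zero-terminated original file name
        while (bytes[offset++] != 0) {}
    }
    if (flg & 16) {                  // FCOMMENT: zero-terminated comment
        while (bytes[offset++] != 0) {}
    }
    if (flg & 2) {                   // FHCRC: 2-byte header CRC
        offset += 2;
    }
    return offset;                   // the deflate stream starts here
}

Everything from that offset up to the last 8 bytes of the member (CRC32 and ISIZE) is the raw deflate stream; the hard part, as stated above, is feeding it to an inflater incrementally instead of all at once.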

Is there any way to partition the compressed blocks and inflate them on the fly, like Java's GZIPInputStream, in JavaScript without using Node.js?

dhany1024
  • Trying to read a file several hundred GB in size in a single threaded environment like JS seems like madness to me. If you're running this in the browser, it will lock up for possibly _hours_ on most machines, I would imagine. And some browsers, like Firefox, will pop up an "unresponsive script" warning after not very long. – GregL Oct 21 '14 at 23:06
  • That's why I use WebWorker. And in my case, the gzip reading stops when a condition fulfilled (by terminating the WebWorker). It worked on reading uncompressed file like [this](http://stackoverflow.com/questions/24647563/reading-line-by-line-file-in-javascript-on-client-side). The question now is how to decode/inflate gzip part by part? – dhany1024 Oct 21 '14 at 23:13
  • I think you might find use with streams. That's what they are for. Streams have been ported to the browser through the browserify project: https://github.com/substack/stream-browserify – Sukima Oct 21 '14 at 23:16
  • Is there any "non Node.js" javascript solution? – dhany1024 Oct 21 '14 at 23:20

1 Answer


In Node, we can create a readable stream from a file (fs.createReadStream()) and pipe it to zlib.createGunzip(). The readable stream reads the data chunk by chunk and passes it through to the gunzip sink. Hence, if we feed a gzipped file into this setup, we get the extracted data chunk by chunk.
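
For comparison, a minimal Node-only sketch of that pipeline (the file name big.txt.gz is just a placeholder):

var fs = require('fs');
var zlib = require('zlib');

fs.createReadStream('big.txt.gz')
  .pipe(zlib.createGunzip())
  .on('data', function(chunk) {
    // each chunk is a Buffer of decompressed bytes
    console.log(chunk.toString());
  })
  .on('end', function() {
    console.log('done');
  });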

With browserify's help, we can do the same thing inside the browser.

e.g. using this main.js file:

// browserify automatically replaces the node's native zlib with this:
// https://www.npmjs.com/package/browserify-zlib
var zlib = require('zlib');

var drop = require('drag-and-drop-files');
var createReadStream = require('filereader-stream');

var gunzip = zlib.createGunzip();

drop(document.getElementById('drop'), function(files) {
  var first = files[0];
  createReadStream(first).pipe(gunzip);

  gunzip.on('data', function(data){
    // read the data chunk-by-chunk
    console.log(data.toString());
  });

  gunzip.on('end', function(){
    console.log('done');
  });
});

To make it work inside the browser, we should put the browserify charm on it.

$ browserify main.js > bundle.js

And then, we can load up an index.html that includes bundle.js (don't forget the drop zone).
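
For example, a bare-bones index.html could look like this (the id "drop" matches what the main.js example expects; everything else is a placeholder):

<!DOCTYPE html>
<html>
  <body>
    <!-- the drop zone that main.js listens on -->
    <div id="drop" style="width: 300px; height: 150px; border: 2px dashed #999;">
      Drop a .gz file here
    </div>
    <script src="bundle.js"></script>
  </body>
</html>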

I quickly put together a proof of concept (the streaming part; we may need to play with the Web Worker API to cope with a very big file) in this repo.
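
The repo itself isn't reproduced here, but one possible shape for the Web Worker part (the file names worker.js and worker-bundle.js are hypothetical, and this assumes filereader-stream also works inside a worker, which I haven't verified) is to post the dropped File to a worker and stream the decompressed text back:

// main.js (browserified): hand the dropped File to a worker
// instead of decompressing on the UI thread
var drop = require('drag-and-drop-files');

var worker = new Worker('worker-bundle.js'); // hypothetical bundle of worker.js

drop(document.getElementById('drop'), function(files) {
  worker.postMessage(files[0]); // File objects can be posted to a worker
});

worker.onmessage = function(e) {
  console.log(e.data); // one decompressed chunk of text, or null at the end
};

// worker.js (bundle it separately: browserify worker.js > worker-bundle.js)
var zlib = require('zlib');
var createReadStream = require('filereader-stream');

self.onmessage = function(e) {
  var gunzip = zlib.createGunzip();
  createReadStream(e.data).pipe(gunzip);

  gunzip.on('data', function(chunk) {
    self.postMessage(chunk.toString());
  });
  gunzip.on('end', function() {
    self.postMessage(null); // signal the end of the stream
  });
};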


diorahman
  • Is there a simple way without all the fancy stuff? I need something like `import blah; new GZIPInputStream(blah blah);` in JavaScript. – Jus12 Jul 07 '16 at 06:36