I want to write a JavaScript routine that works like Java's GzipInputStream, to read a very large gzip file on the client side without using Node.js.
This is my code (not working yet), inspired by the code here:
function ab2string(buf) {
    // Convert a buffer (or plain array) of 16-bit values into a string,
    // chunked to stay under the argument limit of Function.prototype.apply
    var str = "";
    var ab = new Uint16Array(buf);
    var abLen = ab.length;
    var CHUNK_SIZE = Math.pow(2, 16);
    var offset, len, subab;
    for (offset = 0; offset < abLen; offset += CHUNK_SIZE) {
        len = Math.min(CHUNK_SIZE, abLen - offset);
        subab = ab.subarray(offset, offset + len);
        str += String.fromCharCode.apply(null, subab);
    }
    return str;
}
function string2ab(str) {
    // Convert a string into an ArrayBuffer, 2 bytes per character
    var buf = new ArrayBuffer(str.length * 2);
    var bufView = new Uint16Array(buf);
    for (var i = 0, strLen = str.length; i < strLen; i++) {
        bufView[i] = str.charCodeAt(i);
    }
    return buf;
}
function FileGzipStreamer() {
    var loopholeReader = new FileReader();
    var chunkReader = new FileReader();
    var delimiter = "\n".charCodeAt(0);
    var expectedChunkSize = 500000; // Slice size to read
    var loopholeSize = 500;         // Slice size to search for line end
    var file = null;
    var fileSize;
    var loopholeStart;
    var loopholeEnd;
    var chunkStart;
    var chunkEnd;
    var allString;
    var lines;
    var thisForClosure = this;
    var handler;
    var fulltext = [];
    var fulltext2 = [];
    // gzip header flags (RFC 1952)
    var fextra = false;   // FEXTRA: extra field present
    var fname = false;    // FNAME: original file name present
    var fcomment = false; // FCOMMENT: comment present
    var fhcrc = false;    // FHCRC: header CRC present
    var counter = 0;
    var counter2 = 0;
    var binString = [];
    // Reading of loophole ended
    loopholeReader.onloadend = function(evt) {
        // Read error
        if (evt.target.readyState != FileReader.DONE) {
            handler(null, new Error("Not able to read loophole (start: " + loopholeStart + ")"));
            return;
        }
        // Turn the binary string into an array of byte values
        binString = evt.target.result.split('').map(function(e) { return e.charCodeAt(0); });
        fulltext = fulltext.concat(binString);
        var len = fulltext.length;
        $("#conclusion").append("\n" + "Length=" + len + "\n");
        var start = 0;
        // Check the gzip magic bytes 0x1f 0x8b and the compression method
        if (fulltext[0] == 31 && fulltext[1] == 139) {
            if (fulltext[2] == 8) { // CM 8 = deflate
                start = 10; // fixed-size part of the gzip header
                var flg = fulltext[3];
                if ((flg & 0xE0) == 0) { // reserved flag bits must be zero
                    fextra = (flg & 4) != 0;    // FEXTRA
                    fname = (flg & 8) != 0;     // FNAME
                    fcomment = (flg & 16) != 0; // FCOMMENT
                    fhcrc = (flg & 2) != 0;     // FHCRC
                }
                else {
                    $("#conclusion").append("Gzip file is invalid");
                }
                if (fextra) {
                    // XLEN is a little-endian 16-bit length, followed by the field itself
                    var xlen = fulltext[start] + 256 * fulltext[start + 1];
                    start += xlen + 2; // 2 for XLEN
                }
                if (fname) {
                    // skip the zero-terminated original file name
                    start += 1;
                    while (fulltext[start - 1] != 0)
                        start += 1;
                }
                if (fcomment) {
                    // skip the zero-terminated comment
                    start += 1;
                    while (fulltext[start - 1] != 0)
                        start += 1;
                }
                if (fhcrc) {
                    start += 2; // 16-bit header CRC
                }
                // Inflate everything after the parsed header
                var uncompressed = zip_inflate(ab2string(fulltext.slice(start, len)));
                var splitline = uncompressed.split("\n");
                var temp = counter;
                $("#conclusion").append("\n" + "Counter=" + counter + ", Splitlinelength=" + splitline.length + "\n");
                var uncompressed2 = "";
                for (var i = temp; i < splitline.length - 5; i++) {
                    counter += 1;
                    uncompressed2 += splitline[i] + "\n";
                    $("#conclusion").append(splitline[i]);
                    $("#conclusion").append("\n");
                }
                var view = new DataView(string2ab(uncompressed2));
                var realLoopholeSize = loopholeEnd - loopholeStart;
                // Search backwards for the last line delimiter in this slice
                for (var i = realLoopholeSize - 1; i >= 0; i--) {
                    if (view.getInt8(i) == delimiter) {
                        chunkEnd = loopholeStart + i + 1;
                        var blob = file.slice(chunkStart, chunkEnd);
                        chunkReader.readAsBinaryString(blob);
                        return;
                    }
                }
                // No delimiter found, looking in the next loophole
                loopholeStart = loopholeEnd;
                loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);
                thisForClosure.getNextLine();
            }
            else {
                $("#conclusion").append("Unknown compression method!");
            }
        }
        else {
            $("#conclusion").append("Not a gzipped file!");
        }
    };
    // Reading of chunk ended
    chunkReader.onloadend = function(evt) {
        // Read error
        if (evt.target.readyState != FileReader.DONE) {
            handler(null, new Error("Not able to read chunk"));
            return;
        }
        var binString2 = evt.target.result.split('').map(function(e) { return e.charCodeAt(0); });
        $("#conclusion").append("text2=" + binString2 + "\n");
        fulltext2 = fulltext2.concat(binString2);
        var len2 = fulltext2.length;
        var start2 = 0;
        // Same header check as above, for the chunk buffer
        if (fulltext2[0] == 31 && fulltext2[1] == 139) {
            if (fulltext2[2] == 8) {
                start2 = 10;
                var flg2 = fulltext2[3];
                if ((flg2 & 0xE0) == 0) { // reserved flag bits must be zero
                    fextra = (flg2 & 4) != 0;    // FEXTRA
                    fname = (flg2 & 8) != 0;     // FNAME
                    fcomment = (flg2 & 16) != 0; // FCOMMENT
                    fhcrc = (flg2 & 2) != 0;     // FHCRC
                }
                else {
                    $("#conclusion").append("Gzip file is invalid");
                }
                if (fextra) {
                    var xlen2 = fulltext2[start2] + 256 * fulltext2[start2 + 1];
                    start2 += xlen2 + 2; // 2 for XLEN
                }
                if (fname) {
                    start2 += 1;
                    while (fulltext2[start2 - 1] != 0)
                        start2 += 1;
                }
                if (fcomment) {
                    start2 += 1;
                    while (fulltext2[start2 - 1] != 0)
                        start2 += 1;
                }
                if (fhcrc) {
                    start2 += 2;
                }
            }
        }
//$("#conclusion").append(zip_inflate(String.fromCharCode.apply(null,binString)));
//binString=binString.concat(arr2);
var theText=zip_inflate(ab2string(fulltext2.slice(start2,len2)));
//var temp=counter;
//var splitline2=theText.split(/\r?\n/);
//var uncompressed3="";
//var test=Math.random();
//for (var i=0;i<splitline2.length; i++) {
//uncompressed3+=splitline2[i]+"\n";
//$("#conclusion").append(splitline2[i]);
//}
//$("#conclusion").append("3"+theText+"\n\n\n");
// Remove last new line in the end of chunk
if (lines.length > 0 && lines[lines.length - 1] == "") {
lines.pop();
}
var temp=0;
var lines = theText.split(/\r?\n/);
for (var i=temp;i<lines.length; i++) {
//counter+=1;
//uncompressed2+=splitline[i]+"\n";
//if (splitline[i].indexOf("\n")!=-1)
//$("#conclusion").append(i+"start"+splitline[i]+"end\n");
$("#conclusion").append(lines[i]);
$("#conclusion").append("\n");
}
chunkStart = chunkEnd;
chunkEnd = Math.min(chunkStart, fileSize);
loopholeStart = Math.min(chunkEnd, fileSize);
loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);
thisForClosure.getNextLine();
};
    // Public: open file for reading
    this.open = function(fileToOpen, linesProcessed) {
        file = fileToOpen;
        fileSize = file.size;
        loopholeStart = 0;
        loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);
        chunkStart = 0;
        chunkEnd = 0;
        lines = null;
        handler = linesProcessed;
    };
    // Public: start getting new line async
    this.getNextLine = function() {
        // File wasn't open
        if (file == null) {
            handler(null, new Error("You must open a file first"));
            return;
        }
        // Some lines available
        if (lines != null) {
            var linesForClosure = lines;
            setTimeout(function() { handler(linesForClosure, null); }, 0);
            lines = null;
            return;
        }
        // End of File
        if (chunkStart == fileSize) {
            handler(null, null);
            return;
        }
        // Some file part is left to scan for a line end
        if (loopholeStart < fileSize) {
            var blob = file.slice(loopholeStart, loopholeEnd);
            loopholeReader.readAsBinaryString(blob);
        }
        // All file can be read at once
        else {
            chunkEnd = fileSize;
            var blob = file.slice(chunkStart, fileSize);
            chunkReader.readAsBinaryString(blob);
        }
    };
};
The algorithm looks simple: skip the gzip header and call the inflate() routine on the compressed blocks. But since the gzip file is very large (tens or hundreds of GB), I need to inflate the compressed data piece by piece rather than hold it all in memory.
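To make the goal concrete, this is roughly the shape of what I am after. It is only an untested sketch, assuming the pako library and its chunked Inflate.push() API instead of the zip_inflate() helper above; streamGzipLines, onLine, onDone and CHUNK_SIZE are my own placeholder names:

// Untested sketch: inflate a gzipped File slice by slice with pako,
// emitting complete lines as they become available.
var CHUNK_SIZE = 512 * 1024; // how much compressed data to read per slice

function streamGzipLines(file, onLine, onDone) {
    var inflator = new pako.Inflate({ to: 'string' });
    var tail = ''; // partial line carried over between inflated chunks

    // pako calls onData with each piece of inflated output
    inflator.onData = function(text) {
        var parts = (tail + text).split('\n');
        tail = parts.pop(); // the last element may be an incomplete line
        parts.forEach(onLine);
    };

    var offset = 0;
    var reader = new FileReader();
    reader.onload = function() {
        var isLast = offset + CHUNK_SIZE >= file.size;
        // push() consumes raw gzip bytes, header included,
        // so no manual header parsing is needed
        inflator.push(new Uint8Array(reader.result), isLast);
        offset += CHUNK_SIZE;
        if (!isLast) {
            readNext();
        } else {
            if (tail) onLine(tail);
            onDone(inflator.err);
        }
    };
    function readNext() {
        reader.readAsArrayBuffer(file.slice(offset, offset + CHUNK_SIZE));
    }
    readNext();
}

The point is that only one compressed slice and the current inflated chunk live in memory at any time, which is what GzipInputStream gives me in Java.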
Is there any way to partition the compressed blocks and inflate them on the fly, like Java's GzipInputStream, in JavaScript without using Node.js?
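(For completeness: I imagine the same thing could be written with the newer Streams API, as in the sketch below, but that assumes DecompressionStream('gzip') and Blob.stream() are available in the browser, which I cannot rely on.)

// Sketch using built-in streams, assuming browser support
async function streamGzipText(file, onChunk) {
    var reader = file.stream()
        .pipeThrough(new DecompressionStream('gzip'))
        .pipeThrough(new TextDecoderStream())
        .getReader();
    for (;;) {
        var r = await reader.read();
        if (r.done) break;
        onChunk(r.value); // decompressed text, delivered incrementally
    }
}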