
I am working on a web app (pure HTML/JavaScript, no libraries) that does byte-level processing of a file (a Huffman encoding demo). It works beautifully (you do NOT want to know how long it took to get there), but my sense of completion is bothered just a bit by the fact that I have to load the files to and from an ArrayBuffer instead of streaming from the disk. There's also a filesize limitation, although it would admittedly take quite a long time to compress a 4GB file (the maximum that my data structures support).

Still, in the interest of making this app work on low-resource devices, how might I stream a file in from a file input box (I need multiple passes, for the frequency counting, filesize detection, and the actual write) and out to a browser download of some sort (that one is a single pass at least, thankfully)?

Here are the relevant functions that handle it right now (I apologize for the globals :P):

//Load the file
  function startProcessingFile(){ //Loads the file and sets up a callback to start the main process when done.
    var ff=document.getElementById("file");//I am assuming that you don't need to see the HTML here. :D
    if (ff.files.length === 0) {
      displayError("No file selected");
    }
    else{
      displayStatus("Loading File...");
      var fr = new FileReader();
      fr.onload=function () {inp = new DataView(fr.result); boot();};
      fr.onerror=function () {displayError(fr.error);};
      fr.readAsArrayBuffer(ff.files[0]);
    }
  }

//A bit later on -- one of the functions that reads the data from the input file
function countTypes(c){ //counts the frequencies. c is # bytes processed.
  if (die){
    die=false;
    return;
  }
  var i=Math.ceil(inputSize/100.0);
  while (c<inputSize && i>0){
    var d=inp.getUint8(c);
    frequencies[d]=frequencies[d]+1;
    i--;
    c++;//Accidental, but funny.
  }
  var perc=100.0*c/inputSize;
  updateProgress(perc);
  if (c<inputSize){
    setTimeout(function () {countTypes(c);}, 0);
  }
  else{
    updateProgress(100);
    system_state++;
    taskHandle();
  }
}

//Here's where the file is read the last time, and also where the bits come from that I want to save.
//If I could stream the data directly, I could probably even get rid of the dry-run stage I currently
//need to count how many bytes to allocate for the output ArrayBuffer. I mean, Google Drive can download
//files without telling the browser the size, just whether it's done yet or not, so I'd assume that's a
//feature I could access here too. I'm just not sure how you actually gain access to a download from JS
//in the first place.
function encode(c,d){ //performs the Huffman encoding. 
//If d is true, does not actually write. c is # of bits processed so far.
  if (die){
    die=false;
    return;
  }
  var i=Math.ceil(inputSize/250.0);
  while (c<inputSize && i>0){
    var b=inp.getUint8(c);
    var seq;
    for (var j=0; j<table.length; j++){
      if (table[j].value===b){
        seq=table[j].code
      }
    }
    for (var j=0; j<seq.length; j++){
      writeBit(seq[j],d);
    }
    i--;
    c++;//Accidental, but funny.
  }
  var perc=100.0*c/inputSize;
  updateProgress(perc);
  if (c<inputSize){
    setTimeout(function () {encode(c,d);}, 0);
  }
  else{
    updateProgress(100);
    system_state++;
    taskHandle();
  }
}

//Finally, bit-level access for unaligned read/write so I can actually take advantage of the variable word size of the Huffman encoding (the read is used for decoding).
function readBit(){ //reads one bit (b) from the ArrayBuffer/DataView. The offset of 4 is for the filesize int.
  var data_byte=inp.getUint8(byte_index+4);
  var res=data_byte>>>bit_index;
  bit_index+=1;
  if (bit_index>7){
    bit_index=0;
    byte_index++;
  }
  return (res&1);
}

function writeBit(b,d){ //writes one bit (b) to the output ArrayBuffer/DataView. If d is true, does not actually write.
  if (d===false){ //i.e. not dry-run mode
    var bitmask=0xff;
    var flag=1<<bit_index;
    bitmask=bitmask^flag;
    current_byte=current_byte&bitmask;
    current_byte=current_byte|(b<<bit_index);
    output.setUint8(byte_index+4, current_byte);
  }
  bit_index+=1;
  if (bit_index>7){
    bit_index=0;
    byte_index++;
  }
}

function readByte(){ //reads a byte using readBit. Unaligned.
  var b=0;
  for (var i=0; i<8; i++){
    var t=readBit();
    b=b|(t<<i);
  }
  return b;
}

function writeByte(b,d){ //writes a byte using writeBit. Unaligned.
  for (var i=0; i<8; i++){
    var res=b>>>i;
    writeBit((res&1),d); 
  }
}

//And finally the download mechanism I'm using.
function downloadResult(){//download processed file with specified extension
  var blobObject = new Blob([output], {type: 'application/octet-stream'});
  var n=source_name.split('\\').pop().split('/').pop();
  if (doEncode){
    n=n+fext
  }else{
    n=n.replace(fext,"");
  }
  var a = document.createElement("a");
  a.setAttribute("href", URL.createObjectURL(blobObject));
  a.setAttribute("download", n);
  a.click();
  running=false;
  var b=document.getElementById("ac");
  if (b.classList.contains("activeNav")){
    clearRes();
  }
}

I basically want to rip most of that out and replace it with something that can read bytes or medium-ish chunks of data out of the file that the user selects, and then when it gets to the actual output stage, trickle that data byte-by-byte through a more-or-less vanilla download to their download folder.

I do know that multiple files can be selected in a file input box, so perhaps if it's possible to download to a subfolder I could work out how to make an in-browser file archiver for the heck of it. Wouldn't that be fun! ...Mind, I'm fairly sure it's not possible (I don't see why you shouldn't be able to create a subdirectory in the browser downloads folder from the webpage, but there's probably a security reason).

Let me know if you need to see more code, but as this is a class project I don't want to get accused of plagiarizing my own app...

2 Answers


To read from the disk as a stream, you can use the Blob.stream() method, which returns a ReadableStream from that Blob (or File):

const inp = document.getElementById( "inp" );
inp.onchange = async (evt) => {
  const stream = inp.files[ 0 ].stream();
  const reader = stream.getReader();
  while( true ) {
    const { done, value } = await reader.read();
    if( done ) { break; }
    handleChunk( value );
  }
  console.log( "all done" );
};

function handleChunk( buf ) {
  console.log( "received a new buffer", buf.byteLength );
}
<input type="file" id="inp">

For older browsers that don't support this method, you can still read the File in chunks using only its .slice() method:

const inp = document.getElementById( "inp" );
inp.onchange = async (evt) => {
  const file = inp.files[ 0 ];
  const chunksize = 64 * 1024;
  let offset = 0;
  while( offset < file.size ) {
    const chunkfile = file.slice( offset, offset + chunksize ); // .slice() is synchronous, no await needed
    // Blob.arrayBuffer() can be polyfilled with a FileReader
    const chunk = await chunkfile.arrayBuffer();
    handleChunk( chunk );
    offset += chunksize;
  }
  console.log( "all done" );
};

function handleChunk( buf ) {
  console.log( "received a new buffer", buf.byteLength );
}
<input type="file" id="inp">

Writing to disk as a stream, however, is a bit harder.

There is a great hack by Jimmy Wärting called StreamSaver.js, which uses Service Workers. I'm not sure how far back its browser support goes, though, and while awesome, it's still a "hack" and requires a Service Worker to run.
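For a rough idea of the shape of its API, here is a minimal sketch, assuming the StreamSaver.js library is loaded via its script tag and exposes the global streamSaver; the filename and the chunk source are placeholders:

// Assumes StreamSaver.js is loaded, exposing the global `streamSaver`.
async function saveWithStreamSaver(chunks) {
  // "output.huff" is a placeholder filename.
  const fileStream = streamSaver.createWriteStream( "output.huff" );
  const writer = fileStream.getWriter();
  for (const chunk of chunks) { // chunks: any iterable of Uint8Array
    // Buffering a few KB per write is faster than writing single bytes.
    await writer.write( chunk );
  }
  await writer.close(); // finalizes the download
}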

An easier way to do so is to use the File System Access API, which is still being specified and is currently only available in Chrome. You can see this Q/A for a code example.
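As a sketch of that approach (the API surface may still change while the spec is in flux, and the filename and chunk source here are illustrative):

async function saveWithFilePicker(chunks) {
  // Prompts the user for a save location (Chrome-only at the time of writing).
  const handle = await window.showSaveFilePicker( { suggestedName: "output.huff" } );
  const writable = await handle.createWritable();
  for (const chunk of chunks) { // accepts BufferSource, Blob, or string chunks
    await writable.write( chunk );
  }
  await writable.close(); // data is only committed to disk on close
}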

Kaiido
  • Can you clarify the mechanism used in StreamSaver? I tried looking through that, but it's horribly complex, and my JS isn't _that_ good! Also, what input data types does it support? I need something I can write individual bytes of (as explained, I already have a way to get bit-level access if I have bytes). As for the blob.stream(), what governs the buffer size (I'd assume buffer is an ArrayBuffer so that I can run a DataView)? Most importantly, though, is it possible to read the input stream multiple times? I can't encode it until I have a frequency table, which requires reading it... –  Dec 01 '20 at 22:23
  • [How StreamSaver works](https://github.com/jimmywarting/StreamSaver.js#how-does-it-work). For the input types it accepts, its stdin is a [WritableStreamDefaultWriter](https://developer.mozilla.org/en-US/docs/Web/API/WritableStreamDefaultWriter), so it accepts *chunks* of binary data of any type and any length the same way. (Though writing every byte individually will be slower than writing big chunks, so you may prefer buffering a bit (e.g. a few KB) and writing these bigger buffers instead.) – Kaiido Dec 02 '20 at 01:49
  • For `Blob.stream()` the chunk size is not defined by specs [yet](https://github.com/w3c/FileAPI/issues/144), and Chrome apparently has it variable now, while the first chunk is still 64KB, which also seems to be the default in FF. So you may have to check the size of the received buffer if it's important for you. And you can't read the same input stream multiple times, but you can store the received chunks in a bigger ArrayBuffer if you need, and you can also request a new stream from the same file, or only a portion of it (using its .slice() method; see the sketch after these comments). – Kaiido Dec 02 '20 at 01:52
  • Here's a polyfill for Blob.stream based on @kaiido's answer above, it seems to work for me https://gist.github.com/dumbmatter/3b1d90bf7cfb6c5951bfffeefecf40aa - this assumes you have a ReadableStream polyfill already loaded, if necessary – dumbmatter Oct 19 '21 at 17:08
  • Is there a benefit of using `File.stream()` over just `File.slice()` when the goal is chunked reading? – nh2 Mar 24 '23 at 15:17
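Below is a minimal sketch of the multi-pass pattern described in the comments above: request a fresh ReadableStream from the same File object for each pass (the pass callbacks countFrequencies and encodeChunk are hypothetical):

// Run one pass over the file, chunk by chunk.
async function forEachChunk(file, handleChunk) {
  const reader = file.stream().getReader(); // a fresh stream on every call
  while( true ) {
    const { done, value } = await reader.read();
    if( done ) { break; }
    handleChunk( value ); // value is a Uint8Array
  }
}

// Two passes over the same File: count frequencies, then encode.
async function process(file) {
  await forEachChunk( file, countFrequencies ); // pass 1 (hypothetical callback)
  await forEachChunk( file, encodeChunk );      // pass 2 (hypothetical callback)
}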

There is now a Streams API in JavaScript, already supported by modern browsers.

Mozilla Streams MDN with samples

// Set up your stream with options; the optional queuingStrategy helps manage
// buffering and back-pressure (useful for the size limitations mentioned above).
// Constructor signature:
// var readableStream = new ReadableStream(underlyingSource[, queuingStrategy]);

fetch("https://www.example.org/").then((response) => {
  const reader = response.body.getReader();
  const stream = new ReadableStream({
    start(controller) {
      // The following function handles each data chunk
      function push() {
        // "done" is a Boolean and value a "Uint8Array"
        reader.read().then(({ done, value }) => {
          // Is there no more data to read?
          if (done) {
            // Tell the browser that we have finished sending data
            controller.close();
            return;
          }

          // Get the data and send it to the browser via the controller
          controller.enqueue(value);
          push();
        });
      };
      
      push();
    }
  });

  return new Response(stream, { headers: { "Content-Type": "text/html" } });
});
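To hand a stream like this to the user as a download (as the question asks), one simple sketch is to buffer it back into a Blob with new Response(stream).blob() and download it via an object URL. Note that this keeps everything in memory, so it does not remove the size limitation the question wants to escape:

// Sketch: turn a ReadableStream into a client-side download.
// Caveat: Response.blob() buffers the entire stream in memory first.
async function downloadStream(stream, filename) {
  const blob = await new Response(stream).blob();
  const a = document.createElement("a");
  a.href = URL.createObjectURL(blob);
  a.download = filename;
  a.click();
  // Revoke later so the download has time to start.
  setTimeout(() => URL.revokeObjectURL(a.href), 1000);
}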
Transformer