I need to read a huge file, remove some whitespace characters, compress the content and create a double SHA256 hash from it. To avoid loading the whole file into memory (which is impossible, as some of the files have several hundreds of mb) and to avoid a performance bottleneck, i wish to read the whole file only once. So i dediced to wrap a FileStream
and do the work. The DeflaterOutputStream
comes from the SharpZipLib.
public class DeflateAndHashStream : Stream
{
private readonly FileStream _input;
private readonly MemoryStream _compressedFile;
private readonly DeflaterOutputStream _deflate;
public DeflateAndHashStream(FileStream input)
{
_input = input;
_compressedFile = new MemoryStream();
_deflate = new DeflaterOutputStream(_compressedFile);
}
public override void Flush()
{
throw new NotImplementedException();
}
public override long Seek(long offset, SeekOrigin origin)
{
throw new NotImplementedException();
}
public override void SetLength(long value)
{
throw new NotImplementedException();
}
public override int Read(byte[] buffer, int offset, int count)
{
byte[] internalBuffer = new byte[buffer.Length];
//read bufferbytes from the file
int readedBytes = _input.Read(internalBuffer, 0, buffer.Length);
//Are we done reading the file?
if (readedBytes == 0) return 0;
//remove whitespaces from internalBuffer
//method takes bytearray and cut whitespace chars in place
//returns the count of removed characters
//so this line corrects the readed bytes
readedBytes -= RemoveWhitespace(ref internalBuffer);
//make the result available for stream chaining
buffer = internalBuffer;
//in parall compress the file into internal memorystream
_deflate.Write(internalBuffer, 0, internalBuffer.Length);
return readedBytes;
}
public byte[] GetCompressedData()
{
_deflate.Flush();
_deflate.Finish();
return _compressedFile.ToArray();
}
public override void Write(byte[] buffer, int offset, int count)
{
throw new NotImplementedException();
}
public override bool CanRead => _input.CanRead;
public override bool CanSeek => _input.CanSeek;
public override bool CanWrite => _input.CanWrite;
public override long Length => _input.Length;
public override long Position
{
get => _input.Position;
set => _input.Position = value;
}
private int RemoveWhitespace(ref byte[] digest)
{
var output = new MemoryStream();
int removedWhiteSpaces = 0;
foreach (var actualByte in digest)
switch (actualByte)
{
case 10:
case 13:
case 26:
// ignore this character
removedWhiteSpaces++;
break;
default:
output.WriteByte(actualByte);
break;
}
digest = output.ToArray();
return removedWhiteSpaces;
}
}
Im calling the DeflateAndHashStream
like this:
public string[] CreateHashAndZipFile(string filePath)
{
if(!File.Exists(filePath)) throw new FileNotFoundException();
string[] result = new string[2];
using (FileStream fs = new FileStream(filePath, FileMode.Open))
{
using (DeflateAndHashStream defhash = new DeflateAndHashStream(fs))
{
using (SHA256 sha = new SHA256Managed())
{
result[0] = Convert.ToBase64String(sha.ComputeHash(sha.ComputeHash(defhash)));
result[1] = Convert.ToBase64String(defhash.GetCompressedData());
}
}
}
return result;
}
Unfortunately this creates a completey different hash as:
string test1;
using (FileStream fs = new FileStream("test.txt", FileMode.Open))
{
using (SHA256Managed sha = new SHA256Managed())
{
test1 = Convert.ToBase64String(sha.ComputeHash(sha.ComputeHash(fs)));
}
}
With a File test.txt containing only a single A (0x65)
What am i doing wrong?