0

I need to read a huge file, remove some whitespace characters, compress the content and create a double SHA256 hash from it. To avoid loading the whole file into memory (which is impossible, as some of the files have several hundreds of mb) and to avoid a performance bottleneck, i wish to read the whole file only once. So i dediced to wrap a FileStream and do the work. The DeflaterOutputStream comes from the SharpZipLib.

public class DeflateAndHashStream : Stream
{
    private readonly FileStream _input;
    private readonly MemoryStream _compressedFile;
    private readonly DeflaterOutputStream _deflate;

    public DeflateAndHashStream(FileStream input)
    {
        _input = input;
        _compressedFile = new MemoryStream();
        _deflate = new DeflaterOutputStream(_compressedFile);
    }

    public override void Flush()
    {
        throw new NotImplementedException();
    }

    public override long Seek(long offset, SeekOrigin origin)
    {
        throw new NotImplementedException();
    }

    public override void SetLength(long value)
    {
        throw new NotImplementedException();
    }

    public override int Read(byte[] buffer, int offset, int count)
    {
        byte[] internalBuffer = new byte[buffer.Length];

        //read bufferbytes from the file
        int readedBytes = _input.Read(internalBuffer, 0, buffer.Length);

        //Are we done reading the file?
        if (readedBytes == 0) return 0;

        //remove whitespaces from internalBuffer
        //method takes bytearray and cut whitespace chars in place
        //returns the count of removed characters
        //so this line corrects the readed bytes
        readedBytes -= RemoveWhitespace(ref internalBuffer);

        //make the result available for stream chaining
        buffer = internalBuffer;
        //in parall compress the file into internal memorystream
        _deflate.Write(internalBuffer, 0, internalBuffer.Length);

        return readedBytes;
    }

    public byte[] GetCompressedData()
    {
        _deflate.Flush();
        _deflate.Finish();
        return _compressedFile.ToArray();
    }

    public override void Write(byte[] buffer, int offset, int count)
    {
        throw new NotImplementedException();
    }

    public override bool CanRead => _input.CanRead;
    public override bool CanSeek => _input.CanSeek;
    public override bool CanWrite => _input.CanWrite;
    public override long Length => _input.Length;
    public override long Position
    {
        get => _input.Position;
        set => _input.Position = value;
    }

    private int RemoveWhitespace(ref byte[] digest)
    {
        var output = new MemoryStream();
        int removedWhiteSpaces = 0;
        foreach (var actualByte in digest)
            switch (actualByte)
            {
                case 10:
                case 13:
                case 26:
                    // ignore this character
                    removedWhiteSpaces++;
                    break;

                default:
                    output.WriteByte(actualByte);
                    break;
            }

        digest = output.ToArray();
        return removedWhiteSpaces;
    }
}

Im calling the DeflateAndHashStream like this:

public string[] CreateHashAndZipFile(string filePath)
    {
        if(!File.Exists(filePath)) throw new FileNotFoundException();

        string[] result = new string[2];

        using (FileStream fs = new FileStream(filePath, FileMode.Open))
        {
            using (DeflateAndHashStream defhash = new DeflateAndHashStream(fs))
            {
                using (SHA256 sha = new SHA256Managed())
                {
                    result[0] = Convert.ToBase64String(sha.ComputeHash(sha.ComputeHash(defhash)));
                    result[1] = Convert.ToBase64String(defhash.GetCompressedData());
                }
            }
        }

        return result;
    }

Unfortunately this creates a completey different hash as:

string test1;
using (FileStream fs = new FileStream("test.txt", FileMode.Open))
{
    using (SHA256Managed sha = new SHA256Managed())
    {
        test1 = Convert.ToBase64String(sha.ComputeHash(sha.ComputeHash(fs)));
    }
}

With a File test.txt containing only a single A (0x65)

What am i doing wrong?

Kris
  • 512
  • 7
  • 16
  • Your `DeflateAndHashStream` potentially removes characters from the stream. Are you completely sure that your file does not contain a newline or linefeed at the end? – fredrik Jan 16 '20 at 11:02
  • Nope its a simple test file conainting a single A. – Kris Jan 16 '20 at 11:04
  • `buffer = internalBuffer;` will not do what you expect. It will only change `buffer` locally. – fredrik Jan 16 '20 at 11:08
  • Do i miss something? Isn't the buffer passed to Stream.Read() intended to get filled ? – Kris Jan 16 '20 at 11:14
  • Yes, but you can't replace it that way. You have to copy the data into it. – fredrik Jan 16 '20 at 11:14
  • See [docs](https://learn.microsoft.com/en-us/dotnet/csharp/programming-guide/classes-and-structs/passing-reference-type-parameters) for more information. – fredrik Jan 16 '20 at 11:15
  • Well you are right. And that solves the problem. Thank you very much. – Kris Jan 16 '20 at 11:17
  • I know the difference. Just thought buffer is declared as a out var... that was my mistake. – Kris Jan 16 '20 at 11:18

0 Answers0