2

I have a zip file which may contain files of the type I want, but I don't know yet. When I open these files (when unzipped) in notepad, the first 8 characters are always the same. Is there a way to find out whether the first 8 characters match without unzipping the whole (potentially several gb in size) file?

The files are often over 4 GB and might have been compressed using Deflate64. This means I can't use SharpZipLib or DotNetZip - I have tried both, and both failed on the same file.

I've been trying to use sevenzipsharp in the following way, but it only gave me zeros at the start of every file. Also e.Cancel did not cancel, so it ended up unzipping the whole thing into mstream anyway, which I would like to avoid.

// For every entry in the archive, start extracting it into an in-memory stream and,
// from the Extracting progress callback, check whether the entry's content starts
// with the expected 8-character signature.
SevenZipExtractor extractor = new SevenZipExtractor(zipfilename);

foreach (ArchiveFileInfo info in extractor.ArchiveFileData)
{
    bool isMyFileType = false;
    MemoryStream mstream = new MemoryStream();

    // NOTE(review): a new handler is attached on every iteration and never detached, so
    // handlers created for earlier entries (closing over their own info/mstream) are still
    // subscribed when later entries are extracted. Consider detaching after the await.
    extractor.Extracting += (object sender, ProgressEventArgs e) =>
    {
        // Only look at the data once the reported progress corresponds to more than 32 bytes.
        if (e.PercentDone * info.Size / 100 > 32)
        {
            // Copy the first (up to) 32 bytes written so far into a zero-padded buffer.
            // The bytes are taken from the stream's backing array instead of mstream.Read():
            // Read() starts at the current position, which after writing is at the end of the
            // data, so it returned 0 bytes and left the buffer full of zeros.
            // GetBuffer() is usable here because mstream was created with the default constructor.
            // NOTE(review): assumes this callback runs on the thread that writes to mstream — confirm.
            byte[] buffer = new byte[32];
            byte[] written = mstream.GetBuffer();
            int available = mstream.Length >= 32 ? 32 : (int)mstream.Length;

            for (int i = 0; i < available; i++)
            {
                buffer[i] = written[i];
            }

            // Wrap the copied bytes in a StreamReader so they are decoded as text.
            MemoryStream memstream = new MemoryStream(buffer);
            StreamReader file = new StreamReader(memstream);

            // Read at most 8 characters from the start of the entry.
            string filestart = "";
            for (int i = 0; i < 8; i++)
            {
                if (!file.EndOfStream)
                {
                    filestart = filestart + ((char)file.Read()).ToString();
                }
            }

            isMyFileType = (filestart == "My8chars");

            // Ask the extractor to stop; the signature check needs no more data.
            // NOTE(review): the accompanying question text reports that this did not
            // actually stop the extraction — verify against the SevenZipSharp version in use.
            e.Cancel = true;
        }
    };

    await CheckForMyFileType(info, mstream, extractor);


    if (isMyFileType)
    {
        //do stuff if it's the right file type

/// <summary>
/// Extracts the archive entry described by <paramref name="info"/> into
/// <paramref name="mstream"/> and returns a task that is already completed.
/// </summary>
/// <param name="info">Archive entry whose <c>FileName</c> selects the file to extract.</param>
/// <param name="mstream">Stream that receives the extracted data.</param>
/// <param name="extractor">Extractor used to perform the extraction.</param>
/// <returns>A completed task; the extraction itself runs synchronously inside this call.</returns>
private Task CheckForMyFileType(ArchiveFileInfo info, MemoryStream mstream, SevenZipExtractor extractor)
{
    // The extraction call returns only after it has finished, so there is nothing left to await.
    extractor.ExtractFile(info.FileName, mstream);

    return Task.FromResult(true);
}
H H
  • 263,252
  • 30
  • 330
  • 514
BeanFrog
  • 2,297
  • 12
  • 26
  • 1
    http://stackoverflow.com/questions/5967864/how-to-read-data-from-a-zip-file-without-having-to-unzip-the-entire-file – CodeCaster Jan 05 '16 at 11:14
  • @CodeCaster - I found that one, but the answers only tell you how to pick a certain file from an archive. I've got that already in my code example, I want to just unzip the first part of that file. – BeanFrog Jan 05 '16 at 11:25
  • One of the answers shows SharpZipLib's `zipfile.GetInputStream(item)`, which you can use to obtain a stream. The file will be extracted as you read from the stream. – CodeCaster Jan 05 '16 at 11:26
  • I'll give that a go - thanks! – BeanFrog Jan 05 '16 at 11:28
  • Unfortunately that doesn't help after all, because not all compression types are supported. My really large zip files use deflate64, which neither SharpZipLib nor DotNetZip seem to support. – BeanFrog Jan 06 '16 at 10:51
  • Although for people looking who wouldn't be bothered by that, both worked fine for files with supported compression. – BeanFrog Jan 06 '16 at 10:54
  • Alright, that's too bad. – CodeCaster Jan 06 '16 at 10:55

0 Answers0