I have several gz files on which decompression seems to not be properly working. I have selected one of those, this file is in a local folder and it came from an external source. I don't know the compression process.
I have created a python script to illustrate the situation. In order to have a reference file for testing I have uncompressed the gz file and compressed it again (using gzip on Ubuntu) to generate the same gz file in my computer. Those two files present different behaviors:
import gzip
import zlib
import hashlib
def md5(content):
m = hashlib.md5()
m.update(content)
return m.hexdigest()
def decompress_gzip_size(file_name):
with gzip.open(file_name, 'rb') as f_out:
f_content = f_out.read()
print(len(f_content), file_name)
print (md5(f_content), file_name)
def decompress_open_gzip_size(file_name):
with open(file_name, 'rb') as f_out:
f_content = f_out.read()
unzip_content = gzip.decompress(f_content)
print(len(unzip_content), file_name)
print (md5(unzip_content), file_name)
def decompress_zlib_size(file_name):
with open(file_name, 'rb') as f_out:
f_content = f_out.read()
unzip_content = zlib.decompress(f_content, 32)
print(len(unzip_content), file_name)
print (md5(unzip_content), file_name)
def decompress_zlib_obj(file_name):
decompress_obj = zlib.decompressobj(32)
with open(file_name, 'rb') as f_out:
f_content = f_out.read()
unzip_content = decompress_obj.decompress(f_content)
print(len(unzip_content), file_name)
print(len(decompress_obj.unused_data), 'Unused data')
print (md5(unzip_content), file_name)
external_file = 'external_source_compress.gz'
my_file = 'my-compress-file.gz'
print("decompress_gzip_size")
decompress_gzip_size(my_file)
decompress_gzip_size(external_file)
print("*" * 60)
print("decompress_open_gzip_size")
decompress_open_gzip_size(my_file)
decompress_open_gzip_size(external_file)
print("*" * 60)
print("decompress_zlib_size")
decompress_zlib_size(my_file)
decompress_zlib_size(external_file)
print("*" * 60)
print("decompress_zlib_obj")
decompress_zlib_obj(my_file)
decompress_zlib_obj(external_file)
print("*" * 60)
Execution output is:
decompress_gzip_size
167019534 my-compress-file.gz
a4dd17dd28b89f0b2c300b607cd1a8ba my-compress-file.gz
167019534 external_source_compress.gz
a4dd17dd28b89f0b2c300b607cd1a8ba external_source_compress.gz
************************************************************
decompress_open_gzip_size
167019534 my-compress-file.gz
a4dd17dd28b89f0b2c300b607cd1a8ba my-compress-file.gz
167019534 external_source_compress.gz
a4dd17dd28b89f0b2c300b607cd1a8ba external_source_compress.gz
************************************************************
decompress_zlib_size
167019534 my-compress-file.gz
a4dd17dd28b89f0b2c300b607cd1a8ba my-compress-file.gz
33408639 external_source_compress.gz
4f51ccc64a7baab5ee5e2ce31e816409 external_source_compress.gz
### SIZES AND MD5 DO NOT MATCH ###
************************************************************
decompress_zlib_obj
167019534 my-compress-file.gz
0 Unused data
a4dd17dd28b89f0b2c300b607cd1a8ba my-compress-file.gz
33408639 external_source_compress.gz
46765202 Unused data
4f51ccc64a7baab5ee5e2ce31e816409 external_source_compress.gz
### THERE IS SOME UNUSED DATA IN THE ORIGINAL FILE ###
************************************************************
NOTE: zlib decompression fails if window size parameters is any other than 32.
Without setting the window-size this happend:
Traceback (most recent call last):
File "decompress_python.py", line 53, in <module>
decompress_zlib_size(my_file)
File "decompress_python.py", line 26, in decompress_zlib_size
unzip_content = zlib.decompress(f_content)
zlib.error: Error -3 while decompressing data: incorrect header check
As it can be seen. Without 32 windows-size, the script crash, but with window-size 32 it finish, but the amount of read bytes is just a portion of the real amount of data.
File has only one element inside:
gzip -l external_source_compress.gz
compressed uncompressed ratio uncompressed_name
58609586 33410520 -75.4% external_source_compress
Could anyone help me to understand what is happening here? I'm truly lost. Thanks in advance.