I am writing a module that needs to process a large number of zip files quickly. Because of that, I planned to use an extractor implemented in C rather than in Python (I will be calling the extractor from Python). To find out which method is fastest, I wrote a test script comparing Linux's 'unzip' command with the czipfile Python module (a wrapper around a C zip extractor). As a control, I used the native Python zipfile module.
The script creates a zip file of around 100MB out of 100 files of ~1MB each. It looks at three scenarios: A) the files are all just random bytestrings; B) the files are just random hex characters; C) the files are uniform random sentences with line breaks.
In all cases, the performance of zipfile (implemented in Python) was on par with or significantly better than that of the two extractors implemented in C.
Any ideas why this could be happening? The script is attached. It requires czipfile and the 'unzip' command to be available in the shell.
from datetime import datetime
import zipfile
import czipfile
import os, binascii, random
class ZipTestError(Exception):
    """Raised by the zip benchmark when it is asked to do something invalid."""
class ZipTest:
    """Benchmark harness that times three ways of extracting a zip archive.

    For each data type the harness writes ``n_files`` files into the current
    working directory, packs them into a deflated zip archive, and then
    extracts that archive once with every extractor listed in ``procs``:
    the ``zipfile`` module, the ``czipfile`` module and the external
    ``unzip`` command.

    Data types (the keys of ``type_map``):
      * ``'r'`` - raw bytes straight out of ``os.urandom``.
      * ``'h'`` - ``os.urandom`` bytes converted to ASCII hex characters.
      * ``'s'`` - randomly constructed sentences separated by line breaks.

    Instantiating the class runs the whole benchmark and prints the results.
    The syntax is kept valid on both Python 2 and Python 3 (single-argument
    ``print(...)``, floor division).
    """

    # Extractor names; each one maps to an ``unzip_with_<name>`` method.
    procs = ['zipfile', 'czipfile', 'os']
    # Human-readable label for every supported data type.
    type_map = {'r': 'Random', 'h': 'Random Hex', 's': 'Sentences'}
    # Workload size. Kept as class attributes so that a caller can shrink the
    # benchmark without editing the methods.
    n_files = 100          # files per archive
    file_size = 1048576    # random bytes drawn per 'r' / 'h' file
    n_sentences = 76260    # sentences written per 's' file

    def __init__(self):
        """Run the benchmark for every data type and print the timings."""
        print("Testing Random Byte Files:\n")
        for t in ('r', 'h', 's'):
            self.test(t)

    @staticmethod
    def rand_name():
        """Return a random 20-character lowercase hex string.

        The name is built as a native ``str`` so it can be used both as a
        file name and as a zip member name.
        """
        return ''.join('{0:02x}'.format(b) for b in bytearray(os.urandom(10)))

    @staticmethod
    def _to_millis(delta):
        """Convert a ``timedelta`` to whole milliseconds (rounded down)."""
        micros = (delta.days * 86400 + delta.seconds) * 1000000
        micros += delta.microseconds
        return micros // 1000

    def make_file(self, t):
        """Create one randomly named data file of type ``t``.

        Returns the name of the new file (relative to the current directory).
        Raises ``ZipTestError`` when ``t`` is not 'r', 'h' or 's'.
        """
        # Build the payload first, so an invalid type neither leaks an open
        # handle nor leaves an empty file behind.
        if t == 'r':
            payload = os.urandom(self.file_size)
        elif t == 'h':
            payload = binascii.b2a_hex(os.urandom(self.file_size))
        elif t == 's':
            subjects = ['dog', 'cat', 'rat']
            foods = ['meat', 'wood', 'fish']
            # random.choice picks uniformly; the former
            # int(random.random()*10) % 3 favoured index 0 (4 cases in 10).
            text = ''.join(
                'The {0} eats {1}\n'.format(random.choice(subjects),
                                           random.choice(foods))
                for _ in range(self.n_sentences))
            payload = text.encode('ascii')
        else:
            raise ZipTestError('Invalid Type')
        f_name = self.rand_name()
        # Binary mode: the payload is raw bytes and must be stored untouched.
        with open(f_name, 'wb') as f:
            f.write(payload)
        return f_name

    def create_zip_test(self, t):
        """Create ``n_files`` files of type ``t`` and zip them together.

        Sets ``self.file_names`` (the source files), ``self.output_names``
        (emptied) and ``self.zip_name`` (the new archive).
        """
        self.file_names = []
        self.output_names = []
        # Append one at a time so that files already written stay tracked
        # for clean_up() even if a later make_file() call fails.
        for _ in range(self.n_files):
            self.file_names.append(self.make_file(t))
        self.zip_name = self.rand_name()
        archive = zipfile.ZipFile(self.zip_name, 'w', zipfile.ZIP_DEFLATED)
        try:
            for name in self.file_names:
                archive.write(name)
        finally:
            archive.close()

    def clean_up(self, rem_zip=False):
        """Delete the tracked source and extracted files.

        With ``rem_zip`` true the archive itself is deleted as well and
        ``self.zip_name`` is reset to ``False``.
        """
        for attr in ('file_names', 'output_names'):
            # getattr default: tolerate a call made before any files exist.
            for path in getattr(self, attr, []):
                os.remove(path)
            setattr(self, attr, [])
        if rem_zip and getattr(self, 'zip_name', False):
            os.remove(self.zip_name)
            self.zip_name = False

    def display_res(self, res, t):
        """Print the timing of every extractor for data type ``t``.

        ``res`` maps each name in ``procs`` to a duration in milliseconds.
        """
        print("\n{0} results:\n".format(self.type_map[t]))
        for p in self.procs:
            print("\n{0} = {1} milliseconds".format(p, str(res[p])))

    def test(self, t):
        """Build, benchmark, report and clean up for data type ``t``."""
        self.create_zip_test(t)
        res = self.unzip()
        self.display_res(res, t)
        self.clean_up(rem_zip=True)

    def unzip(self):
        """Time every extractor in ``procs``; return ``{name: milliseconds}``."""
        res = dict()
        for p in self.procs:
            # Remove the source files / previous output so that each
            # extractor writes into a clean directory.
            self.clean_up()
            res[p] = getattr(self, "unzip_with_{0}".format(p))()
        return res

    def unzip_with_zipfile(self):
        """Time extraction with the ``zipfile`` module."""
        return self.unzip_with_python(zipfile)

    def unzip_with_czipfile(self):
        """Time extraction with the ``czipfile`` module."""
        return self.unzip_with_python(czipfile)

    def unzip_with_python(self, mod):
        """Extract the archive with ``mod.ZipFile`` and return milliseconds.

        ``mod`` is a module exposing a ``zipfile``-style ``ZipFile`` class.
        Opening the archive is part of the timed region, because the
        external ``unzip`` command has to open and parse the archive inside
        its own timed run as well.
        """
        dest = './'
        start = datetime.now()
        with open(self.zip_name, 'rb') as fh:
            zf = mod.ZipFile(fh)
            try:
                names = zf.namelist()
                for name in names:
                    zf.extract(name, dest)
            finally:
                zf.close()
        elapsed = datetime.now() - start
        # Bookkeeping for clean_up() stays outside the timed region.
        self.output_names.extend(names)
        return self._to_millis(elapsed)

    def unzip_with_os(self):
        """Extract the archive with the ``unzip`` command; return milliseconds.

        Raises ``ZipTestError`` when the command reports a non-zero status,
        since the measured time would be meaningless in that case.
        """
        # Collect the member names (needed only for clean_up) before the
        # clock starts, so that only the external command is measured.
        zf = zipfile.ZipFile(self.zip_name)
        try:
            names = zf.namelist()
        finally:
            zf.close()
        start = datetime.now()
        status = os.system("unzip -qq {0}".format(self.zip_name))
        elapsed = datetime.now() - start
        if status != 0:
            raise ZipTestError(
                "'unzip' failed with status {0}".format(status))
        self.output_names.extend(names)
        return self._to_millis(elapsed)
# Run the complete benchmark only when the file is executed as a script;
# constructing ZipTest is what starts the benchmark.
if __name__ == '__main__':
    ZipTest()