The best way to figure something like this out is to just write a bunch of tests and use timeit
to see which is faster. I ran some tests below, but you should try this with your lexicon dict, as your results may vary.
If you want the times to be more stable (and therefore more accurate), you can increase the `number` argument to `timeit` -- it will just make the test take longer. Also, note that the value returned by `timeit` is the total execution time in seconds for all `number` runs, not the time per run.
testing with 10 keys...
serialize flat: 2.97198390961
serialize eval: 4.60271120071
serialize defaultdict: 20.3057091236
serialize dict: 20.2011070251
serialize defaultdict new pickle: 14.5152060986
serialize dict new pickle: 14.7755970955
serialize json: 13.5039670467
serialize cjson: 4.0456969738
unserialize flat: 1.29577493668
unserialize eval: 25.6548647881
unserialize defaultdict: 10.2215960026
unserialize dict: 10.208122015
unserialize defaultdict new pickle: 5.70747089386
unserialize dict new pickle: 5.69750404358
unserialize json: 5.34811091423
unserialize cjson: 1.50241613388
testing with 100 keys...
serialize flat: 2.91076397896
serialize eval: 4.72978711128
serialize defaultdict: 21.331786871
serialize dict: 21.3218340874
serialize defaultdict new pickle: 15.7140991688
serialize dict new pickle: 15.6440980434
serialize json: 14.3557379246
serialize cjson: 5.00576901436
unserialize flat: 1.6677339077
unserialize eval: 22.9142649174
unserialize defaultdict: 10.7773029804
unserialize dict: 10.7524499893
unserialize defaultdict new pickle: 6.13370203972
unserialize dict new pickle: 6.18057107925
unserialize json: 5.92281794548
unserialize cjson: 1.91151690483
Code:
import cPickle
import json
try:
import cjson # not Python standard library
except ImportError:
cjson = False
from collections import defaultdict
# Build two lexicons holding the same one million string values:
# dd1 spreads them over 10 keys, dd2 over 100 keys.
dd1, dd2 = defaultdict(list), defaultdict(list)
for i in xrange(1000000):
    text = str(i)
    dd1[str(i % 10)].append(text)
    dd2[str(i % 100)].append(text)

# Plain-dict counterparts, so dict and defaultdict can be timed separately.
dt1, dt2 = dict(dd1), dict(dd2)
from timeit import timeit
def testdict(dd, dt, number=10):
    """Time several ways of writing and re-reading a dict of string lists.

    Every format is written to, and read back from, a fixed file name in the
    current working directory. For each format one line is printed with the
    total time ``timeit`` measured over ``number`` runs. Each reader asserts
    that the loaded data equals the original, so the read timings include
    that comparison.

    Args:
        dd: Mapping that is pickled under the "defaultdict" labels (the
            callers in this script pass a ``defaultdict(list)``).
        dt: Plain ``dict`` of str -> list of str used for every other format.
        number: How many times ``timeit`` runs each function (default 10).

    Note:
        The "flat" format joins and splits on whitespace, and the "eval"
        format splits each line on a tab, so they only round-trip when keys
        and values do not contain those characters.
    """

    # --- writers ---------------------------------------------------------
    # Pickle files are opened in binary mode: protocol -1 selects the highest
    # (binary) protocol, and text mode may translate newline bytes on
    # platforms such as Windows, corrupting the stream.
    def serialize_defaultdict():
        with open('defaultdict.pickle', 'wb') as f:
            cPickle.dump(dd, f)

    def serialize_p2_defaultdict():
        with open('defaultdict.pickle2', 'wb') as f:
            cPickle.dump(dd, f, -1)

    def serialize_dict():
        with open('dict.pickle', 'wb') as f:
            cPickle.dump(dt, f)

    def serialize_p2_dict():
        with open('dict.pickle2', 'wb') as f:
            cPickle.dump(dt, f, -1)

    def serialize_json():
        with open('dict.json', 'w') as f:
            json.dump(dt, f)

    if cjson:
        def serialize_cjson():
            with open('dict.cjson', 'w') as f:
                f.write(cjson.encode(dt))

    def serialize_flat():
        # One line per key: the key followed by its values, space separated.
        with open('dict.flat', 'w') as f:
            f.write('\n'.join([' '.join([k] + v) for k, v in dt.iteritems()]))

    def serialize_eval():
        # One line per key: key, a tab, then the repr() of the value list.
        with open('dict.eval', 'w') as f:
            f.write('\n'.join([k + '\t' + repr(v) for k, v in dt.iteritems()]))

    # --- readers ---------------------------------------------------------
    def unserialize_defaultdict():
        with open('defaultdict.pickle', 'rb') as f:
            assert cPickle.load(f) == dd

    def unserialize_p2_defaultdict():
        with open('defaultdict.pickle2', 'rb') as f:
            assert cPickle.load(f) == dd

    def unserialize_dict():
        with open('dict.pickle', 'rb') as f:
            assert cPickle.load(f) == dt

    def unserialize_p2_dict():
        with open('dict.pickle2', 'rb') as f:
            assert cPickle.load(f) == dt

    def unserialize_json():
        with open('dict.json') as f:
            assert json.load(f) == dt

    if cjson:
        def unserialize_cjson():
            with open('dict.cjson') as f:
                assert cjson.decode(f.read()) == dt

    def unserialize_flat():
        with open('dict.flat') as f:
            dtx = {}
            for line in f:
                vals = line.split()
                dtx[vals[0]] = vals[1:]
            assert dtx == dt

    def unserialize_eval():
        with open('dict.eval') as f:
            dtx = {}
            for line in f:
                vals = line.split('\t')
                # NOTE: eval() executes whatever the file contains. That is
                # tolerable here only because this benchmark wrote the file
                # itself; never do this with data from an untrusted source.
                dtx[vals[0]] = eval(vals[1])
            assert dtx == dt

    # --- timing report ---------------------------------------------------
    # Writers run first so that every file exists before its reader is timed.
    benchmarks = [
        ('serialize flat:', serialize_flat),
        ('serialize eval:', serialize_eval),
        ('serialize defaultdict:', serialize_defaultdict),
        ('serialize dict:', serialize_dict),
        ('serialize defaultdict new pickle:', serialize_p2_defaultdict),
        ('serialize dict new pickle:', serialize_p2_dict),
        ('serialize json:', serialize_json),
    ]
    if cjson:
        benchmarks.append(('serialize cjson:', serialize_cjson))
    benchmarks.extend([
        ('unserialize flat:', unserialize_flat),
        ('unserialize eval:', unserialize_eval),
        ('unserialize defaultdict:', unserialize_defaultdict),
        ('unserialize dict:', unserialize_dict),
        ('unserialize defaultdict new pickle:', unserialize_p2_defaultdict),
        ('unserialize dict new pickle:', unserialize_p2_dict),
        ('unserialize json:', unserialize_json),
    ])
    if cjson:
        benchmarks.append(('unserialize cjson:', unserialize_cjson))

    for label, func in benchmarks:
        # A single pre-formatted argument prints the same text as the
        # original "print label, value" statement.
        print('%s %s' % (label, timeit(func, number=number)))
# Run the whole benchmark suite once per data set, announcing each run first.
for banner, default_map, plain_map in (
    ('testing with 10 keys...', dd1, dt1),
    ('testing with 100 keys...', dd2, dt2),
):
    print(banner)
    testdict(default_map, plain_map)