I have a large list of images stored as numpy arrays. The images have different sizes, e.g.:
import numpy as np
from numpy.random import rand
data = [rand(100, 200), rand(1024, 768)]
I am looking for a way to store this list of arrays so that it can be read back quickly (writing the data is allowed to be slow). I tried pickle and numpy.savez, but reading the data back was slower than loading the raw images again.
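The pickle attempt was roughly along these lines (a sketch; the filename is arbitrary):

import pickle

# Dump the whole list into one file, then read it back.
with open('images.pkl', 'wb') as f:
    pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('images.pkl', 'rb') as f:
    loaded = pickle.load(f)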
I think HDF5 may be fast, but I cannot figure out how to store a list like this. Not mandatory, but useful, would be a data format that allows appending, so that the whole list does not have to be in memory at once.
Edit: Based on the answers so far, I timed some of the suggestions, using 100 images as test data:

data = [rand(1024, 768) for i in range(100)]
def timenp():
    # Write all arrays into a single .npz archive, then read them back.
    np.savez("test.npz", *data)
    d = np.load("test.npz")
    loaded = [d[f] for f in d]
def timebinary():
    with open("tmp.bin", "wb") as f:
        # Write the number of arrays first, then each array in turn;
        # np.save appends consecutive arrays to the same file handle.
        np.save(f, len(data))
        for img in data:
            np.save(f, img)
    with open("tmp.bin", "rb") as f:
        n = int(np.load(f))
        loaded = []
        for i in range(n):
            loaded.append(np.load(f))
import h5py

def timeh5py():
    with h5py.File('foo.hdf5', 'w') as f:
        # HDF5 datasets are rectangular, so store each image as a
        # variable-length flat vector plus its original shape.
        dt = h5py.special_dtype(vlen=np.dtype('float32'))
        dset = f.create_dataset('data', (len(data),), dtype=dt)
        shapes = f.create_dataset('shapes', (len(data), 2), dtype='int32')
        dset[...] = [img.flatten() for img in data]
        shapes[...] = [img.shape for img in data]
    with h5py.File('foo.hdf5', 'r') as f:
        loaded = []
        for img, shape in zip(f['data'], f['shapes']):
            loaded.append(np.reshape(img, shape))
python -m cProfile timenp.py
452906 function calls (451141 primitive calls) in 9.256 seconds
python -m cProfile timebinary.py
73085 function calls (71340 primitive calls) in 4.945 seconds
python -m cProfile timeh5py.py
33151 function calls (32568 primitive calls) in 4.384 seconds
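For the append requirement, the h5py variant could presumably be extended with resizable datasets; a rough sketch (not timed), assuming the same 'data'/'shapes' layout as in timeh5py above:

def append_image(filename, img):
    # Sketch: grow the variable-length 'data' and 'shapes' datasets by one
    # entry per image, so the whole list never has to be in memory.
    with h5py.File(filename, 'a') as f:
        if 'data' not in f:
            dt = h5py.special_dtype(vlen=np.dtype('float32'))
            f.create_dataset('data', (0,), maxshape=(None,), dtype=dt)
            f.create_dataset('shapes', (0, 2), maxshape=(None, 2), dtype='int32')
        dset, shapes = f['data'], f['shapes']
        n = dset.shape[0]
        dset.resize((n + 1,))
        shapes.resize((n + 1, 2))
        dset[n] = img.astype('float32').ravel()
        shapes[n] = img.shape

Reading back would then reshape each flat vector with its stored shape, exactly as in timeh5py.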