A computation process creates an arbitrary number of numpy arrays, and I want to save each array to the hard disk and load it back conveniently. Importantly, the arrays are high-dimensional and their shapes vary from array to array.
To save and load each array, I wrote the following interface:
import pickle
import numpy as np


class Filesaver:
    def __init__(self, filename: str):
        self._filename = filename
        try:
            # Count how many arrays are already stored in an existing file.
            with open(self._filename, 'rb') as f:
                self._pos = self._get_len(f)
        except FileNotFoundError:
            self._pos = 0

    def _get_len(self, file) -> int:
        """Count the pickled objects by reading until EOF."""
        pos = 0
        while True:
            try:
                pickle.load(file)
                pos += 1
            except EOFError:
                break
        return pos

    def save(self, data: np.ndarray) -> None:
        """Pickle the array and append it to the file on disk."""
        with open(self._filename, 'ab+') as f:
            pickle.dump(data, f)
        self._pos += 1

    def __getitem__(self, idx: int) -> np.ndarray:
        if idx >= self._pos:  # explicit check instead of assert
            raise IndexError("File is not long enough")
        with open(self._filename, 'rb') as f:
            # Pickles can only be read sequentially, so skip the first idx elements.
            tmp = 0
            while tmp <= idx:
                data = pickle.load(f)
                tmp += 1
        return data

    def __len__(self) -> int:
        return self._pos
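For reference, this is roughly how I use it (the filename "arrays.pkl" is just a placeholder):

    saver = Filesaver("arrays.pkl")
    saver.save(np.random.rand(3, 4, 5))   # shapes may differ between arrays
    saver.save(np.random.rand(7, 2))

    print(len(saver))        # 2
    print(saver[1].shape)    # (7, 2)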
The interface allows appending another array to the existing file and loading each element. However, the approach seems a bit clumsy to me: First, to load an array with the __getitem__ method, I have to cycle through the file and cannot jump to element idx directly. Second, the initial number of elements in the saved file is determined in the _get_len method by reading through the whole file until an error occurs (see the sketch below for one direction I considered).
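One idea that might avoid the per-access scan is to record the byte offset at which each pickled array starts, so that __getitem__ can seek directly to element idx. Here is a rough sketch of that variant; the class name IndexedFilesaver and the single start-up scan are my own additions, not part of the code above:

    import os
    import pickle
    import numpy as np


    class IndexedFilesaver:
        """Like Filesaver, but keeps a list of byte offsets for direct access."""

        def __init__(self, filename: str):
            self._filename = filename
            self._offsets = []                     # start position of each pickled array
            if os.path.exists(self._filename):
                with open(self._filename, 'rb') as f:
                    while True:
                        pos = f.tell()
                        try:
                            pickle.load(f)         # one sequential scan at start-up only
                        except EOFError:
                            break
                        self._offsets.append(pos)

        def save(self, data: np.ndarray) -> None:
            with open(self._filename, 'ab') as f:
                self._offsets.append(f.tell())     # in append mode the position is the end of file
                pickle.dump(data, f)

        def __getitem__(self, idx: int) -> np.ndarray:
            with open(self._filename, 'rb') as f:
                f.seek(self._offsets[idx])         # jump straight to element idx
                return pickle.load(f)

        def __len__(self) -> int:
            return len(self._offsets)

This removes the repeated cycling in __getitem__, and the remaining start-up scan could presumably be avoided as well by persisting the offset list in a small sidecar file, but it still feels like I am reinventing something that must already exist.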
Does anybody have a better idea for saving and loading an arbitrary number of (high-dimensional) numpy arrays to and from the hard disk?