I create an extendable EArray with N rows and 4 columns. Some columns require the float64 datatype; the others can be stored as int32. Is it possible to vary the data type among the columns? Right now I use a single type (float64, below) for all of them, but it takes a huge amount of disk space (the files are >10 GB).
For example, how can I ensure that the elements of columns 1-2 are int32 and the elements of columns 3-4 are float64?
import tables
f1 = tables.open_file("table.h5", "w")
# One atom applies to the whole array, so all four columns are stored with
# the same element type. Float64Atom is used because some columns need
# float64 precision; a float32 atom would lose precision for those columns.
# shape=(0, 4): the zero-length first dimension is the one that grows on append.
a = f1.create_earray(f1.root, "dataset_1", atom=tables.Float64Atom(), shape=(0, 4))
Here is a simplified version of how I am appending to the EArray:
# In-memory staging buffer: rows are collected here and written to the
# EArray in one large append instead of many small ones.
Matrix = np.ones(shape=(10**6, 4))

if counter <= 10**6:  # keep appending to Matrix until 10**6 rows
    # Copy the next slice of the input into the free part of the buffer.
    Matrix[s:s+length, 0:4] = chunk2[left:right]  # chunk2 is input np.ndarray
    s += length

# save to disk when rows = 10**6
if counter > 10**6:
    # Only the first s rows hold real data; the rest is still the np.ones fill.
    a.append(Matrix[:s])
    # Drop the old buffer before allocating the replacement.
    del Matrix
    Matrix = np.ones(shape=(10**6, 4))
    # NOTE(review): s and counter are not reset here; presumably the full
    # program resets them after a flush -- confirm against the real code.
What are the downsides of the following method, which stores the int32 columns and the float64 columns in two separate EArrays?
import tables as tb
import numpy as np
filename = 'foo.h5'

# array containing ints..in reality it will be 10**6x2
arr1 = np.array([[1, 1],
                 [2, 2],
                 [3, 3]], dtype=np.int32)
# array containing floats..in reality it will be 10**6x2
arr2 = np.array([[1.1, 1.2],
                 [1.1, 1.2],
                 [1.1, 1.2]], dtype=np.float64)

# The context manager closes the file even if one of the appends raises,
# so the handle is never leaked.
with tb.open_file(filename, mode='w') as f:
    # One extendable array per dtype: the int32 columns go to "col1" and the
    # float64 columns to "col2".
    int_app = f.create_earray(f.root, "col1", atom=tb.Int32Atom(),
                              shape=(0, 2), chunkshape=(3, 2))
    float_app = f.create_earray(f.root, "col2", atom=tb.Float64Atom(),
                                shape=(0, 2), chunkshape=(3, 2))

    # Each pass appends the same number of rows to both arrays; row i of
    # "col1" and row i of "col2" stay paired only because of that.
    for _ in range(3):
        int_app.append(arr1)
        float_app.append(arr2)
print('\n*********************************************************')
print("\t\t Reading Now=> ")
print('*********************************************************')

# Open the same path that was written (filename) instead of repeating the
# literal, and let the context manager close the read handle afterwards.
with tb.open_file(filename, mode='r') as c:
    chunks1 = c.root.col1
    chunks2 = c.root.col2
    # read() with no arguments pulls each array into memory in full.
    chunk1 = chunks1.read()
    chunk2 = chunks2.read()

print(chunk1)
print(chunk2)