I have a few hundred large images (more than 100K x 200K pixels each). I am dividing each image into 256 x 256 patches and storing them all in an HDF5 file with the following structure:
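Each file ends up organised like this (this is exactly the layout the code below produces; the patient name is just an example):

patient_id/                          e.g. "patientA_01"
    wsi_<slide_index>/
        patch_0/
            image                    256 x 256 patch, gzip level 7
            label                    slide-level label
            attrs: patch_coords      coordinates of the patch within the slide
        patch_1/
            ...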
Here is my code to recreate this HDF5 structure:
import os
import h5py

def save_to_hdf5(slide_name, patches, coords, labels, db_name, db_location):
    with h5py.File(os.path.join(db_location, f'training{db_name}.h5'), 'a') as hf:
        base = os.path.basename(slide_name)
        # e.g. "patientA_01_x_3.tif" -> patient "patientA_01", slide index "3"
        name_parts = base.split('.')[0].split('_')
        patient_index = "_".join(name_parts[:2])
        slide_index = name_parts[3]
        slide_label = labels[base]

        grp = hf.require_group(patient_index)                # /<patient>
        subgrp = grp.require_group(f'wsi_{slide_index}')     # /<patient>/wsi_<n>
        for i, patch in enumerate(patches):
            subsubgrp = subgrp.require_group(f'patch_{i}')   # /<patient>/wsi_<n>/patch_<i>
            subsubgrp.create_dataset('image', data=patch,
                                     compression="gzip", compression_opts=7)  # also tried chunks=True
            subsubgrp.create_dataset('label', data=slide_label)
            subsubgrp.attrs["patch_coords"] = (coords[i][0], coords[i][1])
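For context, the function is called once per slide, roughly like this (the file name and the extract_patches helper are placeholders for my actual pipeline):

# Placeholder invocation; extract_patches and the paths below are illustrative only.
slide_path = '/data/slides/patientA_01_x_3.tif'
patches, coords = extract_patches(slide_path, size=256)   # hypothetical helper returning patches and their coordinates
labels = {'patientA_01_x_3.tif': 1}                        # slide-level labels keyed by file name
save_to_hdf5(slide_path, patches, coords, labels, db_name='_v1', db_location='/data/hdf5/')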
The problem: for some of the large images, the resulting HDF5 file is even larger than the original image itself. Am I doing something wrong in my group and dataset creation steps?
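In case it helps pinpoint the issue, here is a quick sketch of how I compare each dataset's on-disk size with its raw in-memory size (the file path is a placeholder):

import h5py

def report_storage(h5_path):
    # Print the compressed on-disk size vs. the raw in-memory size of every dataset.
    with h5py.File(h5_path, 'r') as hf:
        def visit(name, obj):
            if isinstance(obj, h5py.Dataset):
                stored = obj.id.get_storage_size()      # bytes actually allocated in the file
                raw = obj.size * obj.dtype.itemsize     # uncompressed size of the array
                print(f'{name}: {stored} B on disk vs {raw} B raw')
        hf.visititems(visit)

report_storage('/path/to/training_v1.h5')  # placeholder path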