Recursion to Find All Valid Paths to dataset(s)
The following code uses recursion to find valid data paths to all dataset(s). After getting the valid paths (terminating possible circular references after 3 repeats), I can then run a regular expression against the returned list (not shown).
import numpy as np
import h5py
import collections
import warnings
def _is_probable_cycle(path, max_repeats):
    """Return True when the last component of *path* recurs at a regular interval.

    The path is split on '/', and the positions of every component equal to
    the final one are collected. The path is treated as circular when that
    name occurs at least ``max_repeats`` times and the last two gaps between
    occurrences are equal (e.g. ``.../a/b/a/b/a``).
    """
    parts = path.split('/')
    last = parts[-1]
    positions = [idx for idx, part in enumerate(parts) if part == last]
    # Two gaps can only be compared once the name has occurred three times;
    # with fewer occurrences there is nothing periodic to detect.
    if len(positions) < max(max_repeats, 3):
        return False
    return positions[-1] - positions[-2] == positions[-2] - positions[-3]


def visit_data_sets(group, max_len_check=20, max_repeats=3):
    """Recursively collect the paths of all datasets reachable from *group*.

    Args:
        group: An ``h5py.Group`` (or ``h5py.File``) to walk.
        max_len_check: The circular-reference check only runs for groups
            whose name is longer than this many characters.
        max_repeats: Minimum number of times the group's last path component
            must occur in its name before the path can be considered
            circular.

    Returns:
        A list of path strings, one per dataset found. A dataset that is
        reachable through several links appears once per path.

    Note:
        The circular-reference heuristic terminates a branch once the last
        name repeats at a regular interval. It will incorrectly terminate a
        tree that legitimately uses an identical repetitive sequence of
        names.
    """
    name = group.name
    if len(name) > max_len_check and _is_probable_cycle(name, max_repeats):
        message = 'Terminating likely circular reference "{}"'.format(name)
        warnings.warn(message, UserWarning)
        return []

    # The root group's name is already '/'; strip it so that children are
    # reported as '/key' instead of '//key'.
    prefix = name.rstrip('/')
    dataset_list = []
    for key, value in group.items():
        if isinstance(value, h5py.Dataset):
            dataset_list.append('{}/{}'.format(prefix, key))
        elif isinstance(value, h5py.Group):
            # Pass the limits down so caller-supplied values apply to the
            # whole tree, not just the top-level call.
            dataset_list.extend(
                visit_data_sets(value, max_len_check, max_repeats)
            )
        else:
            print('Unhandled class name {}'.format(value.__class__.__name__))
    return dataset_list
def visit_callback(name, object):
    """Print one line describing a member handed to the callback.

    Args:
        name: The name string passed to the callback.
        object: The visited object; only its ``name`` attribute is read.
            (The parameter name shadows the builtin but is kept so the
            signature stays unchanged.)

    Returns:
        Always None.
    """
    line = 'Visiting name = "{}", object name = "{}"'.format(name, object.name)
    print(line)
    return None
# Demo: build a small HDF5 file containing a circular hard link, a hard-link
# alias and a soft-link alias to one dataset, then compare what h5py's
# "visititems" reports with what "visit_data_sets" finds.
# The context manager closes the file even if one of the steps below raises.
with h5py.File('link_test.hdf5', mode='w') as hdf_fptr:
    group1 = hdf_fptr.require_group('/junk/group1')
    group1a = hdf_fptr.require_group('/junk/group1/group1a')
    group2 = hdf_fptr.require_group('/junk/group2')
    group3 = hdf_fptr.require_group('/junk/group3')

    # create a circular reference: group1a/group1ai is a hard link back to
    # its own parent, group1
    group1ai = group1a['group1ai'] = group1

    avect = np.arange(0, 12.3, 1.0)
    dset = group1.create_dataset('avect', data=avect)

    # Hard link and soft link to the same dataset.
    group2['alias'] = dset
    group3['alias3'] = h5py.SoftLink(dset.name)

    print('\nThis demonstrates "h5py visititems" visiting Root with subgroups containing a Hard Link and Soft Link to "avect"')
    print('Visiting Root - {}'.format(hdf_fptr.name))
    hdf_fptr.visititems(visit_callback)

    print('\nThis demonstrates "h5py visititems" visiting "group2" with a Hard Link to "avect"')
    print('Visiting Group - {}'.format(group2.name))
    group2.visititems(visit_callback)

    print('\nThis demonstrates "h5py visititems" visiting "group3" with a Soft Link to "avect"')
    print('Visiting Group - {}'.format(group3.name))
    group3.visititems(visit_callback)

    print('\n\nNow demonstrate recursive visit of Root looking for datasets')
    print('using the function "visit_data_sets" in this code snippet.\n')
    data_paths = visit_data_sets(hdf_fptr)
    for data_path in data_paths:
        print('Data Path = "{}"'.format(data_path))
The following output shows how "visititems" works (or, for my purposes, fails to work) in identifying all valid paths, whereas the recursion meets my needs and possibly yours.
This demonstrates "h5py visititems" visiting Root with subgroups containing a Hard Link and Soft Link to "avect"
Visiting Root - /
Visiting name = "junk", object name = "/junk"
Visiting name = "junk/group1", object name = "/junk/group1"
Visiting name = "junk/group1/avect", object name = "/junk/group1/avect"
Visiting name = "junk/group1/group1a", object name = "/junk/group1/group1a"
Visiting name = "junk/group2", object name = "/junk/group2"
Visiting name = "junk/group3", object name = "/junk/group3"
This demonstrates "h5py visititems" visiting "group2" with a Hard Link to "avect"
Visiting Group - /junk/group2
Visiting name = "alias", object name = "/junk/group2/alias"
This demonstrates "h5py visititems" visiting "group3" with a Soft Link to "avect"
Visiting Group - /junk/group3
Now demonstrate recursive visit of Root looking for datasets
using the function "visit_data_sets" in this code snippet.
link_ref_test.py:26: UserWarning: Terminating likely circular reference "/junk/group1/group1a/group1ai/group1a/group1ai/group1a"
warnings.warn(message, UserWarning)
Data Path = "/junk/group1/avect"
Data Path = "/junk/group1/group1a/group1ai/avect"
Data Path = "/junk/group1/group1a/group1ai/group1a/group1ai/avect"
Data Path = "/junk/group2/alias"
Data Path = "/junk/group3/alias3"
The first "Data Path" result is the original dataset. The second and third are references to the original dataset caused by a circular reference. The fourth result is a Hard Link and the fifth is a Soft Link to the original dataset.