Assume we load images from disk with a tf.data dataset like this:
paths = tf.data.Dataset.list_files("/path/to/dataset/train-*.png")
images = paths.map(load_img_from_path, num_parallel_calls=tf.data.AUTOTUNE)
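(load_img_from_path is not shown above; a minimal sketch of what I mean, assuming the files are PNGs decoded with tf.io.decode_png and three channels:)

def load_img_from_path(path):
    # Read the raw bytes and decode the PNG; channels=3 is an assumption.
    img = tf.io.read_file(path)
    img = tf.io.decode_png(img, channels=3)
    return tf.cast(img, tf.float32)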
In this specific case: would we gain any speed benefit from using interleave() instead of map()?
P.S.: I asked a broader question here, but the question above was not answered there; more specifically, the existing answer only compared the scenario above with one that uses a TFRecord dataset.
In the example below, interleave() is actually significantly slower than map(), but I'm probably misunderstanding something...
import glob
import time

import numpy as np
import PIL.Image
import tensorflow as tf

PATHS_IMG = glob.glob(pattern)  # `pattern` is defined elsewhere, e.g. "/path/to/dataset/train-*.png"
BATCH_SIZE = 4


def main():
    measure_time(func_map)         # ~20s
    measure_time(func_interleave)  # ~60s
    measure_time(func_map)         # ~20s


def measure_time(func_parse):
    t0 = time.time()
    # Set up the dataset and iterate over it.
    ds = tf.data.Dataset.from_tensor_slices(PATHS_IMG)
    ds = func_parse(ds)
    ds = ds.batch(BATCH_SIZE)
    ds = ds.prefetch(1)  # same time measurements if we set ds.prefetch(4)
    for i, item in enumerate(ds):
        print(f"{i}/{len(PATHS_IMG)//BATCH_SIZE}: len(item)={len(item)} and item[0].shape={item[0].shape}")
    print(f"It took {time.time() - t0} seconds")


def func_map(ds):
    ds = ds.map(_parse_tf, num_parallel_calls=BATCH_SIZE)
    return ds


def func_interleave(ds):
    # Each path is wrapped in a single-element dataset, and those
    # datasets are then interleaved.
    ds = ds.interleave(
        lambda p: tf.data.Dataset.from_tensors(_parse_tf(p)),
        num_parallel_calls=BATCH_SIZE,
    )
    return ds


def _parse_tf(path):
    def _parse_np(path):
        img = PIL.Image.open(path)  # uint8
        img = np.array(img, dtype=np.float32)
        img = img[:512, :512, :]  # crop to a fixed size
        return img

    return tf.numpy_function(_parse_np, [path], tf.float32)


if __name__ == "__main__":
    main()
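For completeness, here is a sketch of the interleave() variant I have in mind when asking about tuning; the cycle_length, AUTOTUNE, and deterministic=False values are assumptions for illustration, not settings I have timed:

def func_interleave_tuned(ds):
    # Hypothetical variant: explicit cycle_length and relaxed ordering.
    ds = ds.interleave(
        lambda p: tf.data.Dataset.from_tensors(_parse_tf(p)),
        cycle_length=BATCH_SIZE,
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=False,  # allow elements to be produced out of order
    )
    return ds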