
Assume we load images from disk using a TensorFlow dataset like this:

paths = tf.data.Dataset.list_files("/path/to/dataset/train-*.png")
images = paths.map(load_img_from_path, num_parallel_calls=tf.data.AUTOTUNE)
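
(Here load_img_from_path stands in for whatever decoding function is used; a minimal sketch, assuming PNG inputs:)

def load_img_from_path(path):
    # Illustrative loader: read one file and decode it as a PNG tensor.
    raw = tf.io.read_file(path)
    return tf.image.decode_png(raw, channels=3)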

In this specific case: would we gain any benefit (speed) from using interleave() instead of map()?
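
(For clarity, the interleave() counterpart I have in mind mirrors the benchmark further down; from_tensors() wraps each parsed image in a single-element dataset that interleave() then flattens back into one stream:)

images = paths.interleave(
    lambda p: tf.data.Dataset.from_tensors(load_img_from_path(p)),
    num_parallel_calls=tf.data.AUTOTUNE,
)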


P.S.: I asked a broader question here, but the question above was not answered there. More specifically, the existing answer only compared the scenario above with another scenario using a TFRecord dataset.


In the example below, interleave() is actually significantly slower than map(), but I'm probably misunderstanding something...

import glob
import time

import numpy as np
import PIL.Image
import tensorflow as tf

PATHS_IMG = glob.glob(pattern)  # pattern as above, e.g. "/path/to/dataset/train-*.png"
BATCH_SIZE = 4

def main():
    measure_time(func_map)  # ~20s
    measure_time(func_interleave)  # ~60s
    measure_time(func_map)  # ~20s


def measure_time(func_parse):
    t0 = time.time()

    # setup dataset and iterate over it
    ds = tf.data.Dataset.from_tensor_slices(PATHS_IMG)
    ds = func_parse(ds)
    ds = ds.batch(BATCH_SIZE)
    ds = ds.prefetch(1)  # same time measurements if we set ds.prefetch(4)
    for i, item in enumerate(ds):
        print(f"{i}/{len(PATHS_IMG)//BATCH_SIZE}: len(item)={len(item)} and item[0].shape={item[0].shape}")

    # report elapsed time
    print(f"It took {time.time() - t0} seconds")

def func_map(ds):
    # One-to-one transformation: parse each path into an image tensor.
    ds = ds.map(_parse_tf, num_parallel_calls=BATCH_SIZE)
    return ds

def func_interleave(ds):
    # Each path is wrapped in a single-element dataset, which
    # interleave() then flattens back into one stream.
    ds = ds.interleave(
        lambda p: tf.data.Dataset.from_tensors(_parse_tf(p)),
        num_parallel_calls=BATCH_SIZE,
    )
    return ds


def _parse_tf(path):
    def _parse_np(path):
        img = PIL.Image.open(path)  # uint8
        img = np.array(img, dtype=np.float32)
        img = img[:512, :512, :]  # crop to a fixed size
        return img

    # Wrap the Python/NumPy loader so it can run inside the tf.data pipeline.
    return tf.numpy_function(_parse_np, [path], tf.float32)
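
For context, the case where interleave() is usually recommended is when each input element expands into a whole sub-dataset, e.g. reading several TFRecord shards concurrently (the scenario the existing answer compared against). A minimal sketch, assuming ds_filenames is a dataset of shard paths:

def func_interleave_tfrecords(ds_filenames):
    # Each filename yields a sub-dataset of many records;
    # interleave() cycles through up to 4 files concurrently.
    return ds_filenames.interleave(
        tf.data.TFRecordDataset,
        cycle_length=4,
        num_parallel_calls=tf.data.AUTOTUNE,
    )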
