
In TensorFlow 1.12 there is the Dataset.zip function (documented here).

However, I was wondering if there is a dataset unzip function which will return the original two datasets.

# NOTE: The following examples use `{ ... }` to represent the
# contents of a dataset.
a = { 1, 2, 3 }
b = { 4, 5, 6 }
c = { (7, 8), (9, 10), (11, 12) }
d = { 13, 14 }

# The nested structure of the `datasets` argument determines the
# structure of elements in the resulting dataset.
Dataset.zip((a, b)) == { (1, 4), (2, 5), (3, 6) }
Dataset.zip((b, a)) == { (4, 1), (5, 2), (6, 3) }

# The `datasets` argument may contain an arbitrary number of
# datasets.
Dataset.zip((a, b, c)) == { (1, 4, (7, 8)),
                            (2, 5, (9, 10)),
                            (3, 6, (11, 12)) }

# The number of elements in the resulting dataset is the same as
# the size of the smallest dataset in `datasets`.
Dataset.zip((a, d)) == { (1, 13), (2, 14) }

I would like to have the following:

dataset = Dataset.zip((a, d))  # == { (1, 13), (2, 14) }
a, d = dataset.unzip()
Ouwen Huang

4 Answers


My workaround was to just use map. I'm not sure whether there might be interest in a syntactic-sugar function for unzip later, though.

a = dataset.map(lambda a, b: a)
b = dataset.map(lambda a, b: b)
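
For example, here is a minimal runnable sketch of this workaround (assuming TF 2.x eager execution; the datasets mirror the question's a and d):

import tensorflow as tf

# Example datasets mirroring the question's `a` and `d`.
a = tf.data.Dataset.from_tensor_slices([1, 2, 3])
d = tf.data.Dataset.from_tensor_slices([13, 14])
dataset = tf.data.Dataset.zip((a, d))  # { (1, 13), (2, 14) }

# "Unzip" by projecting out each component of the tuple.
first = dataset.map(lambda a, b: a)
second = dataset.map(lambda a, b: b)

print(list(first.as_numpy_iterator()))   # [1, 2]
print(list(second.as_numpy_iterator()))  # [13, 14]

Note that each projected dataset re-executes the upstream pipeline, so the source data is traversed once per map()-derived dataset.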
Ouwen Huang

TensorFlow's get_single_element() is finally available, and it can be used to unzip datasets (as asked in the question above).

This avoids the need to generate and use an iterator via .map() or iter() (which could be costly for big datasets).

get_single_element() returns a tensor (or a tuple or dict of tensors) encapsulating all the members of the dataset. We need to pass all the members of the dataset batched into a single element.

This can be used to get features as a tensor array, or features and labels as a tuple or dictionary (of tensor arrays), depending on how the original dataset was created.

import tensorflow as tf

a = [ 1, 2, 3 ]
b = [ 4, 5, 6 ]
c = [ (7, 8), (9, 10), (11, 12) ]
d = [ 13, 14 ]
# Creating datasets from lists
ads = tf.data.Dataset.from_tensor_slices(a)
bds = tf.data.Dataset.from_tensor_slices(b)
cds = tf.data.Dataset.from_tensor_slices(c)
dds = tf.data.Dataset.from_tensor_slices(d)

list(tf.data.Dataset.zip((ads, bds)).as_numpy_iterator()) == [ (1, 4), (2, 5), (3, 6) ] # True
list(tf.data.Dataset.zip((bds, ads)).as_numpy_iterator()) == [ (4, 1), (5, 2), (6, 3) ] # True

# Let's zip and unzip ads and dds
x = tf.data.Dataset.zip((ads, dds))
xa, xd = tf.data.Dataset.get_single_element(x.batch(len(x)))
xa = list(xa.numpy())
xd = list(xd.numpy())
print(xa, xd) # [1, 2] [13, 14]; note that xa now differs from a because ads was truncated to the length of dds when zipped above.
d == xd # True
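
One caveat (my note, not part of the original answer): len() on a tf.data.Dataset works only in TF 2.3+ and only when the cardinality is known and finite; otherwise it raises a TypeError. For a pipeline with unknown cardinality (e.g. after filter()), one workaround is to count the elements first:

# Count the elements explicitly, then batch them into a single element.
n = int(x.reduce(tf.constant(0, tf.int64), lambda acc, _: acc + 1))
xa, xd = tf.data.Dataset.get_single_element(x.batch(n))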
manisar

Building on Ouwen Huang's answer, this function seems to work for arbitrary datasets whose elements are dictionaries:

def split_datasets(dataset):
    """Split a dataset whose elements are dicts into one dataset per key."""
    tensors = {}
    names = list(dataset.element_spec.keys())
    for name in names:
        # Bind `name` as a default argument so each lambda keeps its own key.
        tensors[name] = dataset.map(lambda x, name=name: x[name])

    return tensors
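
For example (a quick sketch; the dict-structured dataset below is made up for illustration):

import tensorflow as tf

# A hypothetical dataset whose elements are dictionaries.
ds = tf.data.Dataset.from_tensor_slices({
    "features": [[1.0, 2.0], [3.0, 4.0]],
    "labels": [0, 1],
})

parts = split_datasets(ds)
print(list(parts["labels"].as_numpy_iterator()))    # [0, 1]
print(list(parts["features"].as_numpy_iterator()))  # [array([1., 2.], dtype=float32), array([3., 4.], dtype=float32)]

Note that this relies on dataset.element_spec being a dict; a tuple-structured dataset has no .keys() and needs the positional map() approach from Ouwen Huang's answer instead.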
markemus

I have written a more general unzip function for tf.data.Dataset pipelines, which also handles the recursive case where a pipeline has multiple levels of zipping.

import tensorflow as tf


def tfdata_unzip(
    tfdata: tf.data.Dataset,
    *,
    recursive: bool=False,
    eager_numpy: bool=False,
    num_parallel_calls: int=tf.data.AUTOTUNE,
):
    """
    Unzip a zipped tf.data pipeline.

    Args:
        tfdata: the :py:class:`tf.data.Dataset`
            to unzip.

        recursive: Set to ``True`` to recursively unzip
            multiple layers of zipped pipelines.
            Defaults to ``False``.

        eager_numpy: Set this to ``True`` to return
            Python lists of primitive types or
            :py:class:`numpy.array` objects. Defaults
            to ``False``.

        num_parallel_calls: The level of parallelism to
            each time we ``map()`` over a
            :py:class:`tf.data.Dataset`.

    Returns:
        A :py:class:`tf.data.Dataset`, or a (possibly
        nested) Python list of :py:class:`tf.data.Dataset`
        objects or NumPy arrays.
    """
    if isinstance(tfdata.element_spec, tf.TensorSpec):
        if eager_numpy:
            return list(tfdata.as_numpy_iterator())
        return tfdata

    def tfdata_map(i: int) -> tf.data.Dataset:
        return tfdata.map(
            lambda *cols: cols[i],
            deterministic=True,
            num_parallel_calls=num_parallel_calls,
        )

    if isinstance(tfdata.element_spec, tuple):
        num_columns = len(tfdata.element_spec)
        if recursive:
            return [
                tfdata_unzip(
                    tfdata_map(i),
                    recursive=recursive,
                    eager_numpy=eager_numpy,
                    num_parallel_calls=num_parallel_calls,
                )
                for i in range(num_columns)
            ]
        else:
            return [
                tfdata_map(i)
                for i in range(num_columns)
            ]

    raise ValueError(
        "Unknown tf.data.Dataset element_spec: " +
        str(tfdata.element_spec)
    )

Here is how tfdata_unzip() works, given these example datasets:

>>> import numpy as np

>>> baby = tf.data.Dataset.from_tensor_slices([
    np.array([1,2]),
    np.array([3,4]),
    np.array([5,6]),
])
>>> baby.element_spec
TensorSpec(shape=(2,), dtype=tf.int64, name=None)

>>> parent = tf.data.Dataset.zip((baby, baby))
>>> parent.element_spec
(TensorSpec(shape=(2,), dtype=tf.int64, name=None),
 TensorSpec(shape=(2,), dtype=tf.int64, name=None))

>>> grandparent = tf.data.Dataset.zip((parent, parent))
>>> grandparent.element_spec
((TensorSpec(shape=(2,), dtype=tf.int64, name=None),
  TensorSpec(shape=(2,), dtype=tf.int64, name=None)),
 (TensorSpec(shape=(2,), dtype=tf.int64, name=None),
  TensorSpec(shape=(2,), dtype=tf.int64, name=None)))

This is what tfdata_unzip() returns on the above baby, parent, and grandparent datasets:

>>> tfdata_unzip(baby)
<TensorSliceDataset shapes: (2,), types: tf.int64>

>>> tfdata_unzip(parent)
[<ParallelMapDataset shapes: (2,), types: tf.int64>,
 <ParallelMapDataset shapes: (2,), types: tf.int64>]

>>> tfdata_unzip(grandparent)
[<ParallelMapDataset shapes: ((2,), (2,)), types: (tf.int64, tf.int64)>,
 <ParallelMapDataset shapes: ((2,), (2,)), types: (tf.int64, tf.int64)>]

>>> tfdata_unzip(grandparent, recursive=True)
[[<ParallelMapDataset shapes: (2,), types: tf.int64>,
  <ParallelMapDataset shapes: (2,), types: tf.int64>],
 [<ParallelMapDataset shapes: (2,), types: tf.int64>,
  <ParallelMapDataset shapes: (2,), types: tf.int64>]]

>>> tfdata_unzip(grandparent, recursive=True, eager_numpy=True)
[[[array([1, 2]), array([3, 4]), array([5, 6])],
  [array([1, 2]), array([3, 4]), array([5, 6])]],
 [[array([1, 2]), array([3, 4]), array([5, 6])],
  [array([1, 2]), array([3, 4]), array([5, 6])]]]
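
Note that tfdata_unzip() as written handles only tuple and single-tensor element specs, so a dataset with dict elements falls through to the ValueError. A hedged sketch of an extra, non-recursive branch for dicts (my addition, untested):

    if isinstance(tfdata.element_spec, dict):
        # Bind `key` as a default argument so each lambda keeps its own key.
        return {
            key: tfdata.map(
                lambda row, key=key: row[key],
                deterministic=True,
                num_parallel_calls=num_parallel_calls,
            )
            for key in tfdata.element_spec
        }

Recursion and eager_numpy handling could be layered on in the same way as the tuple branch.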

James Mishra