I seem to be having some issues with creating a TensorFlow iterator object to feed into a deep learning model. I am still very new to TensorFlow in general, but I had a working example that I now can't seem to replicate.
The code fails when I attempt to iterate through the TensorFlow dataset I have created, using either iter(train_dataset) or for i in train_dataset: print(i) (repeated after the full code below). The shape and component that the error refers to is my mask layer. As you can see in the code below, I squeeze the array to remove the last dimension prior to one-hot encoding, then use tf.one_hot to transform the categorical data. Printing the shape before and after this step confirms that I have replaced the single multi-value mask dimension with a dimension containing 9 separate arrays.
I am attempting to train a semantic segmentation model using a multispectral (12-channel) image and a mask layer containing 9 values (labeled 0-8).
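To illustrate the mask transformation I'm describing, here is a minimal standalone sketch of the squeeze/one-hot step (the dummy_mask here is just a stand-in for one of my patches, not part of the actual pipeline):

import numpy as np
import tensorflow as tf

dummy_mask = np.random.randint(0, 9, size=(256, 256, 1), dtype=np.uint8)  # class values 0-8, one channel
mask = tf.squeeze(dummy_mask, axis=-1)         # drop the trailing channel dimension -> (256, 256)
mask = tf.one_hot(tf.cast(mask, tf.uint8), 9)  # one channel per class -> (256, 256, 9)
print(mask.shape)  # (256, 256, 9)

That per-patch shape matches the (None, 256, 256, 9) element_spec the dataset reports further down, which is partly why the error confuses me.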
I have looked at several shape-related TensorFlow tickets, such as here and here. The closest thing I can find to my issue is here; however, that refers to a later stage, when the model is actually being applied. I'm not that far through my script yet - I'm simply trying to read from the pipeline I've just created!
Code is:
import os
import cv2
import numpy as np
from matplotlib import pyplot as plt
import segmentation_models as sm
from tensorflow.keras.metrics import MeanIoU
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.models import load_model
import tifffile
from keras.utils import to_categorical
from sklearn.preprocessing import MinMaxScaler
import glob
import rasterio
from rasterio.plot import reshape_as_raster, reshape_as_image
import argparse
import time
import shutil
# create argument parser for easier training
parser = argparse.ArgumentParser(
    description="Prepare mask and image for tensorflow deep learning training."
)
parser.add_argument(
    "-t", "--tensorDir",
    help="Path of the tensorflow directory where prepare_imagery.py placed image/mask patches.",
    required=True, dest="tensorDir", metavar="DIRECTORY"
)
parser.add_argument(
    "-n", "--modelName",
    help="Name of dl model to create (.hdf5 extension required)",
    required=True, dest="modelName", metavar="'STRING.HDF5'"
)
args = parser.parse_args()
# assign and sort variables
root_directory = os.path.abspath(args.tensorDir)
os.chdir(root_directory)
model_name = args.modelName
train_img_dir = "./train_images/train/"
train_mask_dir = "./train_masks/train/"
val_img_dir = "./val_images/val/"
val_mask_dir = "./val_masks/val/"
train_images = glob.glob(train_img_dir + '*.tif')
train_masks = glob.glob(train_mask_dir + '*.tif')
val_images = glob.glob(val_img_dir + '*.tif')
val_masks = glob.glob(val_mask_dir + '*.tif')
train_images.sort()
train_masks.sort()
val_images.sort()
val_masks.sort()
num_images = len(os.listdir(train_img_dir))
print(f'Total number of images used to train model is: {num_images}')
print(f'Data type of image is: {rasterio.open(train_images[0]).dtypes}')
print(f'Data type of mask is: {rasterio.open(train_masks[0]).dtypes}')
seed = 24
batch_size = 16
n_classes = 9
scaler = MinMaxScaler()
BACKBONE = 'resnet34'
def read_image(path):
    x = rasterio.open(path).read()  # rasterio returns (bands, rows, cols)
    x = reshape_as_image(x)         # transpose to (rows, cols, bands)
    # x = x / 255.0 # normalise
    # x = x.astype(np.float32) # convert to float
    return x

def read_mask(path):
    x = rasterio.open(path).read()  # rasterio returns (bands, rows, cols)
    x = reshape_as_image(x)         # transpose to (rows, cols, bands)
    # x = x / 255.0 # normalise
    x = np.expand_dims(x, axis=-1)  # add channel size to shape of np array
    # x = x.astype(np.float32) # convert to float
    return x
preprocess_input = sm.get_preprocessing(BACKBONE)
def mask_to_categorical(image, mask, n_class):
    mask = tf.squeeze(mask, axis=-1)  # drop the trailing channel dimension
    # print(mask.shape)
    mask = tf.one_hot(tf.cast(mask, tf.uint8), n_class)  # one channel per class
    # print(mask.shape)
    mask = tf.cast(mask, tf.float32)
    return image, mask

def preprocess(x, y):
    def f(x, y):
        x = x.decode()  # paths arrive as byte strings inside numpy_function
        y = y.decode()
        x = read_image(x)
        y = read_mask(y)
        return x, y

    images, masks = tf.numpy_function(f, [x, y], [tf.uint16, tf.uint8])
    images.set_shape([256, 256, 12])
    masks.set_shape([256, 256, 1])
    images, masks = mask_to_categorical(images, masks, n_classes)
    return images, masks
def tf_dataset(x, y, batch):
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    dataset = dataset.shuffle(buffer_size=1000)
    dataset = dataset.map(preprocess)
    dataset = dataset.batch(batch)
    dataset = dataset.prefetch(2)
    dataset = dataset.repeat()
    return dataset
train_dataset = tf_dataset(train_images, train_masks, batch_size)
val_dataset = tf_dataset(val_images, val_masks, batch_size)
print(f'DATASET IS: {train_dataset}')
train_img_gen = iter(train_dataset)
val_img_gen = iter(val_dataset)
print(val_img_gen.element_spec)
x, y = train_img_gen.get_next()
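For completeness, this is the plain for-loop version mentioned at the top; it fails at the same point as the get_next() call above:

for i in train_dataset:
    print(i)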
Traceback message is:
2023-06-01 12:39:35.367510: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-01 12:39:35.416638: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-01 12:39:35.973884: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
Segmentation Models: using `keras` framework.
Total number of images used to train model is: 156
Data type of image is: ('uint16', 'uint16', 'uint16', 'uint16', 'uint16', 'uint16', 'uint16', 'uint16', 'uint16', 'uint16', 'uint16', 'uint16')
Data type of mask is: ('uint8',)
2023-06-01 12:39:38.371259: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-01 12:39:38.417293: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-01 12:39:38.417509: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-01 12:39:38.419055: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-01 12:39:38.419228: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-01 12:39:38.419356: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-01 12:39:38.924564: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-01 12:39:38.924756: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-01 12:39:38.924899: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-01 12:39:38.925012: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4069 MB memory: -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6
DATASET IS: <_RepeatDataset element_spec=(TensorSpec(shape=(None, 256, 256, 12), dtype=tf.uint16, name=None), TensorSpec(shape=(None, 256, 256, 9), dtype=tf.float32, name=None))>
2023-06-01 12:39:39.095075: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype string and shape [156]
[[{{node Placeholder/_1}}]]
2023-06-01 12:39:39.095393: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype string and shape [156]
[[{{node Placeholder/_1}}]]
2023-06-01 12:39:39.130062: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [52]
[[{{node Placeholder/_0}}]]
2023-06-01 12:39:39.130386: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype string and shape [52]
[[{{node Placeholder/_1}}]]
(TensorSpec(shape=(None, 256, 256, 12), dtype=tf.uint16, name=None), TensorSpec(shape=(None, 256, 256, 9), dtype=tf.float32, name=None))
Traceback (most recent call last):
File "/home/bwright/Documents/passive/lulc/scripts/train_model.py", line 155, in <module>
x, y = train_img_gen.get_next()
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/bwright/anaconda3/envs/tensorflow/lib/python3.11/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 850, in get_next
return self._next_internal()
^^^^^^^^^^^^^^^^^^^^^
File "/home/bwright/anaconda3/envs/tensorflow/lib/python3.11/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 780, in _next_internal
ret = gen_dataset_ops.iterator_get_next(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/bwright/anaconda3/envs/tensorflow/lib/python3.11/site-packages/tensorflow/python/ops/gen_dataset_ops.py", line 3016, in iterator_get_next
_ops.raise_from_not_ok_status(e, name)
File "/home/bwright/anaconda3/envs/tensorflow/lib/python3.11/site-packages/tensorflow/python/framework/ops.py", line 7262, in raise_from_not_ok_status
raise core._status_to_exception(e) from None # pylint: disable=protected-access
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tensorflow.python.framework.errors_impl.InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Incompatible shapes at component 1: expected [?,256,256,9] but got [16,256,256,1,9]. [Op:IteratorGetNext]
I realise there also seems to be a problem with the GPU, but one problem at a time!