Suppose, my data is as follows (we have 90041 rows of data here):
2.268 7.042 5.781 5.399 5.373 5.423 -9.118 5.488 5.166 4.852 7.470 6.452 6.069 0 0 0 1 0 1 1 3 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2.101 5.781 5.399 5.373 5.423 5.247 5.488 5.166 4.852 5.164 6.452 6.069 6.197 0 1 1 3 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2.222 5.399 5.373 5.423 5.247 5.485 5.166 4.852 5.164 4.943 6.069 6.197 6.434 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2.416 5.373 5.423 5.247 5.485 6.675 4.852 5.164 4.943 8.103 6.197 6.434 8.264 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3.028 5.423 5.247 5.485 6.675 6.372 5.164 4.943 8.103 -9.152 6.434 8.264 9.047 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-1.235 5.247 5.485 6.675 6.372 5.669 4.943 8.103 -9.152 -8.536 8.264 9.047 11.954 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-0.953 5.485 6.675 6.372 5.669 5.304 8.103 -9.152 -8.536 5.433 9.047 11.954 6.703 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2.233 6.675 6.372 5.669 5.304 5.461 -9.152 -8.536 5.433 4.924 11.954 6.703 6.407 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2.313 6.372 5.669 5.304 5.461 5.265 -8.536 5.433 4.924 5.007 6.703 6.407 6.088 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
2.314 5.669 5.304 5.461 5.265 5.379 5.433 4.924 5.007 5.057 6.407 6.088 6.410 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
... ... ...
... ... ...
Now, let us look into the following source code:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import sys, random
import time
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import numpy as np
def load_data_func(fname: str, yyy_index: int, **selection):
i = 0
file = open(fname)
if "top_n_lines" in selection:
lines = [next(file) for _ in range(int(selection["top_n_lines"]))]
elif "random_n_lines" in selection:
tmp_lines = file.readlines()
lines = random.sample(tmp_lines, int(selection["random_n_lines"]))
else:
lines = file.readlines()
data_x, data_y = [], []
for l in lines:
row = l.strip().split() # return a list of words from the line.
x = [float(ix) for ix in row[yyy_index+1:]] # convert 3rd to 20th word into a vector of float numbers.
y = float(row[yyy_index]) # select the 7th column.
data_x.append(x) # append the vector into 'data_x'
data_y.append(y) # append the vector into 'data_y'
# END for l in lines
num_rows = len(data_x)
print("row size = ", len(data_x[0]))
given_fraction = selection.get("validation_part", 1.0)
if given_fraction > 0.9999:
valid_x, valid_y = data_x, data_y
else:
n = int(num_rows * given_fraction)
data_x, data_y = data_x[n:], data_y[n:]
valid_x, valid_y = data_x[:n], data_y[:n]
# END of if-else block
print("size of x = ", len(data_x))
print("size of y = ", len(data_y))
tx = tf.convert_to_tensor(data_x, dtype=tf.float32)
ty = tf.convert_to_tensor(data_y, dtype=tf.float32)
vx = tf.convert_to_tensor(valid_x, dtype=tf.float32)
vy = tf.convert_to_tensor(valid_y, dtype=tf.float32)
return tx, ty, vx, vy
# END of the function
When I call it like the following:
train_x, train_y, validate_x, validate_y = \
load_data_func(
fname="data_file.dat",
yyy_index=6,
random_n_lines=90000,
validation_part=0.2
)
print("row count", len(train_x))
print("col count", len(train_x[0]))
I get the following error:
my_user@my_remote_server:~/my_project_dir$ python3 load_data_test.py
row size = 40
size of x = 72000
size of y = 72000
Traceback (most recent call last):
File "load_data_test.py", line 74, in <module>
validation_part=0.2
File "load_data_test.py", line 58, in load_data_func
tx = tf.convert_to_tensor(data_x, dtype=tf.float32)
File "/usr/local/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py", line 206, in wrapper
return target(*args, **kwargs)
File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 1431, in convert_to_tensor_v2_with_dispatch
value, dtype=dtype, dtype_hint=dtype_hint, name=name)
File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 1441, in convert_to_tensor_v2
as_ref=False)
File "/usr/local/lib/python3.7/site-packages/tensorflow/python/profiler/trace.py", line 163, in wrapped
return func(*args, **kwargs)
File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 1566, in convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py", line 339, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py", line 265, in constant
allow_broadcast=True)
File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py", line 276, in _constant_impl
return _constant_eager_impl(ctx, value, dtype, shape, verify_shape)
File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py", line 301, in _constant_eager_impl
t = convert_to_eager_tensor(value, ctx, dtype)
File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py", line 98, in convert_to_eager_tensor
return ops.EagerTensor(value, ctx.device_name, dtype)
ValueError: Can't convert non-rectangular Python sequence to Tensor.
my_user@my_remote_server:~/my_project_dir$
Interestingly, the following calls do not show any error:
train_x, train_y, validate_x, validate_y = \
load_data_func(
fname="data_file.dat",
yyy_index=6,
top_n_lines=90000, #<============
validation_part=0.2
)
train_x, train_y, validate_x, validate_y = \
load_data_k(
fname="data_file.dat",
yyy_index=6,
random_n_lines=60000, #<=============
validation_part=0.2
)
What am I doing wrong?