Recently faced with such a problem: ValueError: array of size zero up to the maximum of the cast operation, having no identity.
import tensorflow as tf
import unicodedata
import string
import numpy as np
import re
import matplotlib.pyplot as plt
keras = tf.keras
class Lang(object):
def __init__(self, name):
self.name = name
self.word2int = {} #maps words to integers
self.word2count = {} #maps words to their total number in the corpus
self.int2word = {0 : "SOS", 1 : "EOS"} #maps integers to tokens (just the opposite of word2int but has some initial values. EOS means End of Sentence and it's a token used to indicate the end of a sentence. Every sentence is going to have an EOS token. SOS means Start of Sentence and is used to indicate the start of a sentence.)
self.n_words = 2 #Intial number of tokens (EOS and SOS)
def addWord(self, word):
if word not in self.word2int:
self.word2int[word] = self.n_words
self.word2count[word] = 1
self.int2word[self.n_words] = word
self.n_words += 1
else:
self.word2count[word] += 1
def addSentence(self, sentence):
for word in sentence.split(" "):
self.addWord(word)
def unicodeToAscii(s):
return "".join(c for c in unicodedata.normalize("NFD", s) \
if unicodedata.category(c) != "Mn")
def normalizeString(s):
s = unicodeToAscii(s.lower().strip())
s = re.sub(r"([!.?])", r" \1", s)
s = re.sub(r"[^a-zA-Z?.!]+", " ", s)
return s
def load_dataset():
with open("en_fr.txt",'r') as f:
lines = f.readlines()
pairs = [[normalizeString(pair) for pair in
line.strip().split('\t')] for line in lines]
return pairs
def sentencetoIndexes(sentence, lang):
indexes = [lang.word2int[word] for word in sentence.split()]
indexes.append(EOS_token)
return indexes
SOS_token = 0
EOS_token = 1
pairs = load_dataset()
MAX_LENGTH = 50
def sentencetoIndexes(sentence, lang):
indexes = [lang.word2int[word] for word in sentence.split()]
indexes.append(EOS_token)
return indexes
def filterPair(p):
try:
return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
except:
return False
def filterPairs(pairs):
return [pair for pair in range(len(pairs)) if filterPair(pair)]
pairs = filterPairs(pairs)
def build_lang(lang1, lang2, max_length=50):
input_lang = Lang(lang1)
output_lang = Lang(lang2)
input_seq = []
output_seq = []
for pair in pairs:
input_lang.addSentence(pair[1])
output_lang.addSentence(pair[0])
for pair in pairs:
input_seq.append(sentencetoIndexes(pair[1], input_lang))
output_seq.append(sentencetoIndexes(pair[0], output_lang))
return (
keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_length, padding='post', truncating='post'),
keras.preprocessing.sequence.pad_sequences(output_seq, padding='post', truncating='post'),
input_lang, output_lang
)
And when doing
input_tensor, output_tensor, input_lang, output_lang = build_lang('en', 'fra')
Gives this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-12-259feac15862> in <module>
----> 1 input_tensor, output_tensor, input_lang, output_lang = build_lang('en', 'fra')
<ipython-input-10-d20934657bc2> in build_lang(lang1, lang2, max_length)
12 output_seq.append(sentencetoIndexes(pair[0], output_lang))
13 return keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_length, padding='post',
---> 14 truncating='post'), keras.preprocessing.sequence.pad_sequences(output_seq, padding='post', truncating='post'), input_lang, output_lang
c:\users\zealottv\appdata\local\programs\python\python38\lib\site-packages\tensorflow\python\keras\preprocessing\sequence.py in pad_sequences(sequences, maxlen, dtype, padding, truncating, value)
154 or in case of invalid shape for a `sequences` entry.
155 """
--> 156 return sequence.pad_sequences(
157 sequences, maxlen=maxlen, dtype=dtype,
158 padding=padding, truncating=truncating, value=value)
c:\users\zealottv\appdata\local\programs\python\python38\lib\site-packages\keras_preprocessing\sequence.py in pad_sequences(sequences, maxlen, dtype, padding, truncating, value)
75
76 if maxlen is None:
---> 77 maxlen = np.max(lengths)
78
79 is_dtype_str = np.issubdtype(dtype, np.str_) or np.issubdtype(dtype, np.unicode_)
<__array_function__ internals> in amax(*args, **kwargs)
c:\users\zealottv\appdata\local\programs\python\python38\lib\site-packages\numpy\core\fromnumeric.py in amax(a, axis, out, keepdims, initial, where)
2665 5
2666 """
-> 2667 return _wrapreduction(a, np.maximum, 'max', axis, None, out,
2668 keepdims=keepdims, initial=initial, where=where)
2669
c:\users\zealottv\appdata\local\programs\python\python38\lib\site-packages\numpy\core\fromnumeric.py in _wrapreduction(obj, ufunc, method, axis, dtype, out, **kwargs)
88 return reduction(axis=axis, out=out, **passkwargs)
89
---> 90 return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
91
92
ValueError: zero-size array to reduction operation maximum which has no identity
The full code can be downloaded here.