
The code below is taken from https://github.com/arunarn2/HierarchicalAttentionNetworks/blob/master/HierarchicalAttn.py with a few minor tweaks. Although I understand what the error means, I am not able to figure out how it creeps into the following code or how to fix it. I have been stuck on this for quite some time and would really appreciate some help. Thanks!

(This is the entire code)

# imports (inferred from the code below; Keras 2.x with nltk and bs4 is assumed)
import io
import os
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk import tokenize

from keras import backend as K
from keras import initializers
from keras.engine.topology import Layer
from keras.layers import Bidirectional, Dense, Embedding, GRU, Input, TimeDistributed
from keras.models import Model
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import np_utils

maxlen = 100
max_sentences = 15
max_words = 20000
embedding_dim = 100
validation_split = 0.2
reviews = []
labels = []
texts = []
glove_dir = "./glove.6B"
embeddings_index = {}


# class defining the custom attention layer
class HierarchicalAttentionNetwork(Layer):
    def __init__(self, attention_dim):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim
        super(HierarchicalAttentionNetwork, self).__init__()

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init((self.attention_dim,)))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weights = [self.W, self.b, self.u]
        super(HierarchicalAttentionNetwork, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        # x: [batch_size, seq_len, attention_dim]
        # self.u: attention context vector of shape (attention_dim, 1)
        # uit = tanh(x.W + b)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))

        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting
            ait *= K.cast(mask, K.floatx())
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weighted_input = x * K.expand_dims(ait)
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]


def remove_html(str_a):
    p = re.compile(r'<.*?>')
    return p.sub('', str_a)


# remove all non-ASCII characters (everything outside \x00-\x7f)
def replace_non_ascii(str_a):
    return re.sub(r'[^\x00-\x7f]', r'', str_a)


# Tokenization/string cleaning for dataset
def clean_str(string):
    string = string.decode("utf-8")
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()



# `uploaded` comes from a prior files.upload() call (a Google Colab environment is assumed)
input_data = pd.read_csv(io.BytesIO(uploaded['labeledTrainData.tsv']), sep='\t')

for idx in range(input_data.review.shape[0]):
    text = BeautifulSoup(input_data.review[idx], features="html5lib")
    text = clean_str(text.get_text().encode('ascii', 'ignore'))
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)
    np.append(labels, input_data.sentiment[idx])

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

data = np.zeros((len(texts), max_sentences, maxlen), dtype='int32')

# encode each review as a (max_sentences, maxlen) matrix of word indices
for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j < max_sentences:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                if k < maxlen and tokenizer.word_index[word] < max_words:
                    data[i, j, k] = tokenizer.word_index[word]
                    k = k + 1

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

if np.any(np.array(labels)):
    labels = np_utils.to_categorical(np.array(labels))
#labels = to_categorical(np.asarray(labels))

print('Shape of reviews (data) tensor:', data.shape)
print('Shape of sentiment (label) tensor:', np.shape(labels))

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = np.asarray(labels)[indices.astype(int)]
#labels = labels[indices]
nb_validation_samples = int(validation_split * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Number of positive and negative reviews in training and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))


# load the pre-trained GloVe embeddings into a word -> vector dict
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

# building the Hierarchical Attention Network
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words without a GloVe vector keep their random initialization
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
                            input_length=maxlen, trainable=True, mask_zero=True)

# word-level encoder: BiGRU + attention over the words of one sentence
sentence_input = Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
lstm_word = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
attn_word = HierarchicalAttentionNetwork(100)(lstm_word)
sentenceEncoder = Model(sentence_input, attn_word)

# sentence-level encoder: BiGRU + attention over the sentences of one review
review_input = Input(shape=(max_sentences, maxlen), dtype='int32')
review_encoder = TimeDistributed(sentenceEncoder)(review_input)
lstm_sentence = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
attn_sentence = HierarchicalAttentionNetwork(100)(lstm_sentence)
preds = Dense(2, activation='softmax')(attn_sentence)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

print("model fitting - Hierachical attention network")
model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=10, batch_size=100)

The full error: Error Stacktrace

1 Answer


Python is complaining because you are trying to index into the labels array, but it is empty, as the console output shows:

Shape of sentiment (label) tensor: (0,)

The problem is in this line:

np.append(labels, input_data.sentiment[idx])

In the original code you cited, each new value is appended to the labels list with list.append, which modifies the list in place. By contrast, as the numpy documentation indicates when describing the value returned by np.append (where arr is the original array):

A copy of arr with values appended to axis. Note that append does not occur in-place: a new array is allocated and filled.

That is, your original labels list stays empty because the array returned by np.append is never assigned to anything, and this is what causes the error when the code later tries to index into it.
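
To see the difference in isolation, here is a minimal standalone snippet (plain NumPy, independent of the rest of your code):

import numpy as np

labels = []                    # an empty list, as in your code
np.append(labels, 1)           # returns a NEW array; here the return value is simply discarded
print(np.shape(labels))        # (0,)  -> labels is still empty

labels = np.append(labels, 1)  # assign the returned copy back
print(labels)                  # [1.]

labels = []
labels.append(1)               # list.append modifies the list in place
print(labels)                  # [1]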

If you want to achieve the same behavior with np.append, you need to assign the result back:

labels = np.append(labels, input_data.sentiment[idx])

Please be aware that this operation will be very inefficient for the reason explained above (a new array is allocated and copied on every iteration); it is much better to append the sentiment value directly to the original labels list, as the original code does:

labels.append(input_data.sentiment[idx])
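
Applied to your preprocessing loop, only the last line changes:

for idx in range(input_data.review.shape[0]):
    text = BeautifulSoup(input_data.review[idx], features="html5lib")
    text = clean_str(text.get_text().encode('ascii', 'ignore'))
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)
    labels.append(input_data.sentiment[idx])  # in-place append on the plain Python list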

Please see this related SO question as well.
