
I've been working on this project for a while and I'm having trouble analyzing the clusters. Basically, I read some data from a CSV file with about 4000 records using pandas' read_csv() (an Excel file exported as CSV), then I clean the extracted text by removing punctuation, tokenizing and stemming. In the next step I build the TF-IDF matrix and, using k-means, I create the clusters.

I've used the following libraries, with Python 3:

word_tokenize, SnowballStemmer, TfidfVectorizer, cosine_similarity, KMeans, MDS.
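
Condensed, the pipeline looks roughly like this (a sketch that assumes the same file name, column names and ';' delimiter used in the full code further down; the custom tokenizer/stemmer is omitted for brevity):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Read the two relevant columns from the exported CSV
df = pd.read_csv('./llamadas.csv', usecols=['motivo', 'respuesta'], delimiter=';')

# One document per record: the question text followed by its answer
documents = (df['motivo'].astype(str) + ' ' + df['respuesta'].astype(str)).tolist()

# Vectorize and cluster
tfidf_matrix = TfidfVectorizer(max_df=0.8, min_df=0.2, use_idf=True).fit_transform(documents)
km = KMeans(n_clusters=100).fit(tfidf_matrix)
clusters = km.labels_.tolist()

The full code is below: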

from __future__ import print_function
import os
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
from string import punctuation
import numpy as np


# Build a corpus of documents from a path (PATH) where several documents are located.
# Generates a list with the text of each document together with the name of its source file

def CrearCorpus(path):

    df = pd.read_csv(path, usecols=['motivo', 'respuesta'], delimiter=';')  # use the path passed in (same file)

    corpus = []

    for i in range(1, 4050):
        problema = str(df['motivo'][i])
        solucion = str(df['respuesta'][i])
        problema_final = problema + ' ' + solucion
        corpus.append([problema_final, 'document ' + str(i + 1)])
    return (corpus)

# Eliminar "stopwords" de un texto

def _RemoveStopwords(sentence):
    word_tokens = word_tokenize(sentence)
    stop_words = set(stopwords.words('spanish'))
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = ""
    for w in word_tokens:
       if w not in stop_words:
           filtered_sentence = filtered_sentence + " "+w
    return(filtered_sentence)

# Read each of the documents from the corpus and generate the texts and their identifiers (titles)

def read_documents(path):
    corpus = CrearCorpus(path)
    documents = []
    titles = []
    for c in range(len(corpus)):
        (doc, fn) = corpus[c]
        titles.append(fn)
        documents.append(doc)
    return ((documents, titles))


# Remove punctuation from the documents

def removePuntuaction(documents):
    translator = str.maketrans('', '', punctuation)
    for i in range(len(documents)):
        documents[i] = documents[i].translate(translator)
    return (documents)


# Stem a Spanish text (Snowball stemmer)

def Stemmer(text):
    stemmer = SnowballStemmer('spanish')
    words_stem = stemmer.stem(text)
    return (words_stem)


# Tokenize and stem a text

def tokenize_and_stem(textdata):
    text = word_tokenize(textdata)
    lista = []
    for elem in text:
        word = elem.lower()
        nuevo = Stemmer(word)
        lista.append(nuevo)
    return (lista)


# Tokenize a text

def tokenize_only(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = tokens
    return filtered_tokens


# Build a vocabulary of terms from the document corpus
# A table (FRAME) is created that represents a "vocabulary" of the words of each document

def crear_vocabulario(documents):
    # build two lists, one stemmed and one with the raw tokens
    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for i in documents:
        allwords_stemmed = tokenize_and_stem(i)
        totalvocab_stemmed.extend(allwords_stemmed)
        allwords_tokenized = tokenize_only(i)
        totalvocab_tokenized.extend(allwords_tokenized)
    vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
    return (vocab_frame)


# Build a tf-idf matrix from the tokenized and stemmed texts
# (uses the global list `documents` created in the main section)

def crear_matriz_tfidf():
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                       min_df=0.2,
                                       use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    terms = tfidf_vectorizer.get_feature_names()
    return ((tfidf_matrix, terms))


# Perform K-means clustering on the tf-idf matrix (returns the clusters)

def clustering(tfidf_matrix, num_clusters):
    km = KMeans(n_clusters=num_clusters)  # create the KMeans estimator
    km.fit(tfidf_matrix)  # run K-means itself
    clusters = km.labels_.tolist()

    return ((clusters, km))


# Show statistics for the generated clusters and objects (documents and words)

def cluster_stats(clusters, titles, km, tfidf_matrix):

    # re-open the csv file to read the 'Plataforma' column
    df = pd.read_csv('./llamadas.csv', usecols=['Plataforma'], delimiter=';')

    films = {'title': titles, 'documents': documents, 'cluster': clusters}
    frame = pd.DataFrame(films, index=[clusters], columns=['title', 'cluster'])
    frame['cluster'].value_counts()  # number of docs per cluster
    print("Top terminos por cluster:")
    print()
    # Sort cluster centers according to proximity to the centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    # analyze the clusters one by one
    for i in range(num_clusters):

        print("Palabras de Cluster %d:" % i, end='')
        for ind in order_centroids[i, :5]:  # top 5 terms per cluster (adjust the slice for more)
            print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'),
                  end=',')
        print()
        print()

        # list used to track the row numbers of the documents that belong to this cluster
        list_row = []

        print("IDs de cluster %d:" % i, end='')
        for title in frame.loc[i]['title'].values.tolist():
            print(' %s,' % title, end='')

        # extract the row number of each document to determine which platform it belongs to
        count_snd = 0
        count_mateonet = 0
        print()
        numbers_rows = frame.loc[i]['title'].values.tolist()
        for index in range(0, len(numbers_rows)):
            row = numbers_rows[index].strip('document ')
            list_row.append(int(row))

        # count how many items belong to each platform
        for cols in range(0, len(list_row)):
            value_platform = df['Plataforma'][list_row[cols]]
            if value_platform == 'SND ':
                count_snd = count_snd + 1
            if value_platform == 'Mateonet ':
                count_mateonet = count_mateonet + 1

        # totals per platform
        print()
        print('Plataforma SND: ' + str(count_snd))
        print()
        print('Plataforma Mateonet: ' + str(count_mateonet))
        print()
        print('Cantidad de preguntas: ' + str(len(frame.loc[i]['title'].values.tolist())))
        print()
        print()

#MAIN

# Set this variable to the location of the input CSV file
PATH = "./llamadas.csv"

num_clusters = 100  # number of clusters for K-means

(documents, titles) = read_documents(PATH)
documents = removePuntuaction(documents)
vocab_frame = crear_vocabulario(documents)

print('Existen ' + str(vocab_frame.shape[0]) + ' itemes en vocab_frame')

(tfidf_matrix, terms) = crear_matriz_tfidf()
(clusters, km) = clustering(tfidf_matrix, num_clusters)
cluster_stats(clusters, titles, km, tfidf_matrix)

https://github.com/felipefuller/faq/blob/master/data_analysis.py

I can create up to 167 clusters, but when I increase the number to 168 or more it raises the following error:

Traceback (most recent call last):
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2657, in get_loc
    return self._engine.get_loc(key)
  File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 987, in pandas._libs.hashtable.Int64HashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 993, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 167

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/felipefuller/dev/faq/cluster.py", line 324, in <module>
    cluster_stats(clusters, titles, km, tfidf_matrix)
  File "/Users/felipefuller/dev/faq/cluster.py", line 187, in cluster_stats
    if (len(frame.loc[i]['title'].values.tolist()) >= 250):
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexing.py", line 1500, in __getitem__
    return self._getitem_axis(maybe_callable, axis=axis)
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexing.py", line 1913, in _getitem_axis
    return self._get_label(key, axis=axis)
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexing.py", line 141, in _get_label
    return self.obj._xs(label, axis=axis)
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/generic.py", line 3583, in xs
    drop_level=drop_level)
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/multi.py", line 2571, in get_loc_level
    indexer = self._get_level_indexer(key, level=level)
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/multi.py", line 2652, in _get_level_indexer
    code = level_index.get_loc(key)
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 987, in pandas._libs.hashtable.Int64HashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 993, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 167

So it should easily be able to handle more than 167 clusters, since I'm working with more than 4000 records.
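
To narrow this down I could iterate only over the cluster labels that actually appear in the frame built inside cluster_stats, something like this sketch, instead of calling frame.loc[i] for every i in range(num_clusters):

# Which cluster labels does the index of `frame` really contain, and how many
# documents does each one have?
print(sorted(set(frame.index.get_level_values(0))))
print(frame['cluster'].value_counts().sort_index())

# groupby only visits labels that exist, so the lookup that fails in the
# traceback (frame.loc[i]) is never performed for a missing label
for label, group in frame.groupby('cluster'):
    print('Cluster %d: %d documents' % (label, len(group)))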

Thank you!

1 Answer


Clearly there is an error in your code.

You probably try to access a non-existent column, not a row.

Fix your code - we can't, because we don't have it.
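
As an illustration of the kind of lookup that fails (a tiny self-contained sketch with made-up data, not your actual frame): indexing by a row label that is missing from the index raises a KeyError of this kind, while selecting by a column value does not:

import pandas as pd

# Three documents, built the same way as the frame in cluster_stats;
# cluster label 2 is intentionally absent
clusters = [0, 1, 1]
frame = pd.DataFrame({'title': ['document 1', 'document 2', 'document 3'],
                      'cluster': clusters}, index=[clusters])

# frame.loc[2] raises KeyError: 2, because no row carries that label
for i in range(3):
    rows_i = frame[frame['cluster'] == i]   # column-based selection never raises
    print('Cluster %d: %d documents' % (i, len(rows_i)))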

Has QUIT--Anony-Mousse
  • It's not so much about seeing the full code, which would take too much time to read through, but your ability to identify the actual location of the error (line 187) from your error backtrace. – Has QUIT--Anony-Mousse Sep 06 '19 at 00:38