
I've been working on this project for a while and I'm having trouble analyzing the clusters. Basically, I read some data from a CSV file with about 4000 records using pandas' read_csv() (an Excel file exported as CSV), then I clean the extracted text by removing punctuation, tokenizing and stemming. In the next step I build the TF-IDF matrix and, using k-means, I create the clusters.

I've used the following libraries, with Python 3:

word_tokenize, SnowballStemmer, TfidfVectorizer, cosine_similarity, KMeans, MDS.
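
Condensed, the pipeline looks roughly like this (a sketch that assumes the same file name, column names and ';' delimiter used in the full code further down; the custom tokenizer/stemmer is omitted for brevity):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Read the two relevant columns from the exported CSV
df = pd.read_csv('./llamadas.csv', usecols=['motivo', 'respuesta'], delimiter=';')

# One document per record: the question text followed by its answer
documents = (df['motivo'].astype(str) + ' ' + df['respuesta'].astype(str)).tolist()

# Vectorize and cluster
tfidf_matrix = TfidfVectorizer(max_df=0.8, min_df=0.2, use_idf=True).fit_transform(documents)
km = KMeans(n_clusters=100).fit(tfidf_matrix)
clusters = km.labels_.tolist()

The full code is below: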

from __future__ import print_function
import os
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
from string import punctuation
import numpy as np


# Build a corpus of documents from a path (PATH) where several documents are located.
# Generates a list with the text of each document together with the name of its source file

def CrearCorpus(path):

    df = pd.read_csv(path, usecols=['motivo', 'respuesta'], delimiter=';')  # use the path passed in (same file)

    corpus = []

    for i in range(1, 4050):
        problema = str(df['motivo'][i])
        solucion = str(df['respuesta'][i])
        problema_final = problema + ' ' + solucion
        corpus.append([problema_final, 'document ' + str(i + 1)])
    return (corpus)

# Eliminar "stopwords" de un texto

def _RemoveStopwords(sentence):
    word_tokens = word_tokenize(sentence)
    stop_words = set(stopwords.words('spanish'))
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = ""
    for w in word_tokens:
       if w not in stop_words:
           filtered_sentence = filtered_sentence + " "+w
    return(filtered_sentence)

# Read each of the documents from the corpus and generate the texts and their identifiers (titles)

def read_documents(path):
    corpus = CrearCorpus(path)
    documents = []
    titles = []
    for c in range(len(corpus)):
        (doc, fn) = corpus[c]
        titles.append(fn)
        documents.append(doc)
    return ((documents, titles))


# Remove punctuation from the documents

def removePuntuaction(documents):
    translator = str.maketrans('', '', punctuation)
    for i in range(len(documents)):
        documents[i] = documents[i].translate(translator)
    return (documents)


# Stem a Spanish text (Snowball stemmer)

def Stemmer(text):
    stemmer = SnowballStemmer('spanish')
    words_stem = stemmer.stem(text)
    return (words_stem)


# Tokenize and stem a text

def tokenize_and_stem(textdata):
    text = word_tokenize(textdata)
    lista = []
    for elem in text:
        word = elem.lower()
        nuevo = Stemmer(word)
        lista.append(nuevo)
    return (lista)


# Tokenize a text

def tokenize_only(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = tokens
    return filtered_tokens


# Build a vocabulary of terms from the document corpus
# A table (FRAME) is created that represents a "vocabulary" of the words of each document

def crear_vocabulario(documents):
    # build two lists, one stemmed and one with the raw tokens
    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for i in documents:
        allwords_stemmed = tokenize_and_stem(i)
        totalvocab_stemmed.extend(allwords_stemmed)
        allwords_tokenized = tokenize_only(i)
        totalvocab_tokenized.extend(allwords_tokenized)
    vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
    return (vocab_frame)


# Build a tf-idf matrix from the tokenized and stemmed texts
# (uses the global list `documents` created in the main section)

def crear_matriz_tfidf():
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                       min_df=0.2,
                                       use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    terms = tfidf_vectorizer.get_feature_names()
    return ((tfidf_matrix, terms))


# Perform K-means clustering on the tf-idf matrix (returns the clusters)

def clustering(tfidf_matrix, num_clusters):
    km = KMeans(n_clusters=num_clusters)  # create the KMeans estimator
    km.fit(tfidf_matrix)  # run K-means itself
    clusters = km.labels_.tolist()

    return ((clusters, km))


# Show statistics for the generated clusters and objects (documents and words)

def cluster_stats(clusters, titles, km, tfidf_matrix):

    # re-open the csv file to read the 'Plataforma' column
    df = pd.read_csv('./llamadas.csv', usecols=['Plataforma'], delimiter=';')

    films = {'title': titles, 'documents': documents, 'cluster': clusters}
    frame = pd.DataFrame(films, index=[clusters], columns=['title', 'cluster'])
    frame['cluster'].value_counts()  # number of docs per cluster
    print("Top terminos por cluster:")
    print()
    # Sort cluster centers according to proximity to the centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    # analyze the clusters one by one
    for i in range(num_clusters):

        print("Palabras de Cluster %d:" % i, end='')
        for ind in order_centroids[i, :5]:  # top 5 terms per cluster (adjust the slice for more)
            print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'),
                  end=',')
        print()
        print()

        # list used to track the row numbers of the documents that belong to this cluster
        list_row = []

        print("IDs de cluster %d:" % i, end='')
        for title in frame.loc[i]['title'].values.tolist():
            print(' %s,' % title, end='')

        # extract the row number of each document to determine which platform it belongs to
        count_snd = 0
        count_mateonet = 0
        print()
        numbers_rows = frame.loc[i]['title'].values.tolist()
        for index in range(0, len(numbers_rows)):
            row = numbers_rows[index].strip('document ')
            list_row.append(int(row))

        # count how many items belong to each platform
        for cols in range(0, len(list_row)):
            value_platform = df['Plataforma'][list_row[cols]]
            if value_platform == 'SND ':
                count_snd = count_snd + 1
            if value_platform == 'Mateonet ':
                count_mateonet = count_mateonet + 1

        # totals per platform
        print()
        print('Plataforma SND: ' + str(count_snd))
        print()
        print('Plataforma Mateonet: ' + str(count_mateonet))
        print()
        print('Cantidad de preguntas: ' + str(len(frame.loc[i]['title'].values.tolist())))
        print()
        print()

#MAIN

# Set this variable to the location of the input CSV file
PATH = "./llamadas.csv"

num_clusters = 100  # number of clusters for K-means

(documents, titles) = read_documents(PATH)
documents = removePuntuaction(documents)
vocab_frame = crear_vocabulario(documents)

print('Existen ' + str(vocab_frame.shape[0]) + ' itemes en vocab_frame')

(tfidf_matrix, terms) = crear_matriz_tfidf()
(clusters, km) = clustering(tfidf_matrix, num_clusters)
cluster_stats(clusters, titles, km, tfidf_matrix)

https://github.com/felipefuller/faq/blob/master/data_analysis.py

I can create up to 167 clusters, but when I increase the number to 168 or more it raises the following error:

Traceback (most recent call last):
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2657, in get_loc
    return self._engine.get_loc(key)
  File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 987, in pandas._libs.hashtable.Int64HashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 993, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 167

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/felipefuller/dev/faq/cluster.py", line 324, in <module>
    cluster_stats(clusters, titles, km, tfidf_matrix)
  File "/Users/felipefuller/dev/faq/cluster.py", line 187, in cluster_stats
    if (len(frame.loc[i]['title'].values.tolist()) >= 250):
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexing.py", line 1500, in __getitem__
    return self._getitem_axis(maybe_callable, axis=axis)
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexing.py", line 1913, in _getitem_axis
    return self._get_label(key, axis=axis)
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexing.py", line 141, in _get_label
    return self.obj._xs(label, axis=axis)
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/generic.py", line 3583, in xs
    drop_level=drop_level)
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/multi.py", line 2571, in get_loc_level
    indexer = self._get_level_indexer(key, level=level)
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/multi.py", line 2652, in _get_level_indexer
    code = level_index.get_loc(key)
  File "/Users/felipefuller/.virtualenvs/faq/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 987, in pandas._libs.hashtable.Int64HashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 993, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 167

So it should easily be able to handle more than 167 clusters, since I'm working with more than 4000 records.
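
To narrow this down I could iterate only over the cluster labels that actually appear in the frame built inside cluster_stats, something like this sketch, instead of calling frame.loc[i] for every i in range(num_clusters):

# Which cluster labels does the index of `frame` really contain, and how many
# documents does each one have?
print(sorted(set(frame.index.get_level_values(0))))
print(frame['cluster'].value_counts().sort_index())

# groupby only visits labels that exist, so the lookup that fails in the
# traceback (frame.loc[i]) is never performed for a missing label
for label, group in frame.groupby('cluster'):
    print('Cluster %d: %d documents' % (label, len(group)))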

Thank you!

1 Answer


Clearly there is an error in your code.

You probably try to access a non-existent column, not a row.

Fix your code - we can't, because we don't have it.
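
As an illustration of the kind of lookup that fails (a tiny self-contained sketch with made-up data, not your actual frame): indexing by a row label that is missing from the index raises a KeyError of this kind, while selecting by a column value does not:

import pandas as pd

# Three documents, built the same way as the frame in cluster_stats;
# cluster label 2 is intentionally absent
clusters = [0, 1, 1]
frame = pd.DataFrame({'title': ['document 1', 'document 2', 'document 3'],
                      'cluster': clusters}, index=[clusters])

# frame.loc[2] raises KeyError: 2, because no row carries that label
for i in range(3):
    rows_i = frame[frame['cluster'] == i]   # column-based selection never raises
    print('Cluster %d: %d documents' % (i, len(rows_i)))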

Has QUIT--Anony-Mousse
  • It's not so much about seeing the full code, which would take too much time to read through, but your ability to identify the actual location of the error (line 187) from your error backtrace. – Has QUIT--Anony-Mousse Sep 06 '19 at 00:38