I'm trying to create a dendrogram that looks like this, i.e. one that has words at the bottom corresponding to each cluster.
import pandas as pd
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD
import numpy as np
# Load the mtcars data set, using the car model names as the row index
url = 'https://raw.githubusercontent.com/holtzy/The-Python-Graph-Gallery/master/static/data/mtcars.csv'
df = pd.read_csv(url, index_col='model')
# Build the cluster hierarchy from the samples with Ward linkage
Z = linkage(df, method='ward')
# Draw the dendrogram, writing each car model name under its leaf
dendrogram(Z, labels=df.index, leaf_font_size=8, leaf_rotation=90)
# Display the figure
plt.show()
However I'm doing this for a large natural language dataset, so I'm trying to reduce the dataset with LSA using TruncatedSVD; otherwise my machine crashes.
Following the scikit-learn documentation here I can plot a dendrogram, but the labels show the number of samples under each node. I would like a readable cluster label for each node instead, as in the mtcars example.
How do I do that? Would really appreciate any help/insight.
Here is the code for where I've got to so far. Note that I'm not actually working with the IMDB dataset; it's just an example dataset to make the code replicable.
# Example data set: IMDB movie descriptions
df = pd.read_csv(
    'https://raw.githubusercontent.com/peetck/IMDB-Top1000-Movies/master/IMDB-Movie-Data.csv'
)
# Turn each description into a TF-IDF vector over at most 1000 terms
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(df['Description'])
# Reduce to 10 components (LSA). The default solver is randomized, so the
# seed is fixed to make the reduced matrix, and therefore the tree built
# from it, identical from run to run.
TSVD = TruncatedSVD(n_components=10, random_state=0)
X_reduced = TSVD.fit_transform(X)
# distance_threshold=0 with n_clusters=None builds the full merge tree
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
model = model.fit(X_reduced)
def plot_dendrogram(model, **kwargs):
    """Plot a dendrogram for a fitted hierarchical clustering model.

    Builds a SciPy-style linkage matrix (children, merge distance, sample
    count per merge) from the model's merge tree and passes it to
    ``scipy.cluster.hierarchy.dendrogram``.

    Args:
        model: Fitted estimator exposing ``children_``, ``distances_`` and
            ``labels_``.
        **kwargs: Forwarded unchanged to ``dendrogram`` (for example
            ``labels``, ``truncate_mode``, ``p``, ``leaf_rotation``).

    Returns:
        The dictionary returned by ``dendrogram``; its ``'ivl'`` and
        ``'leaves'`` entries describe the leaves that were drawn.
    """
    n_samples = len(model.labels_)
    # counts[i] is the number of original samples under the node that
    # merge i creates.
    counts = np.zeros(model.children_.shape[0])
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                # Indices >= n_samples refer to the node created by an
                # earlier merge, whose count is already filled in.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram and hand its description back to the
    # caller instead of discarding it.
    return dendrogram(linkage_matrix, **kwargs)
# Draw the dendrogram truncated to at most p=3 levels, then display the
# figure (as the first example does) so it also appears when run as a script.
plot_dendrogram(model, p=3, truncate_mode='level', leaf_rotation=90)
plt.show()