I am having this same issue. I think a closer solution is to freeze the model into a file, import the model, and then cluster a new predict phrase. If the vectorizer and the KMeans clusterer are re-initialized every time the program runs, the clusters come out in a different order on each run, so the number-to-name mapping will not activate correctly and the function will return a different number every time it is called.
import json
import sqlite3
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
# Sample array of string sentences
df = pd.read_csv('/workspaces/codespaces-flask//data/shuffled.csv')
df = shuffle(df)
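# Note: shuffle() without a fixed random_state reorders the rows differently
# on every run, which on its own is enough to change which cluster ends up
# with which label, even though KMeans uses random_state=42 below.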
sentences = df['text'].values
# Convert the sentences into TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
# Perform K-Means clustering
kmeans = KMeans(n_clusters=8, random_state=42)
clusters = kmeans.fit_predict(X)
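# The labels in `clusters` are arbitrary integers 0-7; which group gets which
# number depends on initialization and row order, so they are only stable if
# the fitted model itself is reused rather than re-trained.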
output = list(zip(sentences, clusters))
# Print the cluster assignments for each sentence
for sentence, cluster in zip(sentences, clusters):
    print("Sentence:", sentence, "Cluster:", cluster)
df = pd.DataFrame(output)
db_file_name = '/workspaces/codespaces-flask/ThrAive/data/database1.db'
conn = sqlite3.connect(db_file_name)
cursor = conn.cursor()
cursor.execute("SELECT journal_text FROM Journal JOIN User ON Journal.id
= user.id
rows = cursor.fetchall()
conn.commit()
conn.close()
df1 = pd.DataFrame(rows)
df1 = df1.applymap(lambda x: " ".join(x.split()) if isinstance(x, str) else x)
entry = df1
print(entry)
entry = entry[0].iloc[-1].lower()
entry = [entry]
new_X = vectorizer.transform(entry)
# Predict the cluster assignments for the new sentences
new_clusters = kmeans.predict(new_X)
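# transform() and predict() reuse the already-fitted vectorizer and kmeans.
# Calling fit_transform()/fit_predict() here instead would re-train both and
# reshuffle the labels, which is exactly the bug described above.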
for sentence, new_cluster in zip(entry, new_clusters):
    print("Sentence:", sentence, "Cluster:", new_cluster)
zipper = zip(entry, new_clusters)
df = pd.DataFrame(zipper)
df = df.applymap(lambda x: " ".join(x.split()) if isinstance(x, str) else x)
df = df.to_string(header=False, index=False)
output = df
numbers = ['0', '1', '2', '3', '4', '5', '6', '7']  # KMeans with n_clusters=8 yields labels 0-7
names = []  # one human-readable name per cluster; the actual list was omitted here
# Create a dictionary that maps numbers to names
number_to_name = {number: name for number, name in zip(numbers, names)}
# output ends with the cluster digit, so output[-1] picks it out as a string
print(output[-1])
output = number_to_name[output[-1]]
json_string = json.dumps(str(output))
I think the solution is to save the model to disk:
import pickle

# Train a scikit-learn model (here, presumably the fitted KMeans from above)
model = kmeans

# Save the model to disk
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)
Then load the pickle file and run predictions against the saved k-means model without re-initializing the clusterer.
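Here is a minimal sketch of the save-and-load round trip. Note that the fitted TfidfVectorizer has to be pickled alongside the KMeans model, because a freshly initialized vectorizer would build a different vocabulary; the file names vectorizer.pkl and model.pkl are just placeholders.

import pickle

# Persist both fitted objects once, right after training
with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)
with open('model.pkl', 'wb') as file:
    pickle.dump(kmeans, file)

# Later (e.g. in the request handler), load them instead of re-fitting
with open('vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)
with open('model.pkl', 'rb') as file:
    kmeans = pickle.load(file)

# The same text now maps to the same cluster number on every run
new_X = vectorizer.transform(["some new journal entry"])
print(kmeans.predict(new_X))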