I've been playing with the script below:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import textract
import os
folder_to_scan = '/media/sf_Documents/clustering'
dict_of_docs = {}
# Gets all the files to scan with textract
for root, sub, files in os.walk(folder_to_scan):
    for file in files:
        full_path = os.path.join(root, file)
        print(f'Processing {file}')
        try:
            text = textract.process(full_path)
            dict_of_docs[file] = text
        except Exception as e:
            print(e)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(dict_of_docs.values())
true_k = 3
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
It scans a folder of images of scanned documents, extracts the text with textract, then clusters the text with KMeans. I know for a fact there are 3 different types of documents, so I set true_k to 3. But what if I had a folder of unknown documents, where there could be anything from 1 to hundreds of different document types? How would I pick the number of clusters then?
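Would something like sweeping over a range of k values and keeping the one with the best silhouette score be a reasonable approach? Here is a rough, untested sketch of what I mean (estimate_k, k_min and k_max are just names I made up, the rest is plain scikit-learn):

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def estimate_k(X, k_min=2, k_max=10):
    # Fit KMeans for each candidate k and keep the k with the best silhouette score.
    best_k, best_score = k_min, -1.0
    for k in range(k_min, min(k_max, X.shape[0] - 1) + 1):
        model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
        labels = model.fit_predict(X)
        score = silhouette_score(X, labels)  # ranges from -1 to 1, higher is better
        if score > best_score:
            best_k, best_score = k, score
    return best_k

# e.g. true_k = estimate_k(X) instead of hard-coding 3

Is that a sensible way to go, or is there a better way to estimate the number of clusters for documents like these?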