K-means produces really good clusters for co-occurrence matrices such as the one you have there. I put together a quick blog post on clustering semantically similar words with K-means, but here's the quick code:
from sklearn.cluster import KMeans
from numbers import Number
import sys, codecs, numpy

class autovivify_list(dict):
    '''Pickleable class to replicate the functionality of collections.defaultdict'''
    def __missing__(self, key):
        value = self[key] = []
        return value

    def __add__(self, x):
        '''Override addition for numeric types when self is empty'''
        if not self and isinstance(x, Number):
            return x
        raise ValueError

    def __sub__(self, x):
        '''Also provide subtraction method'''
        if not self and isinstance(x, Number):
            return -1 * x
        raise ValueError

def build_word_vector_matrix(vector_file, n_words):
    '''Read the first n_words rows of the GloVe file at vector_file and
    return the vectors as a 2D numpy array and the words as a parallel list'''
    numpy_arrays = []
    labels_array = []
    with codecs.open(vector_file, 'r', 'utf-8') as f:
        for c, r in enumerate(f):
            sr = r.split()
            labels_array.append(sr[0])
            numpy_arrays.append(numpy.array([float(i) for i in sr[1:]]))
            if c == n_words - 1:  # stop after exactly n_words rows
                break
    return numpy.array(numpy_arrays), labels_array

def find_word_clusters(labels_array, cluster_labels):
    '''Map each cluster id to the list of words assigned to that cluster'''
    cluster_to_words = autovivify_list()
    for c, i in enumerate(cluster_labels):
        cluster_to_words[i].append(labels_array[c])
    return cluster_to_words

if __name__ == "__main__":
    input_vector_file = sys.argv[1]        # path to the GloVe vector file
    n_words = int(sys.argv[2])             # number of rows to read
    reduction_factor = float(sys.argv[3])  # clusters per word, e.g. .1
    clusters_to_make = int(n_words * reduction_factor)
    df, labels_array = build_word_vector_matrix(input_vector_file, n_words)
    kmeans_model = KMeans(init='k-means++', n_clusters=clusters_to_make, n_init=10)
    kmeans_model.fit(df)

    cluster_labels = kmeans_model.labels_
    cluster_inertia = kmeans_model.inertia_
    cluster_to_words = find_word_clusters(labels_array, cluster_labels)

    for c in cluster_to_words:
        print(cluster_to_words[c])
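One caveat: full-batch KMeans gets slow once clusters_to_make reaches the low thousands. If the fit drags, sklearn's MiniBatchKMeans should work as a near drop-in replacement for the KMeans call above; here's a minimal sketch, where batch_size is just an illustrative value to tune, not a recommendation:

from sklearn.cluster import MiniBatchKMeans

# Near drop-in replacement for the KMeans call in the script above;
# batch_size is an illustrative value to tune for your data
kmeans_model = MiniBatchKMeans(init='k-means++', n_clusters=clusters_to_make,
                               n_init=10, batch_size=1000)
kmeans_model.fit(df)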
If you save this script as cluster_vectors.py, you can run:
wget http://www-nlp.stanford.edu/data/glove.6B.300d.txt.gz
gunzip glove.6B.300d.txt.gz
python cluster_vectors.py glove.6B.300d.txt 10000 .1
To read the first 10000 lines of the GloVe word vectors (semantic word vectors inferred from term co-occurrence) and cluster those words into 10000 * .1 = 1000 clusters. The clusters will look something like this:
[u'Chicago', u'Boston', u'Houston', u'Atlanta', u'Dallas', u'Denver', u'Philadelphia', u'Baltimore', u'Cleveland', u'Pittsburgh', u'Buffalo', u'Cincinnati', u'Louisville', u'Milwaukee', u'Memphis', u'Indianapolis', u'Auburn', u'Dame']
[u'Product', u'Products', u'Shipping', u'Brand', u'Customer', u'Items', u'Retail', u'Manufacturer', u'Supply', u'Cart', u'SKU', u'Hardware', u'OEM', u'Warranty', u'Brands']
[u'home', u'house', u'homes', u'houses', u'housing', u'offices', u'household', u'acres', u'residence']
[...]
[u'Night', u'Disney', u'Magic', u'Dream', u'Ultimate', u'Fantasy', u'Theme', u'Adventure', u'Cruise', u'Potter', u'Angels', u'Adventures', u'Dreams', u'Wonder', u'Romance', u'Mystery', u'Quest', u'Sonic', u'Nights']
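Since cluster_to_words maps each cluster id to its list of words, you can also invert the lookup to ask which cluster a given word landed in. A small sketch (cluster_for_word is a hypothetical helper of mine, not part of the script above):

def cluster_for_word(word, cluster_to_words):
    '''Return the list of words that share a cluster with `word`
    (hypothetical helper, not part of the script above)'''
    for words in cluster_to_words.values():
        if word in words:
            return words
    return []

# e.g. cluster_for_word(u'Chicago', cluster_to_words) would return the
# first cluster printed above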
I hope this helps!