I found a module that does same-size constrained K-Means: the size-constrained-clustering package. It clusters the data into groups of equal size, using either a min-cost-flow formulation or a heuristic. Install it with

pip install size-constrained-clustering

or

pip install git+https://github.com/jingw2/size_constrained_clustering.git

and then use either the min-cost-flow variant or the heuristics variant:
import numpy as np
from size_constrained_clustering import equal

n_samples = 2000
n_clusters = 3
X = np.random.rand(n_samples, 2)

model = equal.SameSizeKMeansMinCostFlow(n_clusters)
# model = equal.SameSizeKMeansHeuristics(n_clusters)
model.fit(X)
centers = model.cluster_centers_
labels = model.labels_
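To sanity-check the result you can count the label frequencies; with 2000 points and 3 clusters the group sizes can differ by at most one:

print(np.bincount(labels))  # e.g. [667 667 666]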
If you have trouble installing the size-constrained-clustering module itself, you can use the two classes below instead; they only require the k-means-constrained package:

pip install k-means-constrained
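If equal-size clusters are all you need, you can also call KMeansConstrained directly, without any wrapper; a minimal sketch for the same 2000-point example, with size_min/size_max chosen so the sizes differ by at most one:

import numpy as np
from k_means_constrained import KMeansConstrained

X = np.random.rand(2000, 2)
clf = KMeansConstrained(n_clusters=3, size_min=666, size_max=667, random_state=42)
labels = clf.fit_predict(X)  # each cluster ends up with 666-667 points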
The SameSizeKMeansMinCostFlow class:
from k_means_constrained import KMeansConstrained
from scipy.spatial.distance import cdist
import warnings

import base


class SameSizeKMeansMinCostFlow(base.Base):

    def __init__(self, n_clusters, max_iters=1000, distance_func=cdist, random_state=42):
        '''
        Args:
            n_clusters (int): number of clusters
            max_iters (int): maximum number of iterations
            distance_func (callable): function with input (X, centers); defaults to l2-distance (cdist)
            random_state (int): random seed, 42 by default
        '''
        super(SameSizeKMeansMinCostFlow, self).__init__(n_clusters, max_iters, distance_func)
        self.clf = None
        self.random_state = random_state

    def fit(self, X):
        n_samples, n_features = X.shape
        # Cluster sizes differ by at most one when n_samples is not divisible by n_clusters
        minsize = n_samples // self.n_clusters
        maxsize = (n_samples + self.n_clusters - 1) // self.n_clusters

        clf = KMeansConstrained(self.n_clusters, size_min=minsize,
                                size_max=maxsize, random_state=self.random_state)

        if minsize != maxsize:
            warnings.warn("Cluster minimum and maximum size are {} and {}, respectively".format(minsize, maxsize))

        clf.fit(X)

        self.clf = clf
        self.cluster_centers_ = self.clf.cluster_centers_
        self.labels_ = self.clf.labels_

    def predict(self, X):
        return self.clf.predict(X)
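With this class and base.py (below) saved next to each other, the first snippet works the same way; here I assume you saved the class in a file called same_size_kmeans.py (the file name is up to you):

import numpy as np
from same_size_kmeans import SameSizeKMeansMinCostFlow  # hypothetical file name

X = np.random.rand(2000, 2)
model = SameSizeKMeansMinCostFlow(3)
model.fit(X)
centers = model.cluster_centers_
labels = model.labels_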
The base class (base.py), which also contains the k-means++ seeding helper k_init:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
@file: base.py, base for clustering algorithm
@Author: Jing Wang (jingw2@foxmail.com)
@Date: 06/07/2020
'''
from scipy.spatial.distance import cdist
import numpy as np
import warnings
import scipy.sparse as sp
import os
import sys

# Make sibling modules importable regardless of the working directory
path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(path)

from k_means_constrained.sklearn_import.utils.extmath import stable_cumsum


class Base(object):

    def __init__(self, n_clusters, max_iters, distance_func=cdist):
        '''
        Base cluster object

        Args:
            n_clusters (int): number of clusters
            max_iters (int): maximum number of iterations
            distance_func (callable): distance function callback
        '''
        assert isinstance(n_clusters, int)
        assert n_clusters >= 1
        assert isinstance(max_iters, int)
        assert max_iters >= 1
        self.n_clusters = n_clusters
        self.max_iters = max_iters
        if distance_func is not None and not callable(distance_func):
            raise Exception("Distance function is not callable")
        self.distance_func = distance_func

    def fit(self, X):
        pass

    def predict(self, X):
        pass
def k_init(X, n_clusters, x_squared_norms, random_state=42, distance_func=cdist, n_local_trials=None):
    """Init n_clusters seeds according to k-means++

    Parameters
    ----------
    X : array or sparse matrix, shape (n_samples, n_features)
        The data to pick seeds for. To avoid memory copy, the input data
        should be double precision (dtype=np.float64).
    n_clusters : integer
        The number of seeds to choose.
    x_squared_norms : array, shape (n_samples,)
        Squared Euclidean norm of each data point.
    random_state : int or RandomState instance
        The generator used to initialize the centers. Use an int to make
        the randomness deterministic.
        See :term:`Glossary <random_state>`.
    distance_func : callable
        Function with signature (A, B) returning pairwise distances. Note
        that the default, cdist, returns plain (not squared) Euclidean
        distances, so candidates are sampled proportionally to distance
        rather than squared distance as in canonical k-means++.
    n_local_trials : integer, optional
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2 + log(k)); this is the default.

    Notes
    -----
    Selects initial cluster centers for k-means clustering in a smart way
    to speed up convergence. See: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM Symposium
    on Discrete Algorithms. 2007.

    Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
    which is the implementation used in the aforementioned paper.
    """
    n_samples, n_features = X.shape
    centers = np.empty((n_clusters, n_features), dtype=X.dtype)

    assert x_squared_norms is not None, 'x_squared_norms None in _k_init'

    # Accept a plain int seed as well as a ready-made RandomState instance
    if isinstance(random_state, int):
        random_state = np.random.RandomState(random_state)

    # Set the number of local seeding trials if none is given
    if n_local_trials is None:
        # This is what Arthur/Vassilvitskii tried, but did not report
        # specific results for other than mentioning in the conclusion
        # that it helped.
        n_local_trials = 2 + int(np.log(n_clusters))

    # Pick the first center randomly
    center_id = random_state.randint(n_samples)
    if sp.issparse(X):
        centers[0] = X[center_id].toarray()
    else:
        centers[0] = X[center_id]

    # Initialize list of closest distances and calculate current potential
    closest_dist_sq = distance_func(centers[0, np.newaxis], X)
    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters - 1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
                                        rand_vals)
        # Numerical imprecision can result in a candidate_id out of range
        np.clip(candidate_ids, None, closest_dist_sq.size - 1,
                out=candidate_ids)

        # Compute distances to the center candidates
        distance_to_candidates = distance_func(X[candidate_ids], X)

        # Update closest distances and potential for each candidate
        np.minimum(closest_dist_sq, distance_to_candidates,
                   out=distance_to_candidates)
        candidates_pot = distance_to_candidates.sum(axis=1)

        # Decide which candidate is the best
        best_candidate = np.argmin(candidates_pot)
        current_pot = candidates_pot[best_candidate]
        closest_dist_sq = distance_to_candidates[best_candidate]
        best_candidate = candidate_ids[best_candidate]

        # Permanently add the best center candidate found in local trials
        if sp.issparse(X):
            centers[c] = X[best_candidate].toarray()
        else:
            centers[c] = X[best_candidate]

    return centers
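For completeness, a quick sketch of calling k_init on its own; it assumes the function lives in base.py as above. A plain seed also works thanks to the int-to-RandomState conversion, but here I pass a RandomState instance explicitly since the function draws samples from it directly:

import numpy as np
from scipy.spatial.distance import cdist
from base import k_init

X = np.random.rand(2000, 2)
x_squared_norms = (X ** 2).sum(axis=1)  # precomputed squared norms
centers = k_init(X, n_clusters=3, x_squared_norms=x_squared_norms,
                 random_state=np.random.RandomState(42), distance_func=cdist)
print(centers.shape)  # (3, 2)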