So I am trying to import a module/script (.py file) into a Jupyter notebook, mainly for readability and conciseness. But then, when I try to run the class in the script, I get the following Error message:
NameError Traceback (most recent call last)
<ipython-input-48-4d8cbba46ed0> in <module>()
8
9 test_KMeans = KMeans(k=3, maxiter=1000, tol=1e-9)
---> 10 cluster_center = test_KMeans.fit(X)
11 clusters = test_KMeans.predict(X)
12
~/KMeans.py in fit(self, X)
42 #Choose k random rows of X as the initial cluster centers.
43 initial_cluster_centers = []
---> 44
45 sample = np.random.randint(0,m,size=k)
46
NameError: name 'maxiter' is not defined
Here is my script:
import numpy as np
from sklearn.decomposition import PCA
k = 3
maxiter = 1000
tol = 1e-9
class KMeans:
"""A K-Means object class. Implements basic k-means clustering.
Attributes:
k (int): The number of clusters
maxiter (int): The maximum number of iterations
tol (float): A convergence tolerance
"""
def __init__(self, k, maxiter, tol):
"""Set the paramters.
Parameters:
k (int): The number of clusters
maxiter (int): The maximum number of iterations
tol (float): A convergence tolerance
"""
k = 3
maxiter = 1000
tol = 1e-9
self.k = k # Initialize some attributes.
self.maxiter = maxiter
self.tol = tol
def fit(self, X):
"""Accepts an mxn matrix X of m data points with n features.
"""
m,n = X.shape
k = 3
maxiter = 1000
tol = 1e-9
self.m = m
self.n = n
#Choose k random rows of X as the initial cluster centers.
initial_cluster_centers = []
sample = np.random.randint(0,m,size=k)
initial_cluster_centers = X[sample, :]
# Run the k-means iteration until consecutive centers are within the convergence tolerance, or until
# iterating the maximum number of times.
iterations = 0
old_cluster = np.zeros(initial_cluster_centers.shape)
new_cluster = initial_cluster_centers
while iterations < maxiter or np.linalg.norm(old_cluster - new_cluster) >= tol:
#assign each data point to the cluster center that is closest, forming k clusters
clusters = np.zeros(m)
for i in range(0,m):
distances = np.linalg.norm(X[i] - initial_cluster_centers, ord=2, axis=1) # axis=1 was crucial
cluster = np.argmin(distances) #in getting this to work
clusters[i] = cluster
# Store the old/initial centroid values
old_cluster = np.copy(new_cluster)
#Recompute the cluster centers as the means of the new clusters
for i in range(k):
points = [X[j] for j in range(m) if clusters[j] == i]
new_cluster[i] = np.mean(points, axis=0)
#If a cluster is empty, reassign the cluster center as a random row of X.
if new_cluster[i] == []:
new_cluster[i] = X[np.random.randint(0,m,size=1)]
iterations += 1
#Save the cluster centers as attributes.
self.new_cluster = new_cluster
#print("New cluster centers:\n", new_cluster)
return new_cluster
def predict(self, X):
"""Accept an l × n matrix X of data.
"""
# Return an array of l integers where the ith entry indicates which
# cluster center the ith row of X is closest to.
clusters = np.zeros(self.m)
for i in range(0,self.m):
distances = np.linalg.norm(X[i] - self.new_cluster, ord=2, axis=1)
cluster = np.argmin(distances)
clusters[i] = cluster
print("\nClusters:", clusters)
return clusters
Then I attempt to do the following:
from KMeans import KMeans
X = features_scaled
# k = 3
# maxiter = 1000
# tol = 1e-9
test_KMeans = KMeans(k=3, maxiter=1000, tol=1e-9)
cluster_center = test_KMeans.fit(X)
clusters = test_KMeans.predict(X)
pca = PCA(n_components=2)
pr_components = pca.fit_transform(X) # these are the first 2 principal components
#plot the first two principal components as a scatter plot, where the color of each point is det by the clusters
plt.scatter(pr_components[:,0], pr_components[:,1],
c=clusters, edgecolor='none', alpha=0.5, #color by clusters
cmap=plt.cm.get_cmap('tab10', 3))
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
plt.colorbar()
plt.title("K-Means Clustering:")
plt.show()
Upon running the above section of code, I get the NameError I described. I don't understand why it is telling me that maxiter
is not defined. You'll see I defined the variables k, maxiter, tol
multiple times in the script trying to get it to work, but nothing has. I had self.maxiter
and self.tol
at one point but that didn't fix it either.
I know this code works because I have used it multiple times now. Originally I just defined those variables k, maxiter, and tol.. then instantiated the class and called the fit and predict methods, and since they were stored as attributes with self, everything worked fine. but now that I try to import it as a module I have no idea why it is not working.
Thanks for your help!
EDIT: Here is what my code would look like in a single cell in a Jupyter notebook.. It does run and work in this case:
from sklearn.decomposition import PCA
class KMeans:
"""A K-Means object class. Implements basic k-means clustering.
Attributes:
k (int): The number of clusters
maxiter (int): The maximum number of iterations
tol (float): A convergence tolerance
"""
def __init__(self, k, maxiter, tol):
"""Set the paramters.
Parameters:
k (int): The number of clusters
maxiter (int): The maximum number of iterations
tol (float): A convergence tolerance
"""
self.k = k # Initialize some attributes.
self.maxiter = maxiter
self.tol = tol
def fit(self, X):
"""Accepts an mxn matrix X of m data points with n features.
"""
m,n = X.shape
self.m = m
self.n = n
#Choose k random rows of X as the initial cluster centers.
initial_cluster_centers = []
sample = np.random.randint(0,m,size=self.k)
initial_cluster_centers = X[sample, :]
# Run the k-means iteration until consecutive centers are within the convergence tolerance, or until
# iterating the maximum number of times.
iterations = 0
old_cluster = np.zeros(initial_cluster_centers.shape)
new_cluster = initial_cluster_centers
while iterations < maxiter or np.linalg.norm(old_cluster - new_cluster) >= tol:
#assign each data point to the cluster center that is closest, forming k clusters
clusters = np.zeros(m)
for i in range(0,m):
distances = np.linalg.norm(X[i] - initial_cluster_centers, ord=2, axis=1) # axis=1 was crucial
cluster = np.argmin(distances) #in getting this to work
clusters[i] = cluster
# Store the old/initial centroid values
old_cluster = np.copy(new_cluster)
#Recompute the cluster centers as the means of the new clusters
for i in range(k):
points = [X[j] for j in range(m) if clusters[j] == i]
new_cluster[i] = np.mean(points, axis=0)
#If a cluster is empty, reassign the cluster center as a random row of X.
if new_cluster[i] == []:
new_cluster[i] = X[np.random.randint(0,m,size=1)]
iterations += 1
#Save the cluster centers as attributes.
self.new_cluster = new_cluster
#print("New cluster centers:\n", new_cluster)
return new_cluster
def predict(self, X):
"""Accept an l × n matrix X of data.
"""
# Return an array of l integers where the ith entry indicates which
# cluster center the ith row of X is closest to.
clusters = np.zeros(self.m)
for i in range(0,self.m):
distances = np.linalg.norm(X[i] - self.new_cluster, ord=2, axis=1)
cluster = np.argmin(distances)
clusters[i] = cluster
print("\nClusters:", clusters)
return clusters
X = features_scaled
k = 3
maxiter = 1000
tol = 1e-9
test_KMeans = KMeans(k,maxiter,tol)
test_KMeans.fit(X)
clusters = test_KMeans.predict(X)
pca = PCA(n_components=2)
pr_components = pca.fit_transform(X) # these are the first 2 principal components
#plot the first two principal components as a scatter plot, where the color of each point is det by the clusters
plt.scatter(pr_components[:,0], pr_components[:,1],
c=clusters, edgecolor='none', alpha=0.5, #color by clusters
cmap=plt.cm.get_cmap('tab10', 3))
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
plt.colorbar()
plt.title("K-Means Clustering:")
plt.show()