""" From scratch implementation of K means clustering which is a unsupervised clustering method that works by iteratively computing new centroids and moving centroids to the center of the new formed clusters. Programmed by Aladdin Persson * 2020-05-28 Initial coding """ import numpy as np import matplotlib.pyplot as plt from sklearn.datasets import make_blobs class KMeansClustering: def __init__(self, X, num_clusters): self.K = num_clusters self.max_iterations = 100 self.plot_figure = True self.num_examples = X.shape[0] self.num_features = X.shape[1] def initialize_random_centroids(self, X): centroids = np.zeros((self.K, self.num_features)) for k in range(self.K): centroid = X[np.random.choice(range(self.num_examples))] centroids[k] = centroid return centroids def create_clusters(self, X, centroids): # Will contain a list of the points that are associated with that specific cluster clusters = [[] for _ in range(self.K)] # Loop through each point and check which is the closest cluster for point_idx, point in enumerate(X): closest_centroid = np.argmin( np.sqrt(np.sum((point - centroids) ** 2, axis=1)) ) clusters[closest_centroid].append(point_idx) return clusters def calculate_new_centroids(self, clusters, X): centroids = np.zeros((self.K, self.num_features)) for idx, cluster in enumerate(clusters): new_centroid = np.mean(X[cluster], axis=0) centroids[idx] = new_centroid return centroids def predict_cluster(self, clusters, X): y_pred = np.zeros(self.num_examples) for cluster_idx, cluster in enumerate(clusters): for sample_idx in cluster: y_pred[sample_idx] = cluster_idx return y_pred def plot_fig(self, X, y): plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral) plt.show() def fit(self, X): centroids = self.initialize_random_centroids(X) for it in range(self.max_iterations): clusters = self.create_clusters(X, centroids) previous_centroids = centroids centroids = self.calculate_new_centroids(clusters, X) diff = centroids - previous_centroids if not diff.any(): print("Termination criterion satisfied") break # Get label predictions y_pred = self.predict_cluster(clusters, X) if self.plot_figure: self.plot_fig(X, y_pred) return y_pred if __name__ == "__main__": np.random.seed(10) num_clusters = 3 X, _ = make_blobs(n_samples=1000, n_features=2, centers=num_clusters) Kmeans = KMeansClustering(X, num_clusters) y_pred = Kmeans.fit(X)