mirror of https://github.com/aladdinpersson/Machine-Learning-Collection.git (synced 2026-02-21 11:18:01 +00:00)
Initial commit
ML/algorithms/kmeans/kmeansclustering.py (new file, 97 lines added)
@@ -0,0 +1,97 @@
"""
From-scratch implementation of K-means clustering, which is an unsupervised
clustering method that works by iteratively computing new centroids and
moving the centroids to the center of the newly formed clusters.

Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
*   2020-05-28 Initial coding

"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

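# Quick reference for the two standard K-means steps that fit() alternates below
# (a restatement of the docstring, not additional functionality):
#   assignment: each point x_i is assigned to the centroid c_k that minimizes
#               the Euclidean distance ||x_i - c_k||
#   update:     each centroid c_k is recomputed as the mean of its assigned points
# Iteration stops once no centroid moves, or after max_iterations.

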
class KMeansClustering:
    def __init__(self, X, num_clusters):
        self.K = num_clusters  # number of clusters
        self.max_iterations = 100  # maximum number of update iterations
        self.plot_figure = True  # plot the clustered points after fitting
        self.num_examples = X.shape[0]  # number of training examples
        self.num_features = X.shape[1]  # number of features per example

    def initialize_random_centroids(self, X):
        # Pick K random training examples as the initial centroids.
        # Note: the indices are drawn independently, so two initial centroids can coincide.
        centroids = np.zeros((self.K, self.num_features))

        for k in range(self.K):
            centroid = X[np.random.choice(range(self.num_examples))]
            centroids[k] = centroid

        return centroids

    def create_clusters(self, X, centroids):
        # Will contain a list of the points that are associated with that specific cluster
        clusters = [[] for _ in range(self.K)]

        # Loop through each point and check which is the closest cluster
        for point_idx, point in enumerate(X):
            closest_centroid = np.argmin(
                np.sqrt(np.sum((point - centroids) ** 2, axis=1))
            )
            clusters[closest_centroid].append(point_idx)

        return clusters

    def calculate_new_centroids(self, clusters, X):
        # Each new centroid is the mean of the points assigned to that cluster
        # (assumes every cluster is non-empty; an empty cluster would yield NaN)
        centroids = np.zeros((self.K, self.num_features))
        for idx, cluster in enumerate(clusters):
            new_centroid = np.mean(X[cluster], axis=0)
            centroids[idx] = new_centroid

        return centroids

    def predict_cluster(self, clusters, X):
        # Build a label array where y_pred[i] is the cluster index of point i
        y_pred = np.zeros(self.num_examples)

        for cluster_idx, cluster in enumerate(clusters):
            for sample_idx in cluster:
                y_pred[sample_idx] = cluster_idx

        return y_pred

    def plot_fig(self, X, y):
        # Scatter plot of the two features, colored by predicted cluster
        plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
        plt.show()

    def fit(self, X):
        centroids = self.initialize_random_centroids(X)

        for it in range(self.max_iterations):
            # Assign every point to its closest centroid
            clusters = self.create_clusters(X, centroids)

            previous_centroids = centroids
            centroids = self.calculate_new_centroids(clusters, X)

            # If no centroid moved, the algorithm has converged
            diff = centroids - previous_centroids

            if not diff.any():
                print("Termination criterion satisfied")
                break

        # Get label predictions
        y_pred = self.predict_cluster(clusters, X)

        if self.plot_figure:
            self.plot_fig(X, y_pred)

        return y_pred


if __name__ == "__main__":
    np.random.seed(10)
    num_clusters = 3
    X, _ = make_blobs(n_samples=1000, n_features=2, centers=num_clusters)

    Kmeans = KMeansClustering(X, num_clusters)
    y_pred = Kmeans.fit(X)
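    # Optional sanity check (an illustrative addition, not required by the class):
    # with three well-separated blobs, the predicted clusters should each hold
    # roughly a third of the 1000 samples.
    _, counts = np.unique(y_pred, return_counts=True)
    print("Cluster sizes:", counts)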