mirror of https://github.com/aladdinpersson/Machine-Learning-Collection.git
synced 2026-02-21 11:18:01 +00:00

Initial commit
BIN  ML/algorithms/knn/__pycache__/utils.cpython-37.pyc (Normal file)
Binary file not shown.
90  ML/algorithms/knn/example_data/data.txt (Normal file)
@@ -0,0 +1,90 @@
1.1107, -2.1079
-0.5498, 0.0943
-0.0382, 1.8829
0.0555, -0.6139
0.5870, -1.2067
0.5453, 0.2509
-0.3927, -0.6220
-1.1905, -1.8785
-0.4240, 0.7772
-0.7139, 1.5846
-0.8883, 2.1408
-0.6922, 0.0993
1.4350, 1.2334
-0.7576, 0.7386
-1.1144, -1.7059
0.6612, -1.7296
-2.1381, -0.0600
1.3857, 1.2178
-1.4951, 0.0373
0.8029, 0.9739
1.5607, 1.5862
0.8563, -1.4245
0.0397, -1.3799
1.2331, 1.7421
-2.0015, 0.8355
-0.3428, -0.4780
-0.8891, 1.2634
0.3832, -0.1189
0.4172, 1.0132
-0.8695, -0.7947
2.9737, 3.6438
3.7680, 1.8649
0.1166, 0.9435
0.6896, 3.9160
1.2234, 2.9899
2.3009, 0.4150
3.7693, 3.8027
1.9450, 3.4208
0.9290, 3.3611
5.0027, 2.7870
1.0101, 1.8737
2.0751, 2.2628
1.9113, 3.6777
2.3127, 3.9130
1.9392, 2.3976
3.1218, 2.5495
1.7032, 1.1509
0.4212, 3.5322
2.7686, 0.9402
2.1696, 2.9285
0.3380, 2.0947
3.6886, 0.4054
2.6315, 3.1962
-0.5332, 3.1421
0.3380, 3.0801
1.4030, 1.1841
2.8739, 2.7777
1.1254, 3.2404
0.0988, 1.9522
0.3688, 2.8904
1.4758, -1.6387
1.9289, -1.8191
2.5741, -1.3213
2.1917, -1.2852
0.8358, -2.3349
2.6863, -1.8834
3.1102, -0.4854
3.7073, -0.6466
3.6394, -0.4097
0.5365, -3.6555
2.9295, -0.3819
0.8168, -3.1133
1.3432, -1.7717
1.1039, -2.2261
1.3754, -2.2236
0.6757, -2.5379
-0.2029, -3.8420
2.4210, -1.9788
1.0335, -2.6042
0.9638, -2.9449
-0.8198, -5.4449
1.9552, -1.5530
0.3505, -3.1887
2.4943, -1.8116
1.9761, -1.0664
0.5994, -3.0513
2.2076, -1.6728
1.9941, -1.8826
1.7487, -2.9644
1.4160, -2.4234
90  ML/algorithms/knn/example_data/targets.txt (Normal file)
@@ -0,0 +1,90 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
110  ML/algorithms/knn/knn.py (Normal file)
@@ -0,0 +1,110 @@
"""
Implementation of K-nearest neighbor (KNN) from scratch,
where you can use either two loops (inefficient), one loop (better),
or a heavily vectorized zero-loop implementation.

Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
*   2020-04-24 Initial coding
"""

import numpy as np


class KNearestNeighbor:
    def __init__(self, k):
        self.k = k
        # Small constant added under every sqrt; it keeps the vectorized
        # formula from taking the square root of a slightly negative number
        # produced by floating-point cancellation.
        self.eps = 1e-8

    def train(self, X, y):
        # kNN has no real training phase: just memorize the training data.
        self.X_train = X
        self.y_train = y

    def predict(self, X_test, num_loops=0):
        if num_loops == 0:
            distances = self.compute_distance_vectorized(X_test)
        elif num_loops == 1:
            distances = self.compute_distance_one_loop(X_test)
        else:
            distances = self.compute_distance_two_loops(X_test)

        return self.predict_labels(distances)

    def compute_distance_two_loops(self, X_test):
        """
        Inefficient naive implementation; use it only
        as a way of understanding what kNN is doing.
        """
        num_test = X_test.shape[0]
        num_train = self.X_train.shape[0]
        distances = np.zeros((num_test, num_train))

        for i in range(num_test):
            for j in range(num_train):
                # (Taking sqrt is not necessary: the nearest neighbors won't change since sqrt is monotone)
                distances[i, j] = np.sqrt(
                    self.eps + np.sum((X_test[i, :] - self.X_train[j, :]) ** 2)
                )

        return distances

    def compute_distance_one_loop(self, X_test):
        """
        Much better than two loops, but not as fast as the fully vectorized version.
        Utilizes NumPy broadcasting in X_train - X_test[i, :].
        """
        num_test = X_test.shape[0]
        num_train = self.X_train.shape[0]
        distances = np.zeros((num_test, num_train))

        for i in range(num_test):
            # (Taking sqrt is not necessary: the nearest neighbors won't change since sqrt is monotone)
            distances[i, :] = np.sqrt(
                self.eps + np.sum((self.X_train - X_test[i, :]) ** 2, axis=1)
            )

        return distances
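    # Broadcasting note: self.X_train has shape (num_train, d) and
    # X_test[i, :] has shape (d,), so the subtraction broadcasts to
    # (num_train, d); summing over axis=1 then yields all num_train
    # squared distances for test point i in a single expression.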

    def compute_distance_vectorized(self, X_test):
        """
        This can be tricky to understand; we utilize heavy
        vectorization as well as NumPy broadcasting.
        Idea: for two vectors a and b (two examples) we can expand
        ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2. Doing this for every
        pair of test and train vectors at once leads to the heavily
        vectorized formula below, computing all distances at the same time.
        """
        X_test_squared = np.sum(X_test ** 2, axis=1, keepdims=True)
        X_train_squared = np.sum(self.X_train ** 2, axis=1, keepdims=True)
        X_test_dot_X_train = np.dot(X_test, self.X_train.T)

        # (Taking sqrt is not necessary: the nearest neighbors won't change since sqrt is monotone)
        return np.sqrt(
            self.eps + X_test_squared - 2 * X_test_dot_X_train + X_train_squared.T
        )
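    # Worked instance of the expansion: with a = (1, 2) and b = (3, 1),
    # ||a - b||^2 = (1-3)^2 + (2-1)^2 = 5, and equivalently
    # ||a||^2 - 2 a.b + ||b||^2 = 5 - 2*5 + 10 = 5.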

    def predict_labels(self, distances):
        num_test = distances.shape[0]
        y_pred = np.zeros(num_test)

        for i in range(num_test):
            # Indices of training points sorted by distance to test point i.
            y_indices = np.argsort(distances[i, :])
            # Labels of the k nearest neighbors (np.bincount needs non-negative ints).
            k_closest_classes = self.y_train[y_indices[: self.k]].astype(int)
            # Majority vote: the most frequent class among the k neighbors.
            y_pred[i] = np.argmax(np.bincount(k_closest_classes))

        return y_pred
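    # Voting example: with k = 3 and neighbor labels [2, 1, 2],
    # np.bincount gives [0, 1, 2] (occurrences of classes 0, 1, 2)
    # and np.argmax of that picks class 2 as the prediction.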


if __name__ == "__main__":
    X = np.loadtxt("example_data/data.txt", delimiter=",")
    y = np.loadtxt("example_data/targets.txt")

    # A tiny hand-made dataset that can be swapped in for quick debugging:
    # X = np.array([[1, 1], [3, 1], [1, 4], [2, 4], [3, 3], [5, 1]])
    # y = np.array([0, 0, 0, 1, 1, 1])

    KNN = KNearestNeighbor(k=1)
    KNN.train(X, y)
    y_pred = KNN.predict(X, num_loops=0)
    # Note: this evaluates on the training set, so with k=1 every point is
    # its own nearest neighbor and the accuracy is trivially 1.0.
    print(f"Accuracy: {sum(y_pred == y) / y.shape[0]}")