Initial commit

Aladdin Persson
2021-01-30 21:49:15 +01:00
commit 65b8c80495
432 changed files with 1290844 additions and 0 deletions

View File

@@ -0,0 +1,282 @@
"""
Author: Philip Andreadis
e-mail: philip_andreadis@hotmail.com
Implementation of a Decision Tree model from scratch.
The metric used to choose a split is the Gini index, which is evaluated for every single value of every feature
in order to find the best split at each step. This leaves room for performance improvements, as the
process is O(n^2) and could be reduced to linear complexity.
Parameters of the model:
max_depth (int): Maximum depth of the decision tree
min_node_size (int): Minimum number of instances a node can have. If a node has this many instances or fewer, it becomes a terminal node
Both are up to the user to set.
The input dataset to the train() function must be a numpy array containing both feature values and labels (labels in the last column).
"""
from collections import Counter
import numpy as np
class DecisionTree:
def __init__(self, max_depth, min_node_size):
self.max_depth = max_depth
self.min_node_size = min_node_size
self.final_tree = {}
"""
    This function calculates the Gini index of a split of the dataset.
    First the Gini score is calculated for each child node; the resulting Gini index is the weighted sum of gini_left and gini_right, weighted by each child's share of the instances.
Parameters:
child_nodes (list of np arrays): The two groups of instances resulting from the split
Returns:
float:Gini index of the split
"""
def calculate_gini(self, child_nodes):
n = 0
# Calculate number of all instances of the parent node
for node in child_nodes:
n = n + len(node)
gini = 0
# Calculate gini index for each child node
for node in child_nodes:
m = len(node)
# Avoid division by zero if a child node is empty
if m == 0:
continue
# Create a list with each instance's class value
y = []
for row in node:
y.append(row[-1])
# Count the frequency for each class value
freq = Counter(y).values()
node_gini = 1
for i in freq:
node_gini = node_gini - (i / m) ** 2
gini = gini + (m / n) * node_gini
return gini
"""
    This function splits the dataset on a certain value of a feature
Parameters:
feature_index (int): Index of selected feature
threshold : Value of the feature split point
Returns:
np.array: Two new groups of split instances
"""
def apply_split(self, feature_index, threshold, data):
instances = data.tolist()
left_child = []
right_child = []
for row in instances:
if row[feature_index] < threshold:
left_child.append(row)
else:
right_child.append(row)
left_child = np.array(left_child)
right_child = np.array(right_child)
return left_child, right_child
"""
    This function finds the best split of the dataset at each iteration of the algorithm by evaluating
all possible splits and applying the one with the minimum Gini index.
Parameters:
data: Dataset
Returns node (dict): Dictionary with the index of the splitting feature and its value and the two child nodes
"""
def find_best_split(self, data):
num_of_features = len(data[0]) - 1
gini_score = 1000
f_index = 0
f_value = 0
# Iterate through each feature and find minimum gini score
for column in range(num_of_features):
for row in data:
value = row[column]
l, r = self.apply_split(column, value, data)
children = [l, r]
score = self.calculate_gini(children)
# print("Candidate split feature X{} < {} with Gini score {}".format(column,value,score))
if score < gini_score:
gini_score = score
f_index = column
f_value = value
child_nodes = children
# print("Chosen feature is {} and its value is {} with gini index {}".format(f_index,f_value,gini_score))
node = {"feature": f_index, "value": f_value, "children": child_nodes}
return node
"""
This function calculates the most frequent class value in a group of instances
Parameters:
node: Group of instances
Returns : Most common class value
"""
def calc_class(self, node):
# Create a list with each instance's class value
y = []
for row in node:
y.append(row[-1])
# Find most common class value
        occurrence_count = Counter(y)
        return occurrence_count.most_common(1)[0][0]
"""
    Recursive function that builds the decision tree by splitting every child node until it becomes terminal.
    A node is terminated when: (i) the maximum tree depth is reached, (ii) the node has too few instances, or (iii) a child node is empty.
Parameters:
node: Group of instances
depth (int): Current depth of the tree
"""
def recursive_split(self, node, depth):
l, r = node["children"]
del node["children"]
if l.size == 0:
c_value = self.calc_class(r)
node["left"] = node["right"] = {"class_value": c_value, "depth": depth}
return
elif r.size == 0:
c_value = self.calc_class(l)
node["left"] = node["right"] = {"class_value": c_value, "depth": depth}
return
# Check if tree has reached max depth
if depth >= self.max_depth:
# Terminate left child node
c_value = self.calc_class(l)
node["left"] = {"class_value": c_value, "depth": depth}
# Terminate right child node
c_value = self.calc_class(r)
node["right"] = {"class_value": c_value, "depth": depth}
return
# process left child
if len(l) <= self.min_node_size:
c_value = self.calc_class(l)
node["left"] = {"class_value": c_value, "depth": depth}
else:
node["left"] = self.find_best_split(l)
self.recursive_split(node["left"], depth + 1)
# process right child
if len(r) <= self.min_node_size:
c_value = self.calc_class(r)
node["right"] = {"class_value": c_value, "depth": depth}
else:
node["right"] = self.find_best_split(r)
self.recursive_split(node["right"], depth + 1)
"""
Apply the recursive split algorithm on the data in order to build the decision tree
Parameters:
X (np.array): Training data
Returns tree (dict): The decision tree in the form of a dictionary.
"""
def train(self, X):
# Create initial node
tree = self.find_best_split(X)
# Generate the rest of the tree via recursion
self.recursive_split(tree, 1)
self.final_tree = tree
return tree
"""
Prints out the decision tree.
Parameters:
tree (dict): Decision tree
"""
def print_dt(self, tree, depth=0):
if "feature" in tree:
print(
"\nSPLIT NODE: feature #{} < {} depth:{}\n".format(
tree["feature"], tree["value"], depth
)
)
self.print_dt(tree["left"], depth + 1)
self.print_dt(tree["right"], depth + 1)
else:
print(
"TERMINAL NODE: class value:{} depth:{}".format(
tree["class_value"], tree["depth"]
)
)
"""
    This function outputs the class value of a given instance based on the previously built decision tree.
    Parameters:
    tree (dict): Decision tree
    instance (1d np.array): Single instance of data
    Returns (float): Predicted class value of the given instance
"""
def predict_single(self, tree, instance):
if not tree:
print("ERROR: Please train the decision tree first")
return -1
if "feature" in tree:
if instance[tree["feature"]] < tree["value"]:
return self.predict_single(tree["left"], instance)
else:
return self.predict_single(tree["right"], instance)
else:
return tree["class_value"]
"""
This function outputs the class value for each instance of the given dataset.
Parameters:
X (np.array): Dataset with labels
Returns y (np.array): array with the predicted class values of the dataset
"""
def predict(self, X):
y_predict = []
for row in X:
y_predict.append(self.predict_single(self.final_tree, row))
return np.array(y_predict)
if __name__ == "__main__":
# # test dataset
# X = np.array([[1, 1,0], [3, 1, 0], [1, 4, 0], [2, 4, 1], [3, 3, 1], [5, 1, 1]])
# y = np.array([0, 0, 0, 1, 1, 1])
train_data = np.loadtxt("example_data/data.txt", delimiter=",")
train_y = np.loadtxt("example_data/targets.txt")
# Build tree
dt = DecisionTree(5, 1)
tree = dt.train(train_data)
y_pred = dt.predict(train_data)
print(f"Accuracy: {sum(y_pred == train_y) / train_y.shape[0]}")
# Print out the decision tree
# dt.print_dt(tree)

View File

@@ -0,0 +1,90 @@
1.1107, -2.1079, 1
-0.5498, 0.0943, 1
-0.0382, 1.8829,1
0.0555, -0.6139,1
0.5870, -1.2067,1
0.5453, 0.2509,1
-0.3927, -0.6220,1
-1.1905, -1.8785,1
-0.4240, 0.7772,1
-0.7139, 1.5846,1
-0.8883, 2.1408,1
-0.6922, 0.0993,1
1.4350, 1.2334,1
-0.7576, 0.7386,1
-1.1144, -1.7059,1
0.6612, -1.7296,1
-2.1381, -0.0600,1
1.3857, 1.2178,1
-1.4951, 0.0373,1
0.8029, 0.9739,1
1.5607, 1.5862,1
0.8563, -1.4245,1
0.0397, -1.3799,1
1.2331, 1.7421,1
-2.0015, 0.8355,1
-0.3428, -0.4780,1
-0.8891, 1.2634,1
0.3832, -0.1189,1
0.4172, 1.0132,1
-0.8695, -0.7947,1
2.9737, 3.6438,2
3.7680, 1.8649,2
0.1166, 0.9435,2
0.6896, 3.9160,2
1.2234, 2.9899,2
2.3009, 0.4150,2
3.7693, 3.8027,2
1.9450, 3.4208,2
0.9290, 3.3611,2
5.0027, 2.7870,2
1.0101, 1.8737,2
2.0751, 2.2628,2
1.9113, 3.6777,2
2.3127, 3.9130,2
1.9392, 2.3976,2
3.1218, 2.5495,2
1.7032, 1.1509,2
0.4212, 3.5322,2
2.7686, 0.9402,2
2.1696, 2.9285,2
0.3380, 2.0947,2
3.6886, 0.4054,2
2.6315, 3.1962,2
-0.5332, 3.1421,2
0.3380, 3.0801,2
1.4030, 1.1841,2
2.8739, 2.7777,2
1.1254, 3.2404,2
0.0988, 1.9522,2
0.3688, 2.8904,2
1.4758, -1.6387,3
1.9289, -1.8191,3
2.5741, -1.3213,3
2.1917, -1.2852,3
0.8358, -2.3349,3
2.6863, -1.8834,3
3.1102, -0.4854,3
3.7073, -0.6466,3
3.6394, -0.4097,3
0.5365, -3.6555,3
2.9295, -0.3819,3
0.8168, -3.1133,3
1.3432, -1.7717,3
1.1039, -2.2261,3
1.3754, -2.2236,3
0.6757, -2.5379,3
-0.2029, -3.8420,3
2.4210, -1.9788,3
1.0335, -2.6042,3
0.9638, -2.9449,3
-0.8198, -5.4449,3
1.9552, -1.5530,3
0.3505, -3.1887,3
2.4943, -1.8116,3
1.9761, -1.0664,3
0.5994, -3.0513,3
2.2076, -1.6728,3
1.9941, -1.8826,3
1.7487, -2.9644,3
1.4160, -2.4234,3

View File

@@ -0,0 +1,90 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3

View File

@@ -0,0 +1,97 @@
"""
From-scratch implementation of K-means clustering, an unsupervised
clustering method that works by iteratively assigning points to the
nearest centroid and moving each centroid to the center of its newly
formed cluster.
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-05-28 Initial coding
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
class KMeansClustering:
def __init__(self, X, num_clusters):
self.K = num_clusters
self.max_iterations = 100
self.plot_figure = True
self.num_examples = X.shape[0]
self.num_features = X.shape[1]
def initialize_random_centroids(self, X):
centroids = np.zeros((self.K, self.num_features))
for k in range(self.K):
centroid = X[np.random.choice(range(self.num_examples))]
centroids[k] = centroid
return centroids
def create_clusters(self, X, centroids):
# Will contain a list of the points that are associated with that specific cluster
clusters = [[] for _ in range(self.K)]
# Loop through each point and check which is the closest cluster
for point_idx, point in enumerate(X):
closest_centroid = np.argmin(
np.sqrt(np.sum((point - centroids) ** 2, axis=1))
)
clusters[closest_centroid].append(point_idx)
return clusters
def calculate_new_centroids(self, clusters, X):
centroids = np.zeros((self.K, self.num_features))
for idx, cluster in enumerate(clusters):
new_centroid = np.mean(X[cluster], axis=0)
centroids[idx] = new_centroid
return centroids
def predict_cluster(self, clusters, X):
y_pred = np.zeros(self.num_examples)
for cluster_idx, cluster in enumerate(clusters):
for sample_idx in cluster:
y_pred[sample_idx] = cluster_idx
return y_pred
def plot_fig(self, X, y):
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.show()
def fit(self, X):
centroids = self.initialize_random_centroids(X)
for it in range(self.max_iterations):
clusters = self.create_clusters(X, centroids)
previous_centroids = centroids
centroids = self.calculate_new_centroids(clusters, X)
diff = centroids - previous_centroids
if not diff.any():
print("Termination criterion satisfied")
break
# Get label predictions
y_pred = self.predict_cluster(clusters, X)
if self.plot_figure:
self.plot_fig(X, y_pred)
return y_pred
if __name__ == "__main__":
np.random.seed(10)
num_clusters = 3
X, _ = make_blobs(n_samples=1000, n_features=2, centers=num_clusters)
Kmeans = KMeansClustering(X, num_clusters)
y_pred = Kmeans.fit(X)
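    # Illustrative follow-up (not part of the original script): report how many
    # points were assigned to each of the K clusters.
    labels, counts = np.unique(y_pred, return_counts=True)
    print(dict(zip(labels.astype(int), counts)))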

Binary file not shown.

View File

@@ -0,0 +1,90 @@
1.1107, -2.1079
-0.5498, 0.0943
-0.0382, 1.8829
0.0555, -0.6139
0.5870, -1.2067
0.5453, 0.2509
-0.3927, -0.6220
-1.1905, -1.8785
-0.4240, 0.7772
-0.7139, 1.5846
-0.8883, 2.1408
-0.6922, 0.0993
1.4350, 1.2334
-0.7576, 0.7386
-1.1144, -1.7059
0.6612, -1.7296
-2.1381, -0.0600
1.3857, 1.2178
-1.4951, 0.0373
0.8029, 0.9739
1.5607, 1.5862
0.8563, -1.4245
0.0397, -1.3799
1.2331, 1.7421
-2.0015, 0.8355
-0.3428, -0.4780
-0.8891, 1.2634
0.3832, -0.1189
0.4172, 1.0132
-0.8695, -0.7947
2.9737, 3.6438
3.7680, 1.8649
0.1166, 0.9435
0.6896, 3.9160
1.2234, 2.9899
2.3009, 0.4150
3.7693, 3.8027
1.9450, 3.4208
0.9290, 3.3611
5.0027, 2.7870
1.0101, 1.8737
2.0751, 2.2628
1.9113, 3.6777
2.3127, 3.9130
1.9392, 2.3976
3.1218, 2.5495
1.7032, 1.1509
0.4212, 3.5322
2.7686, 0.9402
2.1696, 2.9285
0.3380, 2.0947
3.6886, 0.4054
2.6315, 3.1962
-0.5332, 3.1421
0.3380, 3.0801
1.4030, 1.1841
2.8739, 2.7777
1.1254, 3.2404
0.0988, 1.9522
0.3688, 2.8904
1.4758, -1.6387
1.9289, -1.8191
2.5741, -1.3213
2.1917, -1.2852
0.8358, -2.3349
2.6863, -1.8834
3.1102, -0.4854
3.7073, -0.6466
3.6394, -0.4097
0.5365, -3.6555
2.9295, -0.3819
0.8168, -3.1133
1.3432, -1.7717
1.1039, -2.2261
1.3754, -2.2236
0.6757, -2.5379
-0.2029, -3.8420
2.4210, -1.9788
1.0335, -2.6042
0.9638, -2.9449
-0.8198, -5.4449
1.9552, -1.5530
0.3505, -3.1887
2.4943, -1.8116
1.9761, -1.0664
0.5994, -3.0513
2.2076, -1.6728
1.9941, -1.8826
1.7487, -2.9644
1.4160, -2.4234

View File

@@ -0,0 +1,90 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3

110
ML/algorithms/knn/knn.py Normal file
View File

@@ -0,0 +1,110 @@
"""
Implementation of K-nearest neighbor (KNN) from scratch,
where you can use either a two-loop (inefficient), one-loop (better),
or heavily vectorized zero-loop implementation.
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-24 Initial coding
"""
import numpy as np
class KNearestNeighbor:
def __init__(self, k):
self.k = k
self.eps = 1e-8
def train(self, X, y):
self.X_train = X
self.y_train = y
def predict(self, X_test, num_loops=0):
if num_loops == 0:
distances = self.compute_distance_vectorized(X_test)
elif num_loops == 1:
distances = self.compute_distance_one_loop(X_test)
else:
distances = self.compute_distance_two_loops(X_test)
return self.predict_labels(distances)
def compute_distance_two_loops(self, X_test):
"""
Inefficient naive implementation, use only
as a way of understanding what kNN is doing
"""
num_test = X_test.shape[0]
num_train = self.X_train.shape[0]
distances = np.zeros((num_test, num_train))
for i in range(num_test):
for j in range(num_train):
# (Taking sqrt is not necessary: min distance won't change since sqrt is monotone)
distances[i, j] = np.sqrt(
self.eps + np.sum((X_test[i, :] - self.X_train[j, :]) ** 2)
)
return distances
def compute_distance_one_loop(self, X_test):
"""
Much better than two-loops but not as fast as fully vectorized version.
Utilize Numpy broadcasting in X_train - X_test[i,:]
"""
num_test = X_test.shape[0]
num_train = self.X_train.shape[0]
distances = np.zeros((num_test, num_train))
for i in range(num_test):
# (Taking sqrt is not necessary: min distance won't change since sqrt is monotone)
distances[i, :] = np.sqrt(
self.eps + np.sum((self.X_train - X_test[i, :]) ** 2, axis=1)
)
return distances
def compute_distance_vectorized(self, X_test):
"""
        This can be tricky to understand; we utilize heavy
        vectorization as well as numpy broadcasting.
        Idea: for two vectors a, b (two examples) we can
        compute (a-b)^2 = a^2 - 2*(a . b) + b^2. Expanding on this
        and doing it for every pair of vectors leads to the
        heavily vectorized formula for all examples at the same time.
"""
X_test_squared = np.sum(X_test ** 2, axis=1, keepdims=True)
X_train_squared = np.sum(self.X_train ** 2, axis=1, keepdims=True)
two_X_test_X_train = np.dot(X_test, self.X_train.T)
# (Taking sqrt is not necessary: min distance won't change since sqrt is monotone)
return np.sqrt(
self.eps + X_test_squared - 2 * two_X_test_X_train + X_train_squared.T
)
def predict_labels(self, distances):
num_test = distances.shape[0]
y_pred = np.zeros(num_test)
for i in range(num_test):
y_indices = np.argsort(distances[i, :])
k_closest_classes = self.y_train[y_indices[: self.k]].astype(int)
y_pred[i] = np.argmax(np.bincount(k_closest_classes))
return y_pred
if __name__ == "__main__":
X = np.loadtxt("example_data/data.txt", delimiter=",")
y = np.loadtxt("example_data/targets.txt")
X = np.array([[1, 1], [3, 1], [1, 4], [2, 4], [3, 3], [5, 1]])
y = np.array([0, 0, 0, 1, 1, 1])
KNN = KNearestNeighbor(k=1)
KNN.train(X, y)
y_pred = KNN.predict(X, num_loops=0)
print(f"Accuracy: {sum(y_pred == y) / y.shape[0]}")

View File

@@ -0,0 +1,62 @@
"""
Implementation of Linear Regression using Gradient Descent.
Let m = #training examples and n = #features. Sizes differ
a little from the blog notation. It takes as input the following:
y is R^(1 x m), X is R^(n x m), w is R^(n x 1)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-03 Initial coding
* 2020-04-25 Updated comments, and small changes in code
"""
import numpy as np
class LinearRegression:
def __init__(self, print_cost=False):
self.learning_rate = 0.01
self.total_iterations = 1000
self.print_cost = print_cost
def y_hat(self, X, w):
return np.dot(w.T, X)
def cost(self, yhat, y):
C = 1 / self.m * np.sum(np.power(yhat - y, 2))
return C
def gradient_descent(self, w, X, y, yhat):
dCdW = 2 / self.m * np.dot(X, (yhat - y).T)
w = w - self.learning_rate * dCdW
return w
def main(self, X, y):
        # Add a row of ones to X for the intercept term
ones = np.ones((1, X.shape[1]))
X = np.append(ones, X, axis=0)
self.m = X.shape[1]
self.n = X.shape[0]
w = np.zeros((self.n, 1))
for it in range(self.total_iterations + 1):
yhat = self.y_hat(X, w)
cost = self.cost(yhat, y)
if it % 2000 == 0 and self.print_cost:
print(f"Cost at iteration {it} is {cost}")
w = self.gradient_descent(w, X, y, yhat)
return w
if __name__ == "__main__":
X = np.random.rand(1, 500)
y = 3 * X + 5 + np.random.randn(1, 500) * 0.1
regression = LinearRegression()
w = regression.main(X, y)
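    # Illustrative follow-up (not part of the original script): with the synthetic
    # data above (y = 3x + 5 + noise), the learned weights should drift toward an
    # intercept near 5 and a slope near 3, given enough iterations.
    print(f"Learned intercept: {w[0, 0]:.3f}, learned slope: {w[1, 0]:.3f}")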

View File

@@ -0,0 +1,28 @@
"""
Implementation of Linear Regression using the Normal Equation.
Let m = #training examples and n = #features; the
input shapes are y in R^(m x 1), X in R^(m x n), w in R^(n x 1).
Using these shapes, the normal equation implementation is
exactly as the derived formula :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-25 Initial coding
"""
import numpy as np
def linear_regression_normal_equation(X, y):
ones = np.ones((X.shape[0], 1))
X = np.append(ones, X, axis=1)
W = np.dot(np.linalg.pinv(np.dot(X.T, X)), np.dot(X.T, y))
return W
if __name__ == "__main__":
# Run a small test example: y = 5x (approximately)
m, n = 500, 1
X = np.random.rand(m, n)
y = 5 * X + np.random.randn(m, n) * 0.1
W = linear_regression_normal_equation(X, y)

View File

@@ -0,0 +1,70 @@
"""
From scratch implementation of Logistic Regression
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-05-24 Initial coding
"""
import numpy as np
from sklearn.datasets import make_blobs
class LogisticRegression:
def __init__(self, X, learning_rate=0.1, num_iters=10000):
self.lr = learning_rate
self.num_iters = num_iters
# m for #training_examples, n for #features
self.m, self.n = X.shape
def train(self, X, y):
# init weights
self.weights = np.zeros((self.n, 1))
self.bias = 0
for it in range(self.num_iters + 1):
# calculate hypothesis
y_predict = self.sigmoid(np.dot(X, self.weights) + self.bias)
# calculate cost
cost = (
-1
/ self.m
* np.sum(y * np.log(y_predict) + (1 - y) * np.log(1 - y_predict))
)
# back prop / gradient calculations
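            # For reference: with the binary cross-entropy loss, dJ/dw = (1/m) * X^T (y_hat - y)
            # and dJ/db = (1/m) * sum(y_hat - y), which is what the two lines below compute.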
dw = 1 / self.m * np.dot(X.T, (y_predict - y))
db = 1 / self.m * np.sum(y_predict - y)
# gradient descent update step
self.weights -= self.lr * dw
self.bias -= self.lr * db
# print cost sometimes
if it % 1000 == 0:
print(f"Cost after iteration {it}: {cost}")
return self.weights, self.bias
def predict(self, X):
y_predict = self.sigmoid(np.dot(X, self.weights) + self.bias)
y_predict_labels = y_predict > 0.5
return y_predict_labels
def sigmoid(self, z):
return 1 / (1 + np.exp(-z))
if __name__ == "__main__":
np.random.seed(1)
X, y = make_blobs(n_samples=1000, centers=2)
y = y[:, np.newaxis]
logreg = LogisticRegression(X)
w, b = logreg.train(X, y)
y_predict = logreg.predict(X)
print(f"Accuracy: {np.sum(y==y_predict)/X.shape[0]}")

View File

@@ -0,0 +1,90 @@
1.1107, -2.1079
-0.5498, 0.0943
-0.0382, 1.8829
0.0555, -0.6139
0.5870, -1.2067
0.5453, 0.2509
-0.3927, -0.6220
-1.1905, -1.8785
-0.4240, 0.7772
-0.7139, 1.5846
-0.8883, 2.1408
-0.6922, 0.0993
1.4350, 1.2334
-0.7576, 0.7386
-1.1144, -1.7059
0.6612, -1.7296
-2.1381, -0.0600
1.3857, 1.2178
-1.4951, 0.0373
0.8029, 0.9739
1.5607, 1.5862
0.8563, -1.4245
0.0397, -1.3799
1.2331, 1.7421
-2.0015, 0.8355
-0.3428, -0.4780
-0.8891, 1.2634
0.3832, -0.1189
0.4172, 1.0132
-0.8695, -0.7947
2.9737, 3.6438
3.7680, 1.8649
0.1166, 0.9435
0.6896, 3.9160
1.2234, 2.9899
2.3009, 0.4150
3.7693, 3.8027
1.9450, 3.4208
0.9290, 3.3611
5.0027, 2.7870
1.0101, 1.8737
2.0751, 2.2628
1.9113, 3.6777
2.3127, 3.9130
1.9392, 2.3976
3.1218, 2.5495
1.7032, 1.1509
0.4212, 3.5322
2.7686, 0.9402
2.1696, 2.9285
0.3380, 2.0947
3.6886, 0.4054
2.6315, 3.1962
-0.5332, 3.1421
0.3380, 3.0801
1.4030, 1.1841
2.8739, 2.7777
1.1254, 3.2404
0.0988, 1.9522
0.3688, 2.8904
1.4758, -1.6387
1.9289, -1.8191
2.5741, -1.3213
2.1917, -1.2852
0.8358, -2.3349
2.6863, -1.8834
3.1102, -0.4854
3.7073, -0.6466
3.6394, -0.4097
0.5365, -3.6555
2.9295, -0.3819
0.8168, -3.1133
1.3432, -1.7717
1.1039, -2.2261
1.3754, -2.2236
0.6757, -2.5379
-0.2029, -3.8420
2.4210, -1.9788
1.0335, -2.6042
0.9638, -2.9449
-0.8198, -5.4449
1.9552, -1.5530
0.3505, -3.1887
2.4943, -1.8116
1.9761, -1.0664
0.5994, -3.0513
2.2076, -1.6728
1.9941, -1.8826
1.7487, -2.9644
1.4160, -2.4234

View File

@@ -0,0 +1,90 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3

View File

@@ -0,0 +1,67 @@
"""
Naive Bayes Classifier Implementation from scratch
To run the code, structure your data in the following way:
X be size: (num_training_examples, num_features)
y be size: (num_training_examples, )
Where the classes are 0, 1, 2, etc. Then an example run looks like:
NB = NaiveBayes(X, y)
NB.fit(X)
predictions = NB.predict(X)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-21 Initial coding
"""
import numpy as np
class NaiveBayes:
def __init__(self, X, y):
        self.num_examples, self.num_features = X.shape
        self.num_classes = len(np.unique(y))
        # store the labels so that fit() does not rely on a global variable
        self.y = y
        self.eps = 1e-6
def fit(self, X):
self.classes_mean = {}
self.classes_variance = {}
self.classes_prior = {}
for c in range(self.num_classes):
            X_c = X[self.y == c]
self.classes_mean[str(c)] = np.mean(X_c, axis=0)
self.classes_variance[str(c)] = np.var(X_c, axis=0)
self.classes_prior[str(c)] = X_c.shape[0] / X.shape[0]
def predict(self, X):
probs = np.zeros((self.num_examples, self.num_classes))
for c in range(self.num_classes):
prior = self.classes_prior[str(c)]
probs_c = self.density_function(
X, self.classes_mean[str(c)], self.classes_variance[str(c)]
)
probs[:, c] = probs_c + np.log(prior)
return np.argmax(probs, 1)
def density_function(self, x, mean, sigma):
        # Calculate the log of the Gaussian density function
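        # For a single feature with variance s: log N(x; mu, s) = -0.5*log(2*pi*s) - (x - mu)^2 / (2*s);
        # the terms below sum this over all features, with eps added for numerical stability.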
const = -self.num_features / 2 * np.log(2 * np.pi) - 0.5 * np.sum(
np.log(sigma + self.eps)
)
probs = 0.5 * np.sum(np.power(x - mean, 2) / (sigma + self.eps), 1)
return const - probs
if __name__ == "__main__":
X = np.loadtxt("example_data/data.txt", delimiter=",")
y = np.loadtxt("example_data/targets.txt") - 1
NB = NaiveBayes(X, y)
NB.fit(X)
y_pred = NB.predict(X)
print(f"Accuracy: {sum(y_pred==y)/X.shape[0]}")

View File

@@ -0,0 +1,174 @@
"""
Simple two-layered Neural Network from scratch implementation.
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-28 Initial coding
"""
import numpy as np
from utils import create_dataset, plot_contour
class NeuralNetwork:
def __init__(self, X, y):
# m for #training examples and n for #features
self.m, self.n = X.shape
# regularization term lambd (lambda is reserved keyword)
self.lambd = 1e-3
self.learning_rate = 0.1
        # Define the sizes of the hidden layer and of the output layer
self.h1 = 25
self.h2 = len(np.unique(y))
def init_kaiming_weights(self, l0, l1):
        # Kaiming/He initialization: scale weights by sqrt(2 / fan_in), well suited for ReLU
w = np.random.randn(l0, l1) * np.sqrt(2.0 / l0)
b = np.zeros((1, l1))
return w, b
def forward_prop(self, X, parameters):
W2 = parameters["W2"]
W1 = parameters["W1"]
b2 = parameters["b2"]
b1 = parameters["b1"]
# forward prop
a0 = X
z1 = np.dot(a0, W1) + b1
# apply nonlinearity (relu)
a1 = np.maximum(0, z1)
z2 = np.dot(a1, W2) + b2
# softmax on the last layer
scores = z2
exp_scores = np.exp(scores)
probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
# cache values from forward pass to use for backward pass
cache = {"a0": X, "probs": probs, "a1": a1}
return cache, probs
def compute_cost(self, y, probs, parameters):
W2 = parameters["W2"]
W1 = parameters["W1"]
y = y.astype(int)
data_loss = np.sum(-np.log(probs[np.arange(self.m), y]) / self.m)
reg_loss = 0.5 * self.lambd * np.sum(W1 * W1) + 0.5 * self.lambd * np.sum(
W2 * W2
)
# total cost J
total_cost = data_loss + reg_loss
return total_cost
def back_prop(self, cache, parameters, y):
# Unpack from parameters
W2 = parameters["W2"]
W1 = parameters["W1"]
b2 = parameters["b2"]
b1 = parameters["b1"]
# Unpack from forward prop
a0 = cache["a0"]
a1 = cache["a1"]
probs = cache["probs"]
dz2 = probs
dz2[np.arange(self.m), y] -= 1
dz2 /= self.m
# backprop through values dW2 and db2
dW2 = np.dot(a1.T, dz2) + self.lambd * W2
db2 = np.sum(dz2, axis=0, keepdims=True)
# Back to the (only) hidden layer in this case
dz1 = np.dot(dz2, W2.T)
dz1 = dz1 * (a1 > 0)
# backprop through values dW1, db1
dW1 = np.dot(a0.T, dz1) + self.lambd * W1
db1 = np.sum(dz1, axis=0, keepdims=True)
grads = {"dW1": dW1, "dW2": dW2, "db1": db1, "db2": db2}
return grads
def update_parameters(self, parameters, grads):
learning_rate = self.learning_rate
W2 = parameters["W2"]
W1 = parameters["W1"]
b2 = parameters["b2"]
b1 = parameters["b1"]
dW2 = grads["dW2"]
dW1 = grads["dW1"]
db2 = grads["db2"]
db1 = grads["db1"]
# Do gradient descent step
W2 -= learning_rate * dW2
W1 -= learning_rate * dW1
b2 -= learning_rate * db2
b1 -= learning_rate * db1
# store back weights in parameters
parameters = {"W1": W1, "W2": W2, "b1": b1, "b2": b2}
return parameters
def main(self, X, y, num_iter=10000):
# initialize our weights
W1, b1 = self.init_kaiming_weights(self.n, self.h1)
W2, b2 = self.init_kaiming_weights(self.h1, self.h2)
# pack parameters into a dictionary
parameters = {"W1": W1, "W2": W2, "b1": b1, "b2": b2}
# How many gradient descent updates we want to do
for it in range(num_iter + 1):
# forward prop
cache, probs = self.forward_prop(X, parameters)
# calculate cost
cost = self.compute_cost(y, probs, parameters)
# print cost sometimes
if it % 2500 == 0:
print(f"At iteration {it} we have a cost of {cost}")
# back prop
grads = self.back_prop(cache, parameters, y)
# update parameters
parameters = self.update_parameters(parameters, grads)
return parameters
if __name__ == "__main__":
# Generate dataset
X, y = create_dataset(300, K=3)
y = y.astype(int)
# Train network
NN = NeuralNetwork(X, y)
trained_parameters = NN.main(X, y)
# Get trained parameters
W2 = trained_parameters["W2"]
W1 = trained_parameters["W1"]
b2 = trained_parameters["b2"]
b1 = trained_parameters["b1"]
# Plot the decision boundary (for nice visualization)
plot_contour(X, y, NN, trained_parameters)

View File

@@ -0,0 +1,50 @@
"""
These were (shamelessly) taken from cs231n course github code.
I believe these were coded by Andrej Karpathy so credit goes to him
for coding these.
"""
import numpy as np
import matplotlib.pyplot as plt
def create_dataset(N, K=2):
    # N is the number of points per class, D the dimensionality
    D = 2
X = np.zeros((N * K, D)) # data matrix (each row = single example)
y = np.zeros(N * K) # class labels
for j in range(K):
ix = range(N * j, N * (j + 1))
r = np.linspace(0, 1, N) # radius
t = np.linspace(j * 4, (j + 1) * 4, N) + np.random.randn(N) * 0.2
X[ix] = np.c_[r * np.sin(t), r * np.cos(t)]
y[ix] = j
    # let's visualize the data:
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.show()
return X, y
def plot_contour(X, y, model, parameters):
# plot the resulting classifier
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
points = np.c_[xx.ravel(), yy.ravel()]
# forward prop with our trained parameters
_, Z = model.forward_prop(points, parameters)
# classify into highest prob
Z = np.argmax(Z, axis=1)
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral, alpha=0.8)
    # plot the points
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
# fig.savefig('spiral_net.png')

View File

@@ -0,0 +1,90 @@
1.1107, -2.1079, 1
-0.5498, 0.0943, 1
-0.0382, 1.8829,1
0.0555, -0.6139,1
0.5870, -1.2067,1
0.5453, 0.2509,1
-0.3927, -0.6220,1
-1.1905, -1.8785,1
-0.4240, 0.7772,1
-0.7139, 1.5846,1
-0.8883, 2.1408,1
-0.6922, 0.0993,1
1.4350, 1.2334,1
-0.7576, 0.7386,1
-1.1144, -1.7059,1
0.6612, -1.7296,1
-2.1381, -0.0600,1
1.3857, 1.2178,1
-1.4951, 0.0373,1
0.8029, 0.9739,1
1.5607, 1.5862,1
0.8563, -1.4245,1
0.0397, -1.3799,1
1.2331, 1.7421,1
-2.0015, 0.8355,1
-0.3428, -0.4780,1
-0.8891, 1.2634,1
0.3832, -0.1189,1
0.4172, 1.0132,1
-0.8695, -0.7947,1
2.9737, 3.6438,2
3.7680, 1.8649,2
0.1166, 0.9435,2
0.6896, 3.9160,2
1.2234, 2.9899,2
2.3009, 0.4150,2
3.7693, 3.8027,2
1.9450, 3.4208,2
0.9290, 3.3611,2
5.0027, 2.7870,2
1.0101, 1.8737,2
2.0751, 2.2628,2
1.9113, 3.6777,2
2.3127, 3.9130,2
1.9392, 2.3976,2
3.1218, 2.5495,2
1.7032, 1.1509,2
0.4212, 3.5322,2
2.7686, 0.9402,2
2.1696, 2.9285,2
0.3380, 2.0947,2
3.6886, 0.4054,2
2.6315, 3.1962,2
-0.5332, 3.1421,2
0.3380, 3.0801,2
1.4030, 1.1841,2
2.8739, 2.7777,2
1.1254, 3.2404,2
0.0988, 1.9522,2
0.3688, 2.8904,2
1.4758, -1.6387,3
1.9289, -1.8191,3
2.5741, -1.3213,3
2.1917, -1.2852,3
0.8358, -2.3349,3
2.6863, -1.8834,3
3.1102, -0.4854,3
3.7073, -0.6466,3
3.6394, -0.4097,3
0.5365, -3.6555,3
2.9295, -0.3819,3
0.8168, -3.1133,3
1.3432, -1.7717,3
1.1039, -2.2261,3
1.3754, -2.2236,3
0.6757, -2.5379,3
-0.2029, -3.8420,3
2.4210, -1.9788,3
1.0335, -2.6042,3
0.9638, -2.9449,3
-0.8198, -5.4449,3
1.9552, -1.5530,3
0.3505, -3.1887,3
2.4943, -1.8116,3
1.9761, -1.0664,3
0.5994, -3.0513,3
2.2076, -1.6728,3
1.9941, -1.8826,3
1.7487, -2.9644,3
1.4160, -2.4234,3

View File

@@ -0,0 +1,100 @@
701,478,227,863,963,2
96,147,210,493,586,2
798,143,431,541,94,1
233,146,667,886,771,1
668,815,628,429,387,3
718,456,883,281,840,1
182,837,144,664,460,2
882,533,203,776,56,3
648,715,288,619,293,1
178,951,965,164,1,3
270,432,457,978,794,1
335,219,596,763,231,1
47,477,78,423,616,3
324,969,514,55,722,2
824,571,159,516,594,2
837,667,957,150,508,3
833,945,311,12,859,1
536,280,21,292,518,1
943,55,709,269,425,1
593,178,861,130,26,3
54,165,3,638,816,2
637,861,423,855,98,1
222,502,427,944,732,1
8,465,403,376,761,2
184,602,673,825,741,1
639,677,204,385,236,2
176,843,479,952,898,2
125,626,553,74,1000,3
302,495,294,362,169,2
131,912,803,232,852,1
117,609,290,133,357,2
207,812,788,182,494,1
954,76,257,620,844,1
287,266,195,30,344,3
440,590,324,868,969,3
831,290,228,586,971,1
567,734,460,429,689,1
864,939,191,620,431,1
905,337,200,400,77,2
304,997,141,208,615,3
19,280,187,44,639,1
280,279,275,305,123,1
866,519,331,241,972,1
27,77,860,458,643,3
486,713,917,324,855,2
466,16,897,222,731,1
712,230,215,805,341,1
300,100,292,978,115,3
938,800,911,345,49,3
98,593,43,583,684,1
348,479,406,605,595,2
892,877,592,339,615,3
203,53,995,704,927,2
991,968,886,43,883,1
733,939,71,388,56,1
249,376,830,628,812,2
4,877,743,242,266,1
95,537,106,490,518,2
870,704,430,270,327,2
402,97,283,569,638,3
537,979,966,729,8,3
399,51,285,973,509,1
662,951,947,923,112,3
71,573,9,305,351,2
240,837,836,277,177,1
513,318,709,435,367,2
553,253,980,868,26,1
848,543,171,420,73,1
449,538,720,347,500,2
42,319,830,447,727,2
165,968,151,672,452,3
1,781,142,137,157,2
907,364,776,490,502,2
146,512,87,344,233,3
478,62,55,815,283,3
751,789,112,277,483,1
189,597,866,73,397,3
607,210,327,538,68,2
337,401,557,667,642,1
249,894,84,81,643,1
896,858,568,345,157,1
362,886,558,531,735,1
865,418,866,824,370,3
14,517,514,257,129,2
845,833,998,211,684,2
289,302,416,364,920,2
383,173,991,815,368,3
652,325,903,471,224,3
757,580,974,667,620,1
946,247,684,191,332,2
63,330,199,280,608,2
752,298,95,143,134,2
987,105,747,931,413,3
510,23,385,711,701,1
326,195,651,727,85,3
214,978,396,428,14,1
646,133,388,896,971,1
849,817,294,491,397,2
854,973,274,315,897,3
666,530,683,234,439,1

View File

@@ -0,0 +1,90 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3

View File

@@ -0,0 +1,170 @@
"""
Author: Philip Andreadis
e-mail: philip_andreadis@hotmail.com
Implementation of a Random Forest model from scratch.
The DecisionTree class from this project is used to generate the trees of the random forest.
The DecisionTree class itself is unchanged; the dataset is split into a number of folds, each with a random subset of features, and each tree is trained on one fold.
As a result each tree is trained on a different subset of the dataset, in order to reduce correlation between the trees.
The predicted class value of each instance is chosen by majority vote over the individual trees' predictions.
Parameters of the model:
MAX_DEPTH (int): Maximum depth of the decision tree
MIN_NODE (int): Minimum number of instances a node can have. If a node has this many instances or fewer, it becomes a terminal node
FOLD_SIZE (int): Value between 1 and 10; each fold's size is FOLD_SIZE x 10% of the original dataset (e.g. 3 means 30%).
N_TREES (int): The total number of trees that will be trained.
Input dataset to train() function must be a numpy array containing both feature and label values.
"""
from random import randrange
from random import randint
import numpy as np
from decision_tree import DecisionTree
# fold size in tenths of the dataset size, e.g. 3 means 30%
FOLD_SIZE = 10
# number of trees
N_TREES = 20
# max tree depth
MAX_DEPTH = 30
# min size of tree node
MIN_NODE = 1
class RandomForest:
def __init__(self,n_trees,fold_size):
self.n_trees = n_trees
self.fold_size = fold_size
self.trees = list()
"""
    This function splits the given dataset into n folds, sampling with replacement. The number of folds is equal to the number of trees that will be trained.
    Each tree takes one fold as input. The size of each fold is a percentage (p) of the size of the original dataset.
    Parameters:
    dataset: np array of the given dataset
    n_folds (int): number of folds into which the dataset should be split. Must be equal to the number of trees the user wants to train
    p (int): fold size expressed in tenths of the dataset size (e.g. 3 means 30%)
Returns list of np arrays: list with the k-folds
"""
def cross_validation_split(self,dataset, n_folds, p):
dataset_split = list()
fold_size = int(len(dataset)*p/10)
for i in range(n_folds):
fold = list()
while len(fold) < fold_size:
index = randrange(len(dataset))
fold.append(dataset[index])
            fold_arr = np.array(fold)
            dataset_split.append(fold_arr)
return dataset_split
"""
This function randomizes the selection of the features each tree will be trained on.
Parameters:
splits list of np arrays: list of folds
Returns list of np arrays: list with the k-folds with some features randomly removed
"""
def randomize_features(self,splits):
dataset_split = list()
l = len(splits[0][0])
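        # n_features here is the number of feature columns to randomly remove from
        # each fold (roughly half of them, never touching the label column)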
n_features = int((l-1)*5/10)
for split in splits:
for i in range(n_features):
rng = list(range(len(split[0]) - 1))
selected = rng.pop(randint(0,len(rng)-1))
split = np.delete(split, selected, 1)
            split_arr = np.array(split)
            dataset_split.append(split_arr)
return dataset_split
"""
Prints out all the decision trees of the random forest.
BUG: The feature number is not representative of its initial enumeration in the original dataset due to the randomization.
This means that we do not know on which features each tree is trained on.
"""
def print_trees(self):
i = 1
for t in self.trees:
print("Tree#",i)
temp = t.final_tree
t.print_dt(temp)
print("\n")
i = i+1
"""
Iteratively train each decision tree.
Parameters:
X (np.array): Training data
"""
def train(self,X):
train_x = self.cross_validation_split(X,self.n_trees,self.fold_size)
train_x = self.randomize_features(train_x)
for fold in train_x:
dt = DecisionTree(MAX_DEPTH, MIN_NODE)
dt.train(fold)
self.trees.append(dt)
"""
This function outputs the class value for each instance of the given dataset as predicted by the random forest algorithm.
Parameters:
X (np.array): Dataset with labels
Returns y (np.array): array with the predicted class values of the dataset
"""
def predict(self,X):
predicts = list()
final_predicts = list()
for tree in self.trees:
predicts.append(tree.predict(X))
# iterate through each tree's class prediction and find the most frequent for each instance
for i in range(len(predicts[0])):
values = list()
for j in range(len(predicts)):
values.append(predicts[j][i])
final_predicts.append(max(set(values), key=values.count))
return final_predicts,predicts
if __name__ == "__main__":
# Training data
train_data = np.loadtxt("example_data/data.txt", delimiter=",")
train_y = np.loadtxt("example_data/targets.txt")
mock_train = np.loadtxt("example_data/mock_data.csv", delimiter=",")
mock_y = mock_train[ : , -1]
# Build and train model
rf = RandomForest(N_TREES,FOLD_SIZE)
rf.train(mock_train)
# Evaluate model on training data
y_pred,y_pred_ind = rf.predict(mock_train)
print(f"Accuracy of random forest: {sum(y_pred == mock_y) / mock_y.shape[0]}")
print("\nAccuracy for each individual tree:")
c = 1
for i in y_pred_ind:
print("\nTree",c)
print(f"Accuracy: {sum(i == mock_y) / mock_y.shape[0]}")
c = c+1

Binary file not shown.

Binary file not shown.

96
ML/algorithms/svm/svm.py Normal file
View File

@@ -0,0 +1,96 @@
"""
Implementation of SVM using cvxopt package. Implementation uses
soft margin and I've defined linear, polynomial and gaussian kernels.
To understand the theory (which is a bit challenging) I recommend reading the following:
http://cs229.stanford.edu/notes/cs229-notes3.pdf
https://www.youtube.com/playlist?list=PLoROMvodv4rMiGQp3WXShtMGgzqpfVfbU (Lectures 6,7 by Andrew Ng)
To understand how to reformulate the optimization problem we obtain
to get the input to cvxopt QP solver this blogpost can be useful:
https://xavierbourretsicotte.github.io/SVM_implementation.html
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-26 Initial coding
"""
import numpy as np
import cvxopt
from utils import create_dataset, plot_contour
def linear(x, z):
return np.dot(x, z.T)
def polynomial(x, z, p=5):
return (1 + np.dot(x, z.T)) ** p
def gaussian(x, z, sigma=0.1):
return np.exp(-np.linalg.norm(x - z, axis=1) ** 2 / (2 * (sigma ** 2)))
class SVM:
def __init__(self, kernel=gaussian, C=1):
self.kernel = kernel
self.C = C
def fit(self, X, y):
self.y = y
self.X = X
m, n = X.shape
# Calculate Kernel
self.K = np.zeros((m, m))
for i in range(m):
self.K[i, :] = self.kernel(X[i, np.newaxis], self.X)
        # Solve with cvxopt: the dual QP needs to be reformulated
        # to match the input form expected by cvxopt.solvers.qp
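        # For reference, the dual problem solved here is
        #   min_alpha  (1/2) * alpha^T (y y^T * K) alpha - 1^T alpha
        #   s.t.  0 <= alpha_i <= C  and  y^T alpha = 0,
        # mapped onto cvxopt's standard form min (1/2) x^T P x + q^T x, G x <= h, A x = b.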
P = cvxopt.matrix(np.outer(y, y) * self.K)
q = cvxopt.matrix(-np.ones((m, 1)))
G = cvxopt.matrix(np.vstack((np.eye(m) * -1, np.eye(m))))
h = cvxopt.matrix(np.hstack((np.zeros(m), np.ones(m) * self.C)))
A = cvxopt.matrix(y, (1, m), "d")
b = cvxopt.matrix(np.zeros(1))
cvxopt.solvers.options["show_progress"] = False
sol = cvxopt.solvers.qp(P, q, G, h, A, b)
self.alphas = np.array(sol["x"])
def predict(self, X):
y_predict = np.zeros((X.shape[0]))
sv = self.get_parameters(self.alphas)
for i in range(X.shape[0]):
y_predict[i] = np.sum(
self.alphas[sv]
* self.y[sv, np.newaxis]
* self.kernel(X[i], self.X[sv])[:, np.newaxis]
)
return np.sign(y_predict + self.b)
def get_parameters(self, alphas):
threshold = 1e-5
sv = ((alphas > threshold) * (alphas < self.C)).flatten()
        self.w = np.dot(self.X[sv].T, alphas[sv] * self.y[sv, np.newaxis])
self.b = np.mean(
self.y[sv, np.newaxis]
- self.alphas[sv] * self.y[sv, np.newaxis] * self.K[sv, sv][:, np.newaxis]
)
return sv
if __name__ == "__main__":
np.random.seed(1)
X, y = create_dataset(N=50)
svm = SVM(kernel=gaussian)
svm.fit(X, y)
y_pred = svm.predict(X)
plot_contour(X, y, svm)
print(f"Accuracy: {sum(y==y_pred)/y.shape[0]}")

View File

@@ -0,0 +1,47 @@
"""
These were (shamelessly) taken from cs231n course github code.
I believe these were coded by Andrej Karpathy so credit goes to him
for coding these.
"""
import numpy as np
import matplotlib.pyplot as plt
def create_dataset(N, D=2, K=2):
X = np.zeros((N * K, D)) # data matrix (each row = single example)
y = np.zeros(N * K) # class labels
for j in range(K):
ix = range(N * j, N * (j + 1))
r = np.linspace(0.0, 1, N) # radius
t = np.linspace(j * 4, (j + 1) * 4, N) + np.random.randn(N) * 0.2 # theta
X[ix] = np.c_[r * np.sin(t), r * np.cos(t)]
y[ix] = j
    # let's visualize the data:
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.show()
y[y == 0] -= 1
return X, y
def plot_contour(X, y, svm):
# plot the resulting classifier
h = 0.01
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
points = np.c_[xx.ravel(), yy.ravel()]
Z = svm.predict(points)
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral, alpha=0.8)
    # plot the points
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.show()