"""
|
||
|
|
Author: Philip Andreadis
|
||
|
|
e-mail: philip_andreadis@hotmail.com
|
||
|
|
|
||
|
|
|
||
|
|
Implementation of Random Forest model from scratch.
|
||
|
|
The DecisionTree class from this project is used for generating the trees of the random forest.
|
||
|
|
This class remains with no changes as the dataset is split into a number of folds with a random subset of features on which each tree is trained on.
|
||
|
|
As a result each tree is trained on a different group of the dataset in order to avoid correlation between them.
|
||
|
|
The predicted class value of each instance is chosen by voting from each single tree's outcome.
|
||
|
|
|
||
|
|
Parameters of the model:
|
||
|
|
MAX_DEPTH (int): Maximum depth of the decision tree
|
||
|
|
MIN_NODE (int): Minimum number of instances a node can have. If this threshold is exceeded the node is terminated
|
||
|
|
FOLD_SIZE (int): Value between 1-10 representing the percentage of the original dataset size each fold should be.
|
||
|
|
N_TREES (int):The toral number of trees that will be trained.
|
||
|
|
|
||
|
|
Input dataset to train() function must be a numpy array containing both feature and label values.
|
||
|
|
|
||
|
|
"""

from random import randrange
from random import randint

import numpy as np

from decision_tree import DecisionTree

# fold size (% of dataset size), e.g. 3 means 30%
FOLD_SIZE = 10
# number of trees
N_TREES = 20
# max tree depth
MAX_DEPTH = 30
# min size of a tree node
MIN_NODE = 1


class RandomForest:
    def __init__(self, n_trees, fold_size):
        self.n_trees = n_trees
        self.fold_size = fold_size
        self.trees = list()

    """
    This function splits the given dataset into n folds, sampling rows with replacement (bootstrap sampling).
    The number of folds is equal to the number of trees that will be trained.
    Each tree will have one fold as input. The size of each fold is a percentage (p) of the size of the original dataset.

    Parameters:
    dataset: np array of the given dataset
    n_folds (int): number of folds into which the dataset is split; must equal the number of trees to be trained
    p (int): the percentage of the dataset's size that a single fold's size should be (value 1-10, e.g. 3 means 30%)

    Returns list of np arrays: list with the k folds
    """

    def cross_validation_split(self, dataset, n_folds, p):
        dataset_split = list()
        fold_size = int(len(dataset) * p / 10)
        for _ in range(n_folds):
            fold = list()
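            # sample rows with replacement until the fold reaches its target size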
            while len(fold) < fold_size:
                index = randrange(len(dataset))
                fold.append(dataset[index])
            dataset_split.append(np.array(fold))
        return dataset_split

    """
    This function randomizes the selection of the features each tree will be trained on.

    Parameters:
    splits (list of np arrays): list of folds

    Returns list of np arrays: list with the k folds, each with about half of the feature columns randomly removed
    """

    def randomize_features(self, splits):
        dataset_split = list()
        n_columns = len(splits[0][0])
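        # drop half of the feature columns; the last column holds the labels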
        n_features = int((n_columns - 1) * 5 / 10)
        for split in splits:
            for _ in range(n_features):
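                # choose a random remaining feature column (never the label column) and delete it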
                rng = list(range(len(split[0]) - 1))
                selected = rng.pop(randint(0, len(rng) - 1))
                split = np.delete(split, selected, 1)
            dataset_split.append(np.array(split))
        return dataset_split

    """
    Prints out all the decision trees of the random forest.

    BUG: Because of the feature randomization, the feature numbers printed here do not match
    their original enumeration in the dataset, so we cannot tell which features each tree was trained on.
    """

    def print_trees(self):
        for i, t in enumerate(self.trees, start=1):
            print("Tree#", i)
            t.print_dt(t.final_tree)
            print("\n")

    """
    Iteratively train each decision tree.

    Parameters:
    X (np.array): Training data
    """

    def train(self, X):
        train_x = self.cross_validation_split(X, self.n_trees, self.fold_size)
        train_x = self.randomize_features(train_x)
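        # fit one decision tree on each randomized bootstrap fold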
        for fold in train_x:
            dt = DecisionTree(MAX_DEPTH, MIN_NODE)
            dt.train(fold)
            self.trees.append(dt)

    """
    This function outputs the class value for each instance of the given dataset as predicted by the random forest algorithm.

    Parameters:
    X (np.array): Dataset with labels

    Returns:
    y (np.array): the predicted class values of the dataset
    predicts (list): each individual tree's predictions
    """

    def predict(self, X):
        predicts = list()
        final_predicts = list()
        for tree in self.trees:
            predicts.append(tree.predict(X))
        # iterate through each tree's class prediction and find the most frequent for each instance
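        # e.g. votes [1.0, 0.0, 1.0] from three trees yield the class value 1.0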
        for i in range(len(predicts[0])):
            values = list()
            for j in range(len(predicts)):
                values.append(predicts[j][i])
            final_predicts.append(max(set(values), key=values.count))
        return np.array(final_predicts), predicts


if __name__ == "__main__":
    # Training data
    train_data = np.loadtxt("example_data/data.txt", delimiter=",")
    train_y = np.loadtxt("example_data/targets.txt")

    mock_train = np.loadtxt("example_data/mock_data.csv", delimiter=",")
    mock_y = mock_train[:, -1]

    # Build and train model
    rf = RandomForest(N_TREES, FOLD_SIZE)
    rf.train(mock_train)

    # Evaluate model on training data
    y_pred, y_pred_ind = rf.predict(mock_train)
    print(f"Accuracy of random forest: {sum(y_pred == mock_y) / mock_y.shape[0]}")
    print("\nAccuracy for each individual tree:")
    for c, tree_pred in enumerate(y_pred_ind, start=1):
        print("\nTree", c)
        print(f"Accuracy: {sum(tree_pred == mock_y) / mock_y.shape[0]}")