mirror of
https://github.com/aladdinpersson/Machine-Learning-Collection.git
synced 2026-02-21 11:18:01 +00:00
Initial commit
This commit is contained in:
170
ML/algorithms/randomforest/random_forest.py
Normal file
170
ML/algorithms/randomforest/random_forest.py
Normal file
@@ -0,0 +1,170 @@
|
||||
"""
|
||||
Author: Philip Andreadis
|
||||
e-mail: philip_andreadis@hotmail.com
|
||||
|
||||
|
||||
Implementation of Random Forest model from scratch.
|
||||
The DecisionTree class from this project is used for generating the trees of the random forest.
|
||||
This class remains with no changes as the dataset is split into a number of folds with a random subset of features on which each tree is trained on.
|
||||
As a result each tree is trained on a different group of the dataset in order to avoid correlation between them.
|
||||
The predicted class value of each instance is chosen by voting from each single tree's outcome.
|
||||
|
||||
Parameters of the model:
|
||||
MAX_DEPTH (int): Maximum depth of the decision tree
|
||||
MIN_NODE (int): Minimum number of instances a node can have. If this threshold is exceeded the node is terminated
|
||||
FOLD_SIZE (int): Value between 1-10 giving each fold's size in tenths of the original dataset size (e.g. 3 means 30%).
|
||||
N_TREES (int): The total number of trees that will be trained.
|
||||
|
||||
Input dataset to train() function must be a numpy array containing both feature and label values.
|
||||
|
||||
"""
|
||||
|
||||
|
||||
|
||||
from random import randrange
|
||||
from random import randint
|
||||
import numpy as np
|
||||
from decision_tree import DecisionTree
|
||||
|
||||
# Module-level hyperparameters; consumed by the __main__ driver below.
# Fold size in tenths of the dataset size (10 -> 100%, e.g. 3 means 30%).
FOLD_SIZE = 10
# Number of decision trees in the forest (one bootstrap fold is drawn per tree).
N_TREES = 20
# Maximum depth passed to each DecisionTree.
MAX_DEPTH = 30
# Minimum number of instances a DecisionTree node may hold before it is
# terminated (forwarded verbatim to DecisionTree).
MIN_NODE = 1
||||
class RandomForest:
    """Random forest built on top of the project's ``DecisionTree`` class.

    The training set is split into ``n_trees`` bootstrap folds (sampled with
    replacement), a random subset of feature columns is dropped from each
    fold, and one tree is trained per fold.  Predictions are made by majority
    vote across the trees.

    Parameters:
        n_trees (int): number of trees to train.
        fold_size (int): 1-10; each fold's size in tenths of the dataset size.
    """

    def __init__(self, n_trees, fold_size):
        self.n_trees = n_trees
        self.fold_size = fold_size
        self.trees = list()

    def cross_validation_split(self, dataset, n_folds, p):
        """Split ``dataset`` into ``n_folds`` bootstrap folds (with replacement).

        Parameters:
            dataset: np array of the given dataset (features + label column).
            n_folds (int): number of folds; must equal the number of trees.
            p (int): each fold's size in tenths of the dataset size (1-10).

        Returns:
            list of np arrays: the ``n_folds`` sampled folds.
        """
        fold_size = int(len(dataset) * p / 10)
        dataset_split = []
        for _ in range(n_folds):
            # Sample row indices with replacement, so folds overlap.
            rows = [dataset[randrange(len(dataset))] for _ in range(fold_size)]
            dataset_split.append(np.array(rows))
        return dataset_split

    def randomize_features(self, splits):
        """Randomly drop half of the feature columns from each fold.

        The label is assumed to be the last column and is never a candidate
        for removal, so it survives in every fold.

        Parameters:
            splits (list of np arrays): the folds to decorrelate.

        Returns:
            list of np arrays: folds with ``(n_features - 1) // 2`` columns
            randomly removed (a different random subset per fold).
        """
        dataset_split = []
        n_columns = len(splits[0][0])
        # Drop half of the feature columns (label column excluded).
        n_to_drop = int((n_columns - 1) * 5 / 10)
        for split in splits:
            for _ in range(n_to_drop):
                # Recompute candidates each pass: the array shrinks by one
                # column per deletion, shifting the remaining indices left.
                candidates = list(range(len(split[0]) - 1))
                selected = candidates.pop(randint(0, len(candidates) - 1))
                split = np.delete(split, selected, 1)
            dataset_split.append(np.array(split))
        return dataset_split

    def print_trees(self):
        """Print every decision tree of the forest.

        BUG: the feature numbers shown are not the columns' original indices
        in the dataset, because randomize_features renumbers the surviving
        columns — so the printout cannot tell which original features each
        tree was trained on.
        """
        for i, t in enumerate(self.trees, start=1):
            print("Tree#", i)
            t.print_dt(t.final_tree)
            print("\n")

    def train(self, X):
        """Iteratively train one decision tree per randomized fold.

        Parameters:
            X (np.array): training data, features plus label column.
        """
        folds = self.cross_validation_split(X, self.n_trees, self.fold_size)
        folds = self.randomize_features(folds)
        for fold in folds:
            dt = DecisionTree(MAX_DEPTH, MIN_NODE)
            dt.train(fold)
            self.trees.append(dt)

    def predict(self, X):
        """Predict a class value for each instance of ``X`` by majority vote.

        Parameters:
            X (np.array): dataset with labels.

        Returns:
            tuple: (list of majority-vote predictions, list of per-tree
            prediction lists) — the second element lets callers inspect each
            individual tree's output.
        """
        predicts = [tree.predict(X) for tree in self.trees]
        final_predicts = []
        # For each instance, pick the most frequent class across the trees.
        for i in range(len(predicts[0])):
            votes = [tree_out[i] for tree_out in predicts]
            final_predicts.append(max(set(votes), key=votes.count))
        return final_predicts, predicts
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":

    # Load the example training data shipped with the project.
    train_data = np.loadtxt("example_data/data.txt", delimiter=",")
    train_y = np.loadtxt("example_data/targets.txt")

    mock_train = np.loadtxt("example_data/mock_data.csv", delimiter=",")
    mock_y = mock_train[:, -1]

    # Build the forest and fit it on the mock dataset.
    rf = RandomForest(N_TREES, FOLD_SIZE)
    rf.train(mock_train)

    # Evaluate on the training data itself: forest-level accuracy first,
    # then the accuracy of every individual tree.
    y_pred, y_pred_ind = rf.predict(mock_train)
    print(f"Accuracy of random forest: {sum(y_pred == mock_y) / mock_y.shape[0]}")
    print("\nAccuracy for each individual tree:")
    for c, tree_pred in enumerate(y_pred_ind, start=1):
        print("\nTree", c)
        print(f"Accuracy: {sum(tree_pred == mock_y) / mock_y.shape[0]}")
|
||||
Reference in New Issue
Block a user