"""
|
||
|
|
Author: Philip Andreadis
|
||
|
|
e-mail: philip_andreadis@hotmail.com
|
||
|
|
|
||
|
|
|
||
|
|
Implementation of Random Forest model from scratch.
|
||
|
|
The DecisionTree class from this project is used for generating the trees of the random forest.
|
||
|
|
This class remains with no changes as the dataset is split into a number of folds with a random subset of features on which each tree is trained on.
|
||
|
|
As a result each tree is trained on a different group of the dataset in order to avoid correlation between them.
|
||
|
|
The predicted class value of each instance is chosen by voting from each single tree's outcome.
|
||
|
|
|
||
|
|
Parameters of the model:
|
||
|
|
MAX_DEPTH (int): Maximum depth of the decision tree
|
||
|
|
MIN_NODE (int): Minimum number of instances a node can have. If this threshold is exceeded the node is terminated
|
||
|
|
FOLD_SIZE (int): Value between 1-10 representing the percentage of the original dataset size each fold should be.
|
||
|
|
N_TREES (int):The toral number of trees that will be trained.
|
||
|
|
|
||
|
|
Input dataset to train() function must be a numpy array containing both feature and label values.
|
||
|
|
|
||
|
|
"""

from random import randrange
from random import randint

import numpy as np

from decision_tree import DecisionTree

# fold size (% of dataset size), e.g. 3 means 30%
FOLD_SIZE = 10
# number of trees
N_TREES = 20
# max tree depth
MAX_DEPTH = 30
# min size of a tree node
MIN_NODE = 1


class RandomForest:
    def __init__(self, n_trees, fold_size):
        self.n_trees = n_trees
        self.fold_size = fold_size
        self.trees = list()

    """
    This function splits the given dataset into n folds, sampling rows with replacement (bootstrap sampling).
    The number of folds is equal to the number of trees that will be trained.
    Each tree will have one fold as input. The size of each fold is a percentage (p) of the size of the original dataset.

    Parameters:
    dataset: np array of the given dataset
    n_folds (int): number of folds into which the dataset is split; must equal the number of trees to be trained
    p (int): the percentage of the dataset's size that a single fold's size should be (value 1-10, e.g. 3 means 30%)

    Returns list of np arrays: list with the k folds
    """

    def cross_validation_split(self, dataset, n_folds, p):
        dataset_split = list()
        fold_size = int(len(dataset) * p / 10)
        for _ in range(n_folds):
            fold = list()
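            # sample rows with replacement until the fold reaches its target size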
            while len(fold) < fold_size:
                index = randrange(len(dataset))
                fold.append(dataset[index])
            dataset_split.append(np.array(fold))
        return dataset_split

    """
    This function randomizes the selection of the features each tree will be trained on.

    Parameters:
    splits (list of np arrays): list of folds

    Returns list of np arrays: list with the k folds, each with about half of the feature columns randomly removed
    """

    def randomize_features(self, splits):
        dataset_split = list()
        n_columns = len(splits[0][0])
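        # drop half of the feature columns; the last column holds the labels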
        n_features = int((n_columns - 1) * 5 / 10)
        for split in splits:
            for _ in range(n_features):
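                # choose a random remaining feature column (never the label column) and delete it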
                rng = list(range(len(split[0]) - 1))
                selected = rng.pop(randint(0, len(rng) - 1))
                split = np.delete(split, selected, 1)
            dataset_split.append(np.array(split))
        return dataset_split

    """
    Prints out all the decision trees of the random forest.

    BUG: Because of the feature randomization, the feature numbers printed here do not match
    their original enumeration in the dataset, so we cannot tell which features each tree was trained on.
    """

    def print_trees(self):
        for i, t in enumerate(self.trees, start=1):
            print("Tree#", i)
            t.print_dt(t.final_tree)
            print("\n")

    """
    Iteratively train each decision tree.

    Parameters:
    X (np.array): Training data
    """

    def train(self, X):
        train_x = self.cross_validation_split(X, self.n_trees, self.fold_size)
        train_x = self.randomize_features(train_x)
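        # fit one decision tree on each randomized bootstrap fold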
        for fold in train_x:
            dt = DecisionTree(MAX_DEPTH, MIN_NODE)
            dt.train(fold)
            self.trees.append(dt)

    """
    This function outputs the class value for each instance of the given dataset as predicted by the random forest algorithm.

    Parameters:
    X (np.array): Dataset with labels

    Returns:
    y (np.array): the predicted class values of the dataset
    predicts (list): each individual tree's predictions
    """

    def predict(self, X):
        predicts = list()
        final_predicts = list()
        for tree in self.trees:
            predicts.append(tree.predict(X))
        # iterate through each tree's class prediction and find the most frequent for each instance
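        # e.g. votes [1.0, 0.0, 1.0] from three trees yield the class value 1.0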
        for i in range(len(predicts[0])):
            values = list()
            for j in range(len(predicts)):
                values.append(predicts[j][i])
            final_predicts.append(max(set(values), key=values.count))
        return np.array(final_predicts), predicts


if __name__ == "__main__":
    # Training data
    train_data = np.loadtxt("example_data/data.txt", delimiter=",")
    train_y = np.loadtxt("example_data/targets.txt")

    mock_train = np.loadtxt("example_data/mock_data.csv", delimiter=",")
    mock_y = mock_train[:, -1]

    # Build and train model
    rf = RandomForest(N_TREES, FOLD_SIZE)
    rf.train(mock_train)

    # Evaluate model on training data
    y_pred, y_pred_ind = rf.predict(mock_train)
    print(f"Accuracy of random forest: {sum(y_pred == mock_y) / mock_y.shape[0]}")
    print("\nAccuracy for each individual tree:")
    for c, tree_pred in enumerate(y_pred_ind, start=1):
        print("\nTree", c)
        print(f"Accuracy: {sum(tree_pred == mock_y) / mock_y.shape[0]}")