""" Simple two-layered Neural Network from scratch implementation. Programmed by Aladdin Persson * 2020-04-28 Initial coding """ import numpy as np from utils import create_dataset, plot_contour class NeuralNetwork: def __init__(self, X, y): # m for #training examples and n for #features self.m, self.n = X.shape # regularization term lambd (lambda is reserved keyword) self.lambd = 1e-3 self.learning_rate = 0.1 # Define size of first hidden-layer and second hidden layer (output layer) self.h1 = 25 self.h2 = len(np.unique(y)) def init_kaiming_weights(self, l0, l1): # Kaiming weights w = np.random.randn(l0, l1) * np.sqrt(2.0 / l0) b = np.zeros((1, l1)) return w, b def forward_prop(self, X, parameters): W2 = parameters["W2"] W1 = parameters["W1"] b2 = parameters["b2"] b1 = parameters["b1"] # forward prop a0 = X z1 = np.dot(a0, W1) + b1 # apply nonlinearity (relu) a1 = np.maximum(0, z1) z2 = np.dot(a1, W2) + b2 # softmax on the last layer scores = z2 exp_scores = np.exp(scores) probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # cache values from forward pass to use for backward pass cache = {"a0": X, "probs": probs, "a1": a1} return cache, probs def compute_cost(self, y, probs, parameters): W2 = parameters["W2"] W1 = parameters["W1"] y = y.astype(int) data_loss = np.sum(-np.log(probs[np.arange(self.m), y]) / self.m) reg_loss = 0.5 * self.lambd * np.sum(W1 * W1) + 0.5 * self.lambd * np.sum( W2 * W2 ) # total cost J total_cost = data_loss + reg_loss return total_cost def back_prop(self, cache, parameters, y): # Unpack from parameters W2 = parameters["W2"] W1 = parameters["W1"] b2 = parameters["b2"] b1 = parameters["b1"] # Unpack from forward prop a0 = cache["a0"] a1 = cache["a1"] probs = cache["probs"] dz2 = probs dz2[np.arange(self.m), y] -= 1 dz2 /= self.m # backprop through values dW2 and db2 dW2 = np.dot(a1.T, dz2) + self.lambd * W2 db2 = np.sum(dz2, axis=0, keepdims=True) # Back to the (only) hidden layer in this case dz1 = np.dot(dz2, W2.T) dz1 = dz1 * (a1 > 0) # backprop through values dW1, db1 dW1 = np.dot(a0.T, dz1) + self.lambd * W1 db1 = np.sum(dz1, axis=0, keepdims=True) grads = {"dW1": dW1, "dW2": dW2, "db1": db1, "db2": db2} return grads def update_parameters(self, parameters, grads): learning_rate = self.learning_rate W2 = parameters["W2"] W1 = parameters["W1"] b2 = parameters["b2"] b1 = parameters["b1"] dW2 = grads["dW2"] dW1 = grads["dW1"] db2 = grads["db2"] db1 = grads["db1"] # Do gradient descent step W2 -= learning_rate * dW2 W1 -= learning_rate * dW1 b2 -= learning_rate * db2 b1 -= learning_rate * db1 # store back weights in parameters parameters = {"W1": W1, "W2": W2, "b1": b1, "b2": b2} return parameters def main(self, X, y, num_iter=10000): # initialize our weights W1, b1 = self.init_kaiming_weights(self.n, self.h1) W2, b2 = self.init_kaiming_weights(self.h1, self.h2) # pack parameters into a dictionary parameters = {"W1": W1, "W2": W2, "b1": b1, "b2": b2} # How many gradient descent updates we want to do for it in range(num_iter + 1): # forward prop cache, probs = self.forward_prop(X, parameters) # calculate cost cost = self.compute_cost(y, probs, parameters) # print cost sometimes if it % 2500 == 0: print(f"At iteration {it} we have a cost of {cost}") # back prop grads = self.back_prop(cache, parameters, y) # update parameters parameters = self.update_parameters(parameters, grads) return parameters if __name__ == "__main__": # Generate dataset X, y = create_dataset(300, K=3) y = y.astype(int) # Train network NN = NeuralNetwork(X, y) trained_parameters = 
NN.main(X, y) # Get trained parameters W2 = trained_parameters["W2"] W1 = trained_parameters["W1"] b2 = trained_parameters["b2"] b1 = trained_parameters["b1"] # Plot the decision boundary (for nice visualization) plot_contour(X, y, NN, trained_parameters)
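
    # Optional sanity check (a minimal sketch, not part of the original script):
    # reuse forward_prop with the trained parameters and report the fraction of
    # training examples whose argmax prediction matches the label.
    _, probs = NN.forward_prop(X, trained_parameters)
    train_accuracy = np.mean(np.argmax(probs, axis=1) == y)
    print(f"Training accuracy: {train_accuracy:.3f}")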