Initial commit

2026-04-10 12:33:44 +00:00 · 2021-01-30 21:49:15 +01:00
commit 65b8c80495
432 changed files with 1290844 additions and 0 deletions
--- a/ML/Projects/spam_classifier_naive_bayes/build_vocabulary.py
+++ b/ML/Projects/spam_classifier_naive_bayes/build_vocabulary.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+"""
+We want to go through each word in all emails,
+check if the word is an actual English word
+by comparing it with the nltk.corpus words and, if it is,
+add it to our vocabulary.
+
+"""
+
+import pandas as pd
+import nltk
+from nltk.corpus import words
+
+# Word -> integer index mapping, filled in by build_vocabulary().
+vocabulary = {}
+data = pd.read_csv("data/emails.csv")
+# Fetch the NLTK "words" corpus that words.words() reads on the next line.
+nltk.download("words")
+# Kept as a set because build_vocabulary() tests membership for every word.
+set_words = set(words.words())
+
+
+def build_vocabulary(curr_email):
+    """Add every new English word of one email to the global vocabulary.
+
+    Words are compared and stored in lowercase, so each word gets exactly
+    one entry and one index whatever its capitalisation in the emails.
+
+    Args:
+        curr_email: iterable of word strings (one email split into words).
+    """
+    for word in curr_email:
+        key = word.lower()
+        # Store the same lowercase key that the membership test uses.
+        # Storing the raw word let e.g. "Buy" be re-added on every
+        # occurrence, which overwrote its index and left gaps/duplicates.
+        if key not in vocabulary and key in set_words:
+            # The next free index is always the current size of the mapping.
+            vocabulary[key] = len(vocabulary)
+
+
+if __name__ == "__main__":
+    num_emails = data.shape[0]
+    for i in range(num_emails):
+        # The first column is split into words below; index it by position
+        # directly, because integer keys on a label-indexed row Series
+        # (data.iloc[i, :][0]) are deprecated in pandas.
+        curr_email = data.iloc[i, 0].split()
+        # Adjacent literals instead of a backslash continuation, which
+        # embedded the next source line's indentation in the message.
+        print(
+            f"Current email is {i}/{num_emails} and the "
+            f"length of vocab is curr {len(vocabulary)}"
+        )
+
+        build_vocabulary(curr_email)
+
+# Write dictionary to vocabulary.txt file.
+# NOTE(review): this sits outside the __main__ guard, so it also runs (and
+# overwrites the file) when the module is merely imported -- confirm intent.
+# The context manager closes the file even if the write raises.
+with open("vocabulary.txt", "w") as file:
+    file.write(str(vocabulary))
--- a/ML/Projects/spam_classifier_naive_bayes/create_freq_vectors.py
+++ b/ML/Projects/spam_classifier_naive_bayes/create_freq_vectors.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+"""
+Having created our vocabulary, we now need to create
+the dataset X, y, which we build by computing a frequency
+vector for each email. For example, if our vocabulary
+has the words
+
+[aardvark, ..., buy, ... money, .... zulu]
+
+We go through each email and count up how many times each
+word was repeated, so for a specific example this might look
+like:
+    
+[0, ..., 4, ... 2, .... 0] 
+
+And perhaps, since both "buy" and "money" appear, this email might be
+spam
+
+"""
+import pandas as pd
+import numpy as np
+import ast
+
+data = pd.read_csv("data/emails.csv")
+# Read the saved vocabulary; the context manager closes the handle, which
+# the bare open() call never did.
+with open("vocabulary.txt", "r") as file:
+    contents = file.read()
+# literal_eval only parses Python literals, so the file cannot execute code.
+vocabulary = ast.literal_eval(contents)
+
+num_emails = data.shape[0]
+# One row per email, one column per vocabulary entry.
+X = np.zeros((num_emails, len(vocabulary)))
+y = np.zeros(num_emails)
+
+for i in range(num_emails):
+    # Column 0 is split into words, column 1 is stored as the target.
+    # Positional access goes through iloc directly, because integer keys on
+    # a label-indexed row Series (data.iloc[i, :][0]) are deprecated.
+    email = data.iloc[i, 0].split()
+
+    for email_word in email:
+        # Look up the same lowercase form that is tested for membership;
+        # indexing with the raw word raised KeyError for e.g. "Buy" when
+        # only "buy" is a key.
+        column = vocabulary.get(email_word.lower())
+        if column is not None:
+            X[i, column] += 1
+
+    y[i] = data.iloc[i, 1]
+
+# Save stored numpy arrays
+np.save("data/X.npy", X)
+np.save("data/y.npy", y)
--- a/ML/Projects/spam_classifier_naive_bayes/data/emails.csv
+++ b/ML/Projects/spam_classifier_naive_bayes/data/emails.csv
--- a/ML/Projects/spam_classifier_naive_bayes/naivebayes.py
+++ b/ML/Projects/spam_classifier_naive_bayes/naivebayes.py
@@ -0,0 +1,68 @@
+"""
+Naive Bayes Classifier Implementation from scratch
+
+To run the code, structure the data in the following way:
+    X be size: (num_training_examples, num_features)
+    y be size: (num_training_examples, )
+
+Where the classes are 0, 1, 2, etc. Then an example run looks like:
+    NB = NaiveBayes(X, y)
+    NB.fit(X)
+    predictions = NB.predict(X)
+
+Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
+*    2020-04-21 Initial coding
+
+"""
+import numpy as np
+
+
+class NaiveBayes:
+    """Gaussian Naive Bayes classifier for labels 0, 1, ..., num_classes - 1.
+
+    Each class is modelled with an independent Gaussian per feature; the
+    predicted class maximises log-likelihood plus log-prior.
+    """
+
+    def __init__(self, X, y):
+        """Record the problem dimensions and the training labels.
+
+        Args:
+            X: array of shape (num_examples, num_features).
+            y: array of shape (num_examples,) holding class labels.
+        """
+        self.num_examples, self.num_features = X.shape
+        self.num_classes = len(np.unique(y))
+        # Added to every variance so log() and the division stay finite for
+        # features that are constant within a class.
+        self.eps = 1e-6
+        # Keep the labels so fit(X) no longer depends on a global named `y`.
+        self.y = np.asarray(y)
+
+    def fit(self, X, y=None):
+        """Estimate the per-class feature mean, variance and prior.
+
+        Args:
+            X: array of shape (num_examples, num_features).
+            y: optional labels for X; defaults to the labels passed to the
+                constructor, so the existing ``fit(X)`` call keeps working.
+        """
+        labels = self.y if y is None else np.asarray(y)
+
+        self.classes_mean = {}
+        self.classes_variance = {}
+        self.classes_prior = {}
+
+        for c in range(self.num_classes):
+            # Boolean mask selects the rows that belong to class c.
+            X_c = X[labels == c]
+
+            self.classes_mean[str(c)] = np.mean(X_c, axis=0)
+            self.classes_variance[str(c)] = np.var(X_c, axis=0)
+            self.classes_prior[str(c)] = X_c.shape[0] / X.shape[0]
+
+    def predict(self, X):
+        """Return the most probable class index for every row of X.
+
+        Args:
+            X: array of shape (n, num_features); n may differ from the
+                number of training examples.
+
+        Returns:
+            Integer array of shape (n,) with the arg-max class per row.
+        """
+        # Sized by the rows being scored, not by the training-set size, so
+        # prediction works on inputs of any length.
+        probs = np.zeros((X.shape[0], self.num_classes))
+
+        for c in range(self.num_classes):
+            prior = self.classes_prior[str(c)]
+            probs_c = self.density_function(
+                X, self.classes_mean[str(c)], self.classes_variance[str(c)]
+            )
+            # Work in log space: log-likelihood + log-prior.
+            probs[:, c] = probs_c + np.log(prior)
+
+        return np.argmax(probs, 1)
+
+    def density_function(self, x, mean, sigma):
+        """Return the Gaussian log-density of every row of x.
+
+        Args:
+            x: array of shape (n, num_features).
+            mean: per-feature mean, shape (num_features,).
+            sigma: per-feature variance (not standard deviation), shape
+                (num_features,).
+
+        Returns:
+            Array of shape (n,) with the summed per-feature log-densities.
+        """
+        # Normalisation term of a diagonal-covariance Gaussian, in logs.
+        const = -self.num_features / 2 * np.log(2 * np.pi) - 0.5 * np.sum(
+            np.log(sigma + self.eps)
+        )
+        # Variance-scaled squared distance to the mean, summed per row.
+        probs = 0.5 * np.sum(np.power(x - mean, 2) / (sigma + self.eps), 1)
+        return const - probs
+
+
+if __name__ == "__main__":
+    # Spam-email demo: expects data/X.npy and data/y.npy to exist already
+    # (run the vocabulary and frequency-vector scripts first).
+    X, y = np.load("data/X.npy"), np.load("data/y.npy")
+
+    model = NaiveBayes(X, y)
+    model.fit(X)
+    predictions = model.predict(X)
+
+    # Accuracy on the same data the model was fitted on.
+    num_correct = sum(predictions == y)
+    print(f"Accuracy: {num_correct/X.shape[0]}")