Initial commit

This commit is contained in:
Aladdin Persson
2021-01-30 21:49:15 +01:00
commit 65b8c80495
432 changed files with 1290844 additions and 0 deletions

View File

@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
"""
We want to go through each word in all emails,
check if the word is an actual english word
by comparing with nltk.corpus words and if it is
then add it to our vocabulary.
"""
import pandas as pd
import nltk
from nltk.corpus import words
# Maps each accepted word to its integer index; filled in by build_vocabulary().
vocabulary = {}
# Email dataset; the first column of each row is split into words further down.
data = pd.read_csv("data/emails.csv")
# Fetch the NLTK "words" corpus so that words.words() below can be loaded.
nltk.download("words")
# Set form of the corpus word list, used for membership tests in
# build_vocabulary().
set_words = set(words.words())
def build_vocabulary(curr_email):
    """Add every new English word in ``curr_email`` to the global vocabulary.

    Each word is lower-cased before it is looked up, so the vocabulary is
    case-insensitive and only ever holds lowercase keys. A word is accepted
    when its lowercase form is in ``set_words`` and is not yet a key of
    ``vocabulary``; it is then assigned the next free index, which is the
    current size of the vocabulary.

    Args:
        curr_email: iterable of word strings (one tokenised email).

    Returns:
        None. The module-level ``vocabulary`` dict is updated in place.
    """
    for word in curr_email:
        token = word.lower()
        if token in set_words and token not in vocabulary:
            # Store the normalised token (not the raw word) so the key matches
            # the membership test above; keys are always new here, so
            # len(vocabulary) is the next unused, gap-free index.
            vocabulary[token] = len(vocabulary)
if __name__ == "__main__":
    # Walk over every row of the dataset and feed its words to the
    # vocabulary builder, reporting progress as we go.
    num_emails = data.shape[0]
    for i in range(num_emails):
        # Purely positional access to the first column; avoids the deprecated
        # integer fallback of Series.__getitem__ used by ``iloc[i, :][0]``.
        curr_email = data.iloc[i, 0].split()
        print(
            f"Current email is {i}/{num_emails} and the "
            f"length of vocab is curr {len(vocabulary)}"
        )
        build_vocabulary(curr_email)

    # Write dictionary to vocabulary.txt file; the context manager closes the
    # handle even if the write fails.
    with open("vocabulary.txt", "w") as file:
        file.write(str(vocabulary))

View File

@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
"""
Having created our vocabulary we now need to create
the dataset X,y which we will create by doing frequency
vector for each email. For example if our vocabulary
has the words
[aardvark, ..., buy, ..., money, ..., zulu]
We go through each email and count up how many times each
word was repeated, so for a specific example this might look
like:
[0, ..., 4, ... 2, .... 0]
And perhaps, since both "buy" and "money" appear, this email might be
spam.
"""
import pandas as pd
import numpy as np
import ast
data = pd.read_csv("data/emails.csv")

# vocabulary.txt holds the str() of a {word: column_index} dict, so it is
# parsed back with ast.literal_eval (literals only, no code execution).
with open("vocabulary.txt", "r") as file:
    contents = file.read()
vocabulary = ast.literal_eval(contents)

# One row per email, one column per vocabulary word (word counts);
# y receives the value found in the second column of each row.
X = np.zeros((data.shape[0], len(vocabulary)))
y = np.zeros((data.shape[0]))

for i in range(data.shape[0]):
    # Positional access to the first column (the text that gets tokenised).
    email = data.iloc[i, 0].split()

    for email_word in email:
        # Look up with the same lower-cased token that is tested for
        # membership; indexing with the raw word raises KeyError (or hits the
        # wrong column) for capitalised words.
        token = email_word.lower()
        if token in vocabulary:
            X[i, vocabulary[token]] += 1

    y[i] = data.iloc[i, 1]

# Save stored numpy arrays
np.save("data/X.npy", X)
np.save("data/y.npy", y)

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,68 @@
"""
Naive Bayes Classifier Implementation from scratch
To run the code structure the code in the following way:
X be size: (num_training_examples, num_features)
y be size: (num_training_examples, )
Where the classes are 0, 1, 2, etc. Then an example run looks like:
NB = NaiveBayes(X, y)
NB.fit(X)
predictions = NB.predict(X)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-21 Initial coding
"""
import numpy as np
class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    Every feature is modelled, per class, as an independent Gaussian whose
    mean and variance are estimated from the training rows of that class.
    Class labels are assumed to be the integers 0, 1, ..., num_classes - 1.
    """

    def __init__(self, X, y):
        """Record the problem dimensions and the training labels.

        Args:
            X: array of shape (num_examples, num_features).
            y: array of shape (num_examples,) holding the class labels.
        """
        self.num_examples, self.num_features = X.shape
        self.num_classes = len(np.unique(y))
        # Added to every variance to avoid log(0) and division by zero.
        self.eps = 1e-6
        # Keep the labels so fit() does not depend on a module-level ``y``.
        self.y = np.asarray(y)

    def fit(self, X, y=None):
        """Estimate per-class mean, variance and prior from the training data.

        Args:
            X: array of shape (num_examples, num_features).
            y: optional labels of shape (num_examples,). When omitted, the
                labels passed to the constructor are used.

        Raises:
            ValueError: if the number of labels differs from the rows of X.
        """
        labels = self.y if y is None else np.asarray(y)
        if labels.shape[0] != X.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} rows but y has {labels.shape[0]} labels"
            )

        self.classes_mean = {}
        self.classes_variance = {}
        self.classes_prior = {}

        for c in range(self.num_classes):
            X_c = X[labels == c]  # training rows belonging to class c
            key = str(c)
            self.classes_mean[key] = np.mean(X_c, axis=0)
            self.classes_variance[key] = np.var(X_c, axis=0)
            self.classes_prior[key] = X_c.shape[0] / X.shape[0]

    def predict(self, X):
        """Return the most likely class index for every row of X.

        Args:
            X: array of shape (n, num_features); n may differ from the number
                of training examples.

        Returns:
            Integer array of shape (n,) with the arg-max class per row.
        """
        # Size by the rows actually passed in, not by the training-set size.
        probs = np.zeros((X.shape[0], self.num_classes))

        for c in range(self.num_classes):
            key = str(c)
            log_likelihood = self.density_function(
                X, self.classes_mean[key], self.classes_variance[key]
            )
            # Log-posterior up to a constant: log-likelihood + log-prior.
            probs[:, c] = log_likelihood + np.log(self.classes_prior[key])

        return np.argmax(probs, 1)

    def density_function(self, x, mean, sigma):
        """Log-density of a diagonal Gaussian evaluated at every row of x.

        Args:
            x: array of shape (n, num_features).
            mean: per-feature mean, shape (num_features,).
            sigma: per-feature *variance* (despite the name), same shape.

        Returns:
            Array of shape (n,) with the log-density of each row.
        """
        variance = sigma + self.eps
        # Normalisation term: -d/2 * log(2*pi) - 1/2 * sum(log(variance)).
        const = -self.num_features / 2 * np.log(2 * np.pi) - 0.5 * np.sum(
            np.log(variance)
        )
        # Quadratic term: 1/2 * sum((x - mean)^2 / variance) per row.
        probs = 0.5 * np.sum(np.power(x - mean, 2) / variance, 1)
        return const - probs
if __name__ == "__main__":
    # Spam-email demo. The .npy files must already exist (run build_vocab
    # etc. beforehand to create them).
    X, y = np.load("data/X.npy"), np.load("data/y.npy")

    classifier = NaiveBayes(X, y)
    classifier.fit(X)
    y_pred = classifier.predict(X)

    # Fraction of rows whose predicted class equals the stored label.
    num_correct = sum(y_pred == y)
    accuracy = num_correct / X.shape[0]
    print(f"Accuracy: {accuracy}")