mirror of
https://github.com/aladdinpersson/Machine-Learning-Collection.git
synced 2026-04-10 12:33:44 +00:00
Initial commit
This commit is contained in:
41
ML/Projects/spam_classifier_naive_bayes/build_vocabulary.py
Normal file
41
ML/Projects/spam_classifier_naive_bayes/build_vocabulary.py
Normal file
@@ -0,0 +1,41 @@
|
||||
# -*- coding: utf-8 -*-
"""
Go through each word in all emails, check if the word is an actual
English word by comparing with nltk.corpus words, and if it is,
add it to our vocabulary.
"""

import pandas as pd
import nltk
from nltk.corpus import words

# Maps word -> integer index; filled in by build_vocabulary().
vocabulary = {}
# NOTE(review): assumes the email text is in the first CSV column — confirm.
data = pd.read_csv("data/emails.csv")
# Fetch the English word-list corpus (no-op if already downloaded).
nltk.download("words")
# A set gives O(1) membership tests when filtering email words.
set_words = set(words.words())
|
||||
|
||||
|
||||
def build_vocabulary(curr_email, vocab=None, valid_words=None):
    """Add every valid English word of *curr_email* to the vocabulary.

    Words are compared AND stored in lowercase, so "Money" and "money"
    map to the same vocabulary entry.

    Parameters
    ----------
    curr_email : iterable of str
        Tokens (whitespace-split words) of one email.
    vocab : dict, optional
        word -> index mapping to extend in place; defaults to the
        module-level ``vocabulary``.
    valid_words : set of str, optional
        Allowed lowercase English words; defaults to the module-level
        ``set_words`` built from nltk.
    """
    if vocab is None:
        vocab = vocabulary
    if valid_words is None:
        valid_words = set_words
    idx = len(vocab)
    for word in curr_email:
        lowered = word.lower()
        # Bug fix: store the lowercase form that was checked — the
        # original stored the raw-cased `word`, so the lowercase key the
        # membership test looks for was never actually added.
        if lowered not in vocab and lowered in valid_words:
            vocab[lowered] = idx
            idx += 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    total = data.shape[0]
    # Walk every email, extending the shared vocabulary as we go.
    for i in range(total):
        tokens = data.iloc[i, :][0].split()
        print(f"Current email is {i}/{total} and the length of vocab is curr {len(vocabulary)}")
        build_vocabulary(tokens)

    # Persist the word -> index mapping to vocabulary.txt as a dict literal.
    with open("vocabulary.txt", "w") as out_file:
        out_file.write(str(vocabulary))
|
||||
@@ -0,0 +1,44 @@
|
||||
# -*- coding: utf-8 -*-
"""
Having created our vocabulary we now need to create the dataset X, y,
built as a word-frequency vector for each email. For example if our
vocabulary has the words

    [aardvark, ..., buy, ..., money, ..., zulu]

we go through each email and count up how many times each word was
repeated, so a specific example might look like:

    [0, ..., 4, ..., 2, ..., 0]

and perhaps, since both "buy" and "money" occur, this email might be
spam.
"""
import ast

import pandas as pd
import numpy as np

data = pd.read_csv("data/emails.csv")

# Recover the word -> index dict written by build_vocabulary.py.
# ast.literal_eval safely parses the dict literal (unlike eval).
# `with` also fixes the leaked file handle of the original.
with open("vocabulary.txt", "r") as vocab_file:
    vocabulary = ast.literal_eval(vocab_file.read())

# X[i, j] = how often vocabulary word j appears in email i;
# y[i] = label taken from the second CSV column.
X = np.zeros((data.shape[0], len(vocabulary)))
y = np.zeros((data.shape[0]))

for i in range(data.shape[0]):
    email = data.iloc[i, :][0].split()

    for email_word in email:
        word = email_word.lower()
        # Bug fix: index with the lowercase word that was checked — the
        # original checked `email_word.lower() in vocabulary` but then
        # indexed `vocabulary[email_word]`, raising KeyError whenever
        # the email's casing differed from the stored key.
        if word in vocabulary:
            X[i, vocabulary[word]] += 1

    y[i] = data.iloc[i, :][1]

# Save the arrays for naivebayes.py to consume.
np.save("data/X.npy", X)
np.save("data/y.npy", y)
|
||||
5729
ML/Projects/spam_classifier_naive_bayes/data/emails.csv
Normal file
5729
ML/Projects/spam_classifier_naive_bayes/data/emails.csv
Normal file
File diff suppressed because one or more lines are too long
68
ML/Projects/spam_classifier_naive_bayes/naivebayes.py
Normal file
68
ML/Projects/spam_classifier_naive_bayes/naivebayes.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""
|
||||
Naive Bayes Classifier Implementation from scratch
|
||||
|
||||
To run the code structure the code in the following way:
|
||||
X be size: (num_training_examples, num_features)
|
||||
y be size: (num_examples, )
|
||||
|
||||
Where the classes are 0, 1, 2, etc. Then an example run looks like:
|
||||
NB = NaiveBayes(X, y)
|
||||
NB.fit(X)
|
||||
predictions = NB.predict(X)
|
||||
|
||||
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
|
||||
* 2020-04-21 Initial coding
|
||||
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    Each feature is modeled per class as an independent Gaussian;
    prediction picks the class maximizing log-prior + log-likelihood.
    """

    def __init__(self, X, y):
        """Record dataset dimensions and training labels.

        Parameters
        ----------
        X : np.ndarray, shape (num_examples, num_features)
            Training features (used only for its shape here).
        y : np.ndarray, shape (num_examples,)
            Integer class labels 0, 1, 2, ...
        """
        self.num_examples, self.num_features = X.shape
        self.num_classes = len(np.unique(y))
        # Bug fix: store the labels so fit() no longer reads a
        # module-level global `y` (which broke importing this class).
        self.y = y
        # Added to variances to avoid division by zero / log(0).
        self.eps = 1e-6

    def fit(self, X, y=None):
        """Estimate per-class mean, variance, and prior from X.

        Parameters
        ----------
        X : np.ndarray, shape (num_examples, num_features)
        y : np.ndarray, optional
            Labels for X; defaults to the labels passed to __init__,
            preserving the original ``NB.fit(X)`` calling convention.
        """
        if y is None:
            y = self.y
        self.classes_mean = {}
        self.classes_variance = {}
        self.classes_prior = {}

        for c in range(self.num_classes):
            X_c = X[y == c]

            self.classes_mean[str(c)] = np.mean(X_c, axis=0)
            self.classes_variance[str(c)] = np.var(X_c, axis=0)
            # Prior = fraction of training examples belonging to class c.
            self.classes_prior[str(c)] = X_c.shape[0] / X.shape[0]

    def predict(self, X):
        """Return the most likely class index for each row of X."""
        # Robustness fix: size by X.shape[0], not the training-set size,
        # so predict() works on inputs with a different number of rows.
        probs = np.zeros((X.shape[0], self.num_classes))

        for c in range(self.num_classes):
            prior = self.classes_prior[str(c)]
            probs_c = self.density_function(
                X, self.classes_mean[str(c)], self.classes_variance[str(c)]
            )
            # Unnormalized log-posterior: log-likelihood + log-prior.
            probs[:, c] = probs_c + np.log(prior)

        return np.argmax(probs, 1)

    def density_function(self, x, mean, sigma):
        """Log (not raw) Gaussian density of each row of x."""
        # Log of the normalization constant of the diagonal Gaussian.
        const = -self.num_features / 2 * np.log(2 * np.pi) - 0.5 * np.sum(
            np.log(sigma + self.eps)
        )
        # 0.5 * squared Mahalanobis distance with diagonal covariance.
        probs = 0.5 * np.sum(np.power(x - mean, 2) / (sigma + self.eps), 1)
        return const - probs
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Spam-email demo: requires the .npy arrays produced by the earlier
    # build_vocabulary / dataset scripts.
    X = np.load("data/X.npy")
    y = np.load("data/y.npy")

    model = NaiveBayes(X, y)
    model.fit(X)
    y_pred = model.predict(X)

    correct = sum(y_pred == y)
    print(f"Accuracy: {correct/X.shape[0]}")
|
||||
Reference in New Issue
Block a user