Initial commit

2026-04-10 12:33:44 +00:00 · 2021-01-30 21:49:15 +01:00
commit 65b8c80495
432 changed files with 1290844 additions and 0 deletions
--- a/ML/Pytorch/more_advanced/torchtext/mydata/test.csv
+++ b/ML/Pytorch/more_advanced/torchtext/mydata/test.csv
@@ -0,0 +1,4 @@
+name,quote,score
+Jocko,You must own everything in your world. There is no one else to blame.,1
+Bruce Lee,"Do not pray for an easy life, pray for the strength to endure a difficult one.",1
+Potato guy,"Stand tall, and rice like a potato!",0
--- a/ML/Pytorch/more_advanced/torchtext/mydata/test.json
+++ b/ML/Pytorch/more_advanced/torchtext/mydata/test.json
@@ -0,0 +1,3 @@
+{"name": "Jocko", "quote": "You must own everything in your world. There is no one else to blame.", "score":1}
+{"name": "Bruce", "quote": "Do not pray for an easy life, pray for the strength to endure a difficult one.", "score":1}
+{"name": "Random Potato", "quote": "Stand tall, and rice like a potato!", "score":0}
--- a/ML/Pytorch/more_advanced/torchtext/mydata/test.tsv
+++ b/ML/Pytorch/more_advanced/torchtext/mydata/test.tsv
@@ -0,0 +1,4 @@
+name	quote	score
+Jocko	You must own everything in your world. There is no one else to blame.	1
+Bruce Lee	Do not pray for an easy life, pray for the strength to endure a difficult one.	1
+Potato guy	Stand tall, and rice like a potato!	0
--- a/ML/Pytorch/more_advanced/torchtext/mydata/train.csv
+++ b/ML/Pytorch/more_advanced/torchtext/mydata/train.csv
@@ -0,0 +1,4 @@
+name,quote,score
+Jocko,You must own everything in your world. There is no one else to blame.,1
+Bruce Lee,"Do not pray for an easy life, pray for the strength to endure a difficult one.",1
+Potato guy,"Stand tall, and rice like a potato!",0
--- a/ML/Pytorch/more_advanced/torchtext/mydata/train.json
+++ b/ML/Pytorch/more_advanced/torchtext/mydata/train.json
@@ -0,0 +1,3 @@
+{"name": "Jocko", "quote": "You must own everything in your world. There is no one else to blame.", "score":1}
+{"name": "Bruce", "quote": "Do not pray for an easy life, pray for the strength to endure a difficult one.", "score":1}
+{"name": "Random Potato", "quote": "Stand tall, and rice like a potato!", "score":0}
--- a/ML/Pytorch/more_advanced/torchtext/mydata/train.tsv
+++ b/ML/Pytorch/more_advanced/torchtext/mydata/train.tsv
@@ -0,0 +1,4 @@
+name	quote	score
+Jocko	You must own everything in your world. There is no one else to blame.	1
+Bruce Lee	Do not pray for an easy life, pray for the strength to endure a difficult one.	1
+Potato guy	Stand tall, and rice like a potato!	0
--- a/ML/Pytorch/more_advanced/torchtext/torchtext_tutorial1.py
+++ b/ML/Pytorch/more_advanced/torchtext/torchtext_tutorial1.py
@@ -0,0 +1,111 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import spacy
+from torchtext.data import Field, TabularDataset, BucketIterator
+
+######### Loading from JSON/CSV/TSV files #########
+
+# STEPS:
+# 1. Specify how preprocessing should be done -> Fields
+# 2. Use Dataset to load the data -> TabularDataset (JSON/CSV/TSV Files)
+# 3. Construct an iterator to do batching & padding -> BucketIterator
+
+# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Install the model with: python -m spacy download en_core_web_sm
+# Loading by full package name works on spaCy 2.x and 3.x; the bare "en"
+# shortcut link is no longer available in spaCy 3.
+spacy_en = spacy.load("en_core_web_sm")
+
+
+def tokenize(text):
+    """Return the token strings that ``spacy_en``'s tokenizer yields for ``text``."""
+    doc = spacy_en.tokenizer(text)
+    return [token.text for token in doc]
+
+
+# Text column: tokenized with `tokenize`, lower-cased, and mapped through a vocabulary.
+quote = Field(
+    sequential=True,
+    use_vocab=True,
+    tokenize=tokenize,
+    lower=True,
+)
+# Label column: a single non-sequential value that bypasses the vocabulary.
+score = Field(
+    sequential=False,
+    use_vocab=False,
+)
+
+# Column name in the data file -> (attribute name on each batch, Field to apply).
+fields = {
+    "quote": ("q", quote),
+    "score": ("s", score),
+}
+
+train_data, test_data = TabularDataset.splits(
+    path="mydata",
+    train="train.json",
+    test="test.json",
+    format="json",
+    fields=fields,
+)
+
+# Alternative: load the CSV copies of the same data instead.
+# # train_data, test_data = TabularDataset.splits(
+# #                                         path='mydata',
+# #                                         train='train.csv',
+# #                                         test='test.csv',
+# #                                         format='csv',
+# #                                         fields=fields)
+
+# Alternative: load the TSV copies of the same data instead.
+# # train_data, test_data = TabularDataset.splits(
+# #                                         path='mydata',
+# #                                         train='train.tsv',
+# #                                         test='test.tsv',
+# #                                         format='tsv',
+# #                                         fields=fields)
+
+# Build the vocabulary from the training split only and attach GloVe vectors to it.
+quote.build_vocab(
+    train_data,
+    max_size=10000,
+    min_freq=1,
+    vectors="glove.6B.100d",
+)
+
+# Batch iterators over both splits, yielding tensors on `device`.
+train_iterator, test_iterator = BucketIterator.splits(
+    (train_data, test_data),
+    batch_size=2,
+    device=device,
+)
+
+######### Training a simple LSTM on this toy data of ours #########
+class RNN_LSTM(nn.Module):
+    """Embedding -> stacked LSTM -> linear head that emits one logit per sequence.
+
+    Args:
+        input_size: Number of rows in the embedding table (the vocabulary size).
+        embed_size: Width of each embedding vector.
+        hidden_size: Number of features in the LSTM hidden state.
+        num_layers: Number of stacked LSTM layers.
+    """
+
+    def __init__(self, input_size, embed_size, hidden_size, num_layers):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+
+        self.embedding = nn.Embedding(input_size, embed_size)
+        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers)
+        self.fc_out = nn.Linear(hidden_size, 1)
+
+    def forward(self, x):
+        """Run a batch of token-index sequences through the network.
+
+        Args:
+            x: Integer tensor of token indices. The LSTM is created with the
+                default ``batch_first=False``, so dimension 0 is the time axis
+                and dimension 1 is the batch.
+
+        Returns:
+            Tensor of shape ``(batch, 1)`` computed from the LSTM output at the
+            last time step.
+        """
+        batch_size = x.size(1)
+
+        # Allocate the initial states on the input's own device so the module
+        # does not depend on a module-level `device` variable.
+        h0 = torch.zeros(
+            self.num_layers, batch_size, self.hidden_size, device=x.device
+        )
+        c0 = torch.zeros_like(h0)
+
+        embedded = self.embedding(x)
+        outputs, _ = self.rnn(embedded, (h0, c0))
+
+        # outputs[-1] is the top layer's output at the final time step.
+        prediction = self.fc_out(outputs[-1, :, :])
+
+        return prediction
+
+
+# Hyperparameters
+input_size = len(quote.vocab)  # one embedding row per vocabulary entry
+hidden_size = 512
+num_layers = 2
+embedding_size = 100  # same width as the glove.6B.100d vectors copied in below
+learning_rate = 0.005
+num_epochs = 10
+
+# Build the network, then move its parameters to the selected device
+model = RNN_LSTM(input_size, embedding_size, hidden_size, num_layers)
+model = model.to(device)
+
+# (NOT COVERED IN YOUTUBE VIDEO): Load the pretrained embeddings onto our model
+pretrained_embeddings = quote.vocab.vectors
+model.embedding.weight.data.copy_(pretrained_embeddings)
+
+# Loss (applied to raw logits) and optimizer
+criterion = nn.BCEWithLogitsLoss()
+optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+# Train Network
+for epoch in range(num_epochs):
+    for batch_idx, batch in enumerate(train_iterator):
+        # Token indices and labels, moved to the training device
+        data = batch.q.to(device=device)
+        targets = batch.s.to(device=device)
+
+        # Forward pass: drop the trailing size-1 dimension of the logits and
+        # cast the labels to the logits' dtype before computing the loss
+        scores = model(data)
+        logits = scores.squeeze(1)
+        labels = targets.type_as(scores)
+        loss = criterion(logits, labels)
+
+        # Clear stale gradients, backpropagate, then take one optimizer step
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
--- a/ML/Pytorch/more_advanced/torchtext/torchtext_tutorial2.py
+++ b/ML/Pytorch/more_advanced/torchtext/torchtext_tutorial2.py
@@ -0,0 +1,45 @@
+import spacy
+from torchtext.datasets import Multi30k
+from torchtext.data import Field, BucketIterator
+
+"""
+To install spacy languages use:
+python -m spacy download en
+python -m spacy download de
+"""
+
+# Load by full package name: this works on spaCy 2.x and 3.x, whereas the bare
+# "en"/"de" shortcut links are no longer available in spaCy 3.
+spacy_eng = spacy.load("en_core_web_sm")
+spacy_ger = spacy.load("de_core_news_sm")
+
+
+def tokenize_eng(text):
+    """Return the token strings that ``spacy_eng``'s tokenizer yields for ``text``."""
+    doc = spacy_eng.tokenizer(text)
+    return [token.text for token in doc]
+
+
+def tokenize_ger(text):
+    """Return the token strings that ``spacy_ger``'s tokenizer yields for ``text``."""
+    doc = spacy_ger.tokenizer(text)
+    return [token.text for token in doc]
+
+
+# One Field per language: spaCy tokenization, lower-casing, vocabulary lookup.
+english = Field(sequential=True, use_vocab=True, tokenize=tokenize_eng, lower=True)
+german = Field(sequential=True, use_vocab=True, tokenize=tokenize_ger, lower=True)
+
+# `exts` and `fields` are given in the same order: ".de" goes with the german
+# field and ".en" with the english field.
+train_data, validation_data, test_data = Multi30k.splits(
+    exts=(".de", ".en"), fields=(german, english)
+)
+
+# Vocabularies are built from the training split only.
+english.build_vocab(train_data, max_size=10000, min_freq=2)
+german.build_vocab(train_data, max_size=10000, min_freq=2)
+
+# Imported here because the rest of this script does not use torch directly.
+import torch
+
+# Use the GPU when available instead of hard-coding "cuda", so the script also
+# runs on CPU-only machines.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
+    (train_data, validation_data, test_data), batch_size=64, device=device
+)
+
+for batch in train_iterator:
+    print(batch)
+
+# string to integer (stoi)
+print(f'Index of the word (the) is: {english.vocab.stoi["the"]}')
+
+# print integer to string (itos)
+print(f"Word of the index (1612) is: {english.vocab.itos[1612]}")
+print(f"Word of the index (0) is: {english.vocab.itos[0]}")
--- a/ML/Pytorch/more_advanced/torchtext/torchtext_tutorial3.py
+++ b/ML/Pytorch/more_advanced/torchtext/torchtext_tutorial3.py
@@ -0,0 +1,64 @@
+import spacy
+import pandas as pd
+from torchtext.data import Field, BucketIterator, TabularDataset
+from sklearn.model_selection import train_test_split
+
+### Load data from two text files where each row is a sentence ###
+# Context managers close each file as soon as its contents have been read.
+with open("train_WMT_english.txt", encoding="utf8") as english_file:
+    english_txt = english_file.read().split("\n")
+with open("train_WMT_german.txt", encoding="utf8") as german_file:
+    german_txt = german_file.read().split("\n")
+
+# NOTE(review): the slice starts at index 1, so line 0 of each file is skipped
+# and at most 99 sentence pairs are kept -- confirm skipping line 0 is intended.
+raw_data = {
+    "English": english_txt[1:100],
+    "German": german_txt[1:100],
+}
+
+df = pd.DataFrame(raw_data, columns=["English", "German"])
+
+# create train and test set (10% of the rows are held out for testing)
+train, test = train_test_split(df, test_size=0.1)
+
+# Get train, test data to json and csv format which can be read by torchtext
+# (orient="records" with lines=True writes one JSON object per line)
+train.to_json("train.json", orient="records", lines=True)
+test.to_json("test.json", orient="records", lines=True)
+
+train.to_csv("train.csv", index=False)
+test.to_csv("test.csv", index=False)
+
+### Now we're back to where we were in previous Tutorials ###
+
+"""
+To install spacy languages use:
+python -m spacy download en
+python -m spacy download de
+"""
+
+# Load by full package name: this works on spaCy 2.x and 3.x, whereas the bare
+# "en"/"de" shortcut links are no longer available in spaCy 3.
+spacy_eng = spacy.load("en_core_web_sm")
+spacy_ger = spacy.load("de_core_news_sm")
+
+
+def tokenize_eng(text):
+    """Tokenize ``text`` with ``spacy_eng`` and return the tokens as plain strings."""
+    spacy_tokens = spacy_eng.tokenizer(text)
+    return [token.text for token in spacy_tokens]
+
+
+def tokenize_ger(text):
+    """Tokenize ``text`` with ``spacy_ger`` and return the tokens as plain strings."""
+    spacy_tokens = spacy_ger.tokenizer(text)
+    return [token.text for token in spacy_tokens]
+
+
+# One Field per language: spaCy tokenization, lower-casing, vocabulary lookup.
+english = Field(sequential=True, use_vocab=True, tokenize=tokenize_eng, lower=True)
+german = Field(sequential=True, use_vocab=True, tokenize=tokenize_ger, lower=True)
+
+# JSON key -> (attribute name on each batch, Field to apply).
+fields = {"English": ("eng", english), "German": ("ger", german)}
+
+# Reads the train.json / test.json files written earlier in this script.
+train_data, test_data = TabularDataset.splits(
+    path="", train="train.json", test="test.json", format="json", fields=fields
+)
+
+# Vocabularies are built from the training split only.
+english.build_vocab(train_data, max_size=10000, min_freq=2)
+german.build_vocab(train_data, max_size=10000, min_freq=2)
+
+# Imported here because the rest of this script does not use torch directly.
+import torch
+
+# Use the GPU when available instead of hard-coding "cuda", so the script also
+# runs on CPU-only machines.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+train_iterator, test_iterator = BucketIterator.splits(
+    (train_data, test_data), batch_size=32, device=device
+)
+
+for batch in train_iterator:
+    print(batch)