Initial commit

2026-04-10 12:33:44 +00:00 · 2021-01-30 21:49:15 +01:00
commit 65b8c80495
432 changed files with 1290844 additions and 0 deletions
--- a/ML/Pytorch/more_advanced/GANs/DCGAN_mnist.py
+++ b/ML/Pytorch/more_advanced/GANs/DCGAN_mnist.py
@@ -0,0 +1,131 @@
+"""
+Example code of how to code GANs and more specifically DCGAN,
+for more information about DCGANs read: https://arxiv.org/abs/1511.06434
+
+We then train the DCGAN on the MNIST dataset (toy dataset of handwritten digits)
+and then generate our own. You can apply this more generally on really any dataset
+but MNIST is simple enough to get the overall idea.
+
+Video explanation: https://youtu.be/5RYETbFFQ7s
+Got any questions leave a comment on youtube :)
+
+Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
+*    2020-04-20 Initial coding
+
+"""
+
+# Imports
+import torch
+import torchvision
+import torch.nn as nn  # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
+import torch.optim as optim  # For all Optimization algorithms, SGD, Adam, etc.
+import torchvision.datasets as datasets  # Has standard datasets we can import in a nice way
+import torchvision.transforms as transforms  # Transformations we can perform on our dataset
+from torch.utils.data import (
+    DataLoader,
+)  # Gives easier dataset management and creates mini batches
+from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
+from model_utils import (
+    Discriminator,
+    Generator,
+)  # Import our models we've defined (from DCGAN paper)
+
+# Hyperparameters
+lr = 0.0005
+batch_size = 64
+image_size = 64
+channels_img = 1
+channels_noise = 256
+num_epochs = 10
+
+# For how many channels Generator and Discriminator should use
+features_d = 16
+features_g = 16
+
+my_transforms = transforms.Compose(
+    [
+        transforms.Resize(image_size),
+        transforms.ToTensor(),
+        transforms.Normalize((0.5,), (0.5,)),
+    ]
+)
+
+dataset = datasets.MNIST(
+    root="dataset/", train=True, transform=my_transforms, download=True
+)
+dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Create discriminator and generator
+netD = Discriminator(channels_img, features_d).to(device)
+netG = Generator(channels_noise, channels_img, features_g).to(device)
+
+# Setup Optimizer for G and D
+optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(0.5, 0.999))
+optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(0.5, 0.999))
+
+netG.train()
+netD.train()
+
+criterion = nn.BCELoss()
+
+real_label = 1
+fake_label = 0
+
+fixed_noise = torch.randn(64, channels_noise, 1, 1).to(device)
+writer_real = SummaryWriter(f"runs/GAN_MNIST/test_real")
+writer_fake = SummaryWriter(f"runs/GAN_MNIST/test_fake")
+step = 0
+
+print("Starting Training...")
+
+for epoch in range(num_epochs):
+    for batch_idx, (data, targets) in enumerate(dataloader):
+        data = data.to(device)
+        batch_size = data.shape[0]
+
+        ### Train Discriminator: max log(D(x)) + log(1 - D(G(z)))
+        netD.zero_grad()
+        label = (torch.ones(batch_size) * 0.9).to(device)  # smoothed target for real images
+        output = netD(data).reshape(-1)
+        lossD_real = criterion(output, label)
+        D_x = output.mean().item()
+
+        noise = torch.randn(batch_size, channels_noise, 1, 1).to(device)
+        fake = netG(noise)
+        label = (torch.ones(batch_size) * 0.1).to(device)  # smoothed target for generated images
+
+        output = netD(fake.detach()).reshape(-1)  # detach: this loss must not backprop into netG
+        lossD_fake = criterion(output, label)
+
+        lossD = lossD_real + lossD_fake
+        lossD.backward()
+        optimizerD.step()
+
+        ### Train Generator: max log(D(G(z)))
+        netG.zero_grad()
+        label = torch.ones(batch_size).to(device)
+        output = netD(fake).reshape(-1)
+        lossG = criterion(output, label)
+        lossG.backward()
+        optimizerG.step()
+
+        # Print losses occasionally and log image grids to tensorboard
+        if batch_idx % 100 == 0:
+            step += 1
+            print(
+                f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(dataloader)} "
+                f"Loss D: {lossD:.4f}, loss G: {lossG:.4f} D(x): {D_x:.4f}"
+            )
+
+            with torch.no_grad():
+                fake = netG(fixed_noise)
+                img_grid_real = torchvision.utils.make_grid(data[:32], normalize=True)
+                img_grid_fake = torchvision.utils.make_grid(fake[:32], normalize=True)
+                writer_real.add_image(
+                    "Mnist Real Images", img_grid_real, global_step=step
+                )
+                writer_fake.add_image(
+                    "Mnist Fake Images", img_grid_fake, global_step=step
+                )
--- a/ML/Pytorch/more_advanced/GANs/README.md
+++ b/ML/Pytorch/more_advanced/GANs/README.md
@@ -0,0 +1,4 @@
+### Generative Adversarial Network
+
+- DCGAN_mnist.py: main file and training network
+- model_utils.py: Generator and discriminator implementation
--- a/ML/Pytorch/more_advanced/GANs/model_utils.py
+++ b/ML/Pytorch/more_advanced/GANs/model_utils.py
@@ -0,0 +1,76 @@
+"""
+Discriminator and Generator implementation from DCGAN paper
+that we import in the main (DCGAN_mnist.py) file.
+"""
+
+import torch
+import torch.nn as nn
+
+
+class Discriminator(nn.Module):
+    def __init__(self, channels_img, features_d):
+        super(Discriminator, self).__init__()
+        self.net = nn.Sequential(
+            # N x channels_img x 64 x 64
+            nn.Conv2d(channels_img, features_d, kernel_size=4, stride=2, padding=1),
+            nn.LeakyReLU(0.2),
+            # N x features_d x 32 x 32
+            nn.Conv2d(features_d, features_d * 2, kernel_size=4, stride=2, padding=1),
+            nn.BatchNorm2d(features_d * 2),
+            nn.LeakyReLU(0.2),  # N x features_d*2 x 16 x 16
+            nn.Conv2d(
+                features_d * 2, features_d * 4, kernel_size=4, stride=2, padding=1
+            ),
+            nn.BatchNorm2d(features_d * 4),
+            nn.LeakyReLU(0.2),  # N x features_d*4 x 8 x 8
+            nn.Conv2d(
+                features_d * 4, features_d * 8, kernel_size=4, stride=2, padding=1
+            ),
+            nn.BatchNorm2d(features_d * 8),
+            nn.LeakyReLU(0.2),
+            # N x features_d*8 x 4 x 4
+            nn.Conv2d(features_d * 8, 1, kernel_size=4, stride=2, padding=0),
+            # N x 1 x 1 x 1
+            nn.Sigmoid(),  # squash the single score per image into (0, 1)
+        )
+
+    def forward(self, x):
+        return self.net(x)  # N x 1 x 1 x 1 when x is N x channels_img x 64 x 64
+
+
+class Generator(nn.Module):
+    def __init__(self, channels_noise, channels_img, features_g):
+        super(Generator, self).__init__()
+
+        self.net = nn.Sequential(
+            # N x channels_noise x 1 x 1
+            nn.ConvTranspose2d(
+                channels_noise, features_g * 16, kernel_size=4, stride=1, padding=0
+            ),
+            nn.BatchNorm2d(features_g * 16),
+            nn.ReLU(),
+            # N x features_g*16 x 4 x 4
+            nn.ConvTranspose2d(
+                features_g * 16, features_g * 8, kernel_size=4, stride=2, padding=1
+            ),
+            nn.BatchNorm2d(features_g * 8),
+            nn.ReLU(),  # N x features_g*8 x 8 x 8
+            nn.ConvTranspose2d(
+                features_g * 8, features_g * 4, kernel_size=4, stride=2, padding=1
+            ),
+            nn.BatchNorm2d(features_g * 4),
+            nn.ReLU(),  # N x features_g*4 x 16 x 16
+            nn.ConvTranspose2d(
+                features_g * 4, features_g * 2, kernel_size=4, stride=2, padding=1
+            ),
+            nn.BatchNorm2d(features_g * 2),
+            nn.ReLU(),  # N x features_g*2 x 32 x 32
+            nn.ConvTranspose2d(
+                features_g * 2, channels_img, kernel_size=4, stride=2, padding=1
+            ),
+            # N x channels_img x 64 x 64
+            nn.Tanh(),  # pixel values end up in [-1, 1]
+        )
+
+    def forward(self, x):
+        return self.net(x)  # N x channels_img x 64 x 64 when x is N x channels_noise x 1 x 1
--- a/ML/Pytorch/more_advanced/Seq2Seq/seq2seq.py
+++ b/ML/Pytorch/more_advanced/Seq2Seq/seq2seq.py
@@ -0,0 +1,242 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torchtext.datasets import Multi30k
+from torchtext.data import Field, BucketIterator
+import numpy as np
+import spacy
+import random
+from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
+from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
+
+spacy_ger = spacy.load("de")
+spacy_eng = spacy.load("en")
+
+
+def tokenize_ger(text):
+    return [tok.text for tok in spacy_ger.tokenizer(text)]
+
+
+def tokenize_eng(text):
+    return [tok.text for tok in spacy_eng.tokenizer(text)]
+
+
+german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")
+
+english = Field(
+    tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>"
+)
+
+train_data, valid_data, test_data = Multi30k.splits(
+    exts=(".de", ".en"), fields=(german, english)
+)
+
+german.build_vocab(train_data, max_size=10000, min_freq=2)
+english.build_vocab(train_data, max_size=10000, min_freq=2)
+
+
+class Encoder(nn.Module):
+    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
+        super(Encoder, self).__init__()
+        self.dropout = nn.Dropout(p)
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+
+        self.embedding = nn.Embedding(input_size, embedding_size)
+        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
+
+    def forward(self, x):
+        # x shape: (seq_length, N) where N is batch size
+
+        embedding = self.dropout(self.embedding(x))
+        # embedding shape: (seq_length, N, embedding_size)
+
+        outputs, (hidden, cell) = self.rnn(embedding)
+        # outputs shape: (seq_length, N, hidden_size)
+
+        return hidden, cell
+
+
+class Decoder(nn.Module):
+    def __init__(
+        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
+    ):
+        super(Decoder, self).__init__()
+        self.dropout = nn.Dropout(p)
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+
+        self.embedding = nn.Embedding(input_size, embedding_size)
+        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
+        self.fc = nn.Linear(hidden_size, output_size)
+
+    def forward(self, x, hidden, cell):
+        # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length
+        # is 1 here because we are sending in a single word and not a sentence
+        x = x.unsqueeze(0)
+
+        embedding = self.dropout(self.embedding(x))
+        # embedding shape: (1, N, embedding_size)
+
+        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
+        # outputs shape: (1, N, hidden_size)
+
+        predictions = self.fc(outputs)
+
+        # predictions shape: (1, N, length_target_vocabulary) to send it to
+        # loss function we want it to be (N, length_target_vocabulary) so we're
+        # just gonna remove the first dim
+        predictions = predictions.squeeze(0)
+
+        return predictions, hidden, cell
+
+
+class Seq2Seq(nn.Module):
+    def __init__(self, encoder, decoder):
+        super(Seq2Seq, self).__init__()
+        self.encoder = encoder
+        self.decoder = decoder
+
+    def forward(self, source, target, teacher_force_ratio=0.5):
+        batch_size = source.shape[1]
+        target_len = target.shape[0]
+        target_vocab_size = len(english.vocab)
+
+        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
+
+        hidden, cell = self.encoder(source)
+
+        # Grab the first input to the Decoder which will be <SOS> token
+        x = target[0]
+
+        for t in range(1, target_len):
+            # Use previous hidden, cell as context from encoder at start
+            output, hidden, cell = self.decoder(x, hidden, cell)
+
+            # Store next output prediction
+            outputs[t] = output
+
+            # Get the best word the Decoder predicted (index in the vocabulary)
+            best_guess = output.argmax(1)
+
+            # With probability of teacher_force_ratio we take the actual next word
+            # otherwise we take the word that the Decoder predicted it to be.
+            # Teacher Forcing is used so that the model gets used to seeing
+            # similar inputs at training and testing time, if teacher forcing is 1
+            # then inputs at test time might be completely different than what the
+            # network is used to. This was a long comment.
+            x = target[t] if random.random() < teacher_force_ratio else best_guess
+
+        return outputs
+
+
+### We're ready to define everything we need for training our Seq2Seq model ###
+
+# Training hyperparameters
+num_epochs = 100
+learning_rate = 0.001
+batch_size = 64
+
+# Model hyperparameters
+load_model = False
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+input_size_encoder = len(german.vocab)
+input_size_decoder = len(english.vocab)
+output_size = len(english.vocab)
+encoder_embedding_size = 300
+decoder_embedding_size = 300
+hidden_size = 1024  # Needs to be the same for both RNN's
+num_layers = 2
+enc_dropout = 0.5
+dec_dropout = 0.5
+
+# Tensorboard to get nice loss plot
+writer = SummaryWriter(f"runs/loss_plot")
+step = 0
+
+train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
+    (train_data, valid_data, test_data),
+    batch_size=batch_size,
+    sort_within_batch=True,
+    sort_key=lambda x: len(x.src),
+    device=device,
+)
+
+encoder_net = Encoder(
+    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
+).to(device)
+
+decoder_net = Decoder(
+    input_size_decoder,
+    decoder_embedding_size,
+    hidden_size,
+    output_size,
+    num_layers,
+    dec_dropout,
+).to(device)
+
+model = Seq2Seq(encoder_net, decoder_net).to(device)
+optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+pad_idx = english.vocab.stoi["<pad>"]
+criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
+
+if load_model:
+    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
+
+
+sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."
+
+for epoch in range(num_epochs):
+    print(f"[Epoch {epoch} / {num_epochs}]")
+
+    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
+    save_checkpoint(checkpoint)
+
+    model.eval()
+
+    translated_sentence = translate_sentence(
+        model, sentence, german, english, device, max_length=50
+    )
+
+    print(f"Translated example sentence: \n {translated_sentence}")
+
+    model.train()
+
+    for batch_idx, batch in enumerate(train_iterator):
+        # Get input and targets and get to cuda
+        inp_data = batch.src.to(device)
+        target = batch.trg.to(device)
+
+        # Forward prop
+        output = model(inp_data, target)
+
+        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
+        # doesn't take input in that form. For example if we have MNIST we want to have
+        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
+        # way that we have output_words * batch_size that we want to send in into
+        # our cost function, so we need to do some reshaping. While we're at it,
+        # let's also remove the start token
+        output = output[1:].reshape(-1, output.shape[2])
+        target = target[1:].reshape(-1)
+
+        optimizer.zero_grad()
+        loss = criterion(output, target)
+
+        # Back prop
+        loss.backward()
+
+        # Clip to avoid exploding gradient issues, makes sure grads are
+        # within a healthy range
+        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
+
+        # Gradient descent step
+        optimizer.step()
+
+        # Plot to tensorboard
+        writer.add_scalar("Training loss", loss, global_step=step)
+        step += 1
+
+model.eval()  # the training loop leaves the model in train mode; score BLEU without dropout
+score = bleu(test_data[1:100], model, german, english, device)
+print(f"Bleu score {score*100:.2f}")
--- a/ML/Pytorch/more_advanced/Seq2Seq/utils.py
+++ b/ML/Pytorch/more_advanced/Seq2Seq/utils.py
@@ -0,0 +1,84 @@
+import torch
+import spacy
+from torchtext.data.metrics import bleu_score
+import sys
+
+
+def translate_sentence(model, sentence, german, english, device, max_length=50):
+    """Greedily translate a German sentence into a list of English tokens.
+
+    sentence is either a raw string (tokenized with spacy) or a list of tokens.
+    Decoding stops once <eos> is predicted or after max_length steps. The result
+    leaves out the leading <sos> but keeps <eos> when the model predicted it.
+    """
+    # Lower-case every token (which is what our vocab is). The german spacy model
+    # is only loaded when a raw string has to be tokenized.
+    if isinstance(sentence, str):
+        spacy_ger = spacy.load("de")
+        tokens = [token.text.lower() for token in spacy_ger(sentence)]
+    else:
+        tokens = [token.lower() for token in sentence]
+
+    # Add <SOS> and <EOS> in beginning and end respectively
+    tokens.insert(0, german.init_token)
+    tokens.append(german.eos_token)
+
+    # Go through each german token and convert to an index
+    text_to_indices = [german.vocab.stoi[token] for token in tokens]
+
+    # Convert to Tensor; unsqueeze adds a batch dimension of size one,
+    # giving shape (seq_length, 1)
+    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)
+
+    # Build encoder hidden, cell state
+    with torch.no_grad():
+        hidden, cell = model.encoder(sentence_tensor)
+
+    outputs = [english.vocab.stoi["<sos>"]]
+    eos_idx = english.vocab.stoi["<eos>"]
+
+    for _ in range(max_length):
+        previous_word = torch.LongTensor([outputs[-1]]).to(device)
+
+        with torch.no_grad():
+            output, hidden, cell = model.decoder(previous_word, hidden, cell)
+            best_guess = output.argmax(1).item()
+
+        outputs.append(best_guess)
+
+        # Model predicts it's the end of the sentence
+        if best_guess == eos_idx:
+            break
+
+    translated_sentence = [english.vocab.itos[idx] for idx in outputs]
+
+    # remove start token
+    return translated_sentence[1:]
+
+
+def bleu(data, model, german, english, device):
+    """Translate the src of every example in data and return the corpus BLEU score."""
+    targets = []
+    outputs = []
+    for example in data:
+        src = vars(example)["src"]
+        trg = vars(example)["trg"]
+
+        prediction = translate_sentence(model, src, german, english, device)
+        if prediction and prediction[-1] == english.eos_token:
+            prediction = prediction[:-1]  # strip <eos>; it is absent if max_length was hit
+        targets.append([trg])
+        outputs.append(prediction)
+
+    return bleu_score(outputs, targets)
+
+
+def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
+    print("=> Saving checkpoint")
+    torch.save(state, filename)
+
+
+def load_checkpoint(checkpoint, model, optimizer):
+    print("=> Loading checkpoint")
+    model.load_state_dict(checkpoint["state_dict"])
+    optimizer.load_state_dict(checkpoint["optimizer"])
--- a/ML/Pytorch/more_advanced/Seq2Seq_attention/seq2seq_attention.py
+++ b/ML/Pytorch/more_advanced/Seq2Seq_attention/seq2seq_attention.py
@@ -0,0 +1,279 @@
+import random
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import numpy as np
+import spacy
+from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
+from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
+from torchtext.datasets import Multi30k
+from torchtext.data import Field, BucketIterator
+
+"""
+To install spacy languages do:
+python -m spacy download en
+python -m spacy download de
+"""
+spacy_ger = spacy.load("de")
+spacy_eng = spacy.load("en")
+
+
+def tokenize_ger(text):
+    return [tok.text for tok in spacy_ger.tokenizer(text)]
+
+
+def tokenize_eng(text):
+    return [tok.text for tok in spacy_eng.tokenizer(text)]
+
+
+german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")
+
+english = Field(
+    tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>"
+)
+
+train_data, valid_data, test_data = Multi30k.splits(
+    exts=(".de", ".en"), fields=(german, english)
+)
+
+german.build_vocab(train_data, max_size=10000, min_freq=2)
+english.build_vocab(train_data, max_size=10000, min_freq=2)
+
+
+class Encoder(nn.Module):
+    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
+        super(Encoder, self).__init__()
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+
+        self.embedding = nn.Embedding(input_size, embedding_size)
+        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, bidirectional=True)
+
+        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
+        self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)
+        self.dropout = nn.Dropout(p)
+
+    def forward(self, x):
+        # x: (seq_length, N) where N is batch size
+
+        embedding = self.dropout(self.embedding(x))
+        # embedding shape: (seq_length, N, embedding_size)
+
+        encoder_states, (hidden, cell) = self.rnn(embedding)
+        # encoder_states shape: (seq_length, N, hidden_size*2) as the LSTM is bidirectional
+
+        # Send the forward and backward hidden/cell states through a linear layer
+        # so they can be input to the decoder, which is not bidirectional. Slicing
+        # ([idx:idx+1]) keeps the dimension. NOTE(review): assumes num_layers == 1
+        hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
+        cell = self.fc_cell(torch.cat((cell[0:1], cell[1:2]), dim=2))
+
+        return encoder_states, hidden, cell
+
+
+class Decoder(nn.Module):
+    def __init__(
+        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
+    ):
+        super(Decoder, self).__init__()
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+
+        self.embedding = nn.Embedding(input_size, embedding_size)
+        self.rnn = nn.LSTM(hidden_size * 2 + embedding_size, hidden_size, num_layers)
+
+        self.energy = nn.Linear(hidden_size * 3, 1)  # input: decoder hidden + encoder state
+        self.fc = nn.Linear(hidden_size, output_size)
+        self.dropout = nn.Dropout(p)
+        self.softmax = nn.Softmax(dim=0)  # dim 0 is the source sequence dimension
+        self.relu = nn.ReLU()
+
+    def forward(self, x, encoder_states, hidden, cell):
+        x = x.unsqueeze(0)
+        # x: (1, N) where N is the batch size
+
+        embedding = self.dropout(self.embedding(x))
+        # embedding shape: (1, N, embedding_size)
+
+        sequence_length = encoder_states.shape[0]
+        h_reshaped = hidden.repeat(sequence_length, 1, 1)
+        # h_reshaped: (seq_length, N, hidden_size), assuming hidden is (1, N, hidden_size)
+
+        energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))
+        # energy: (seq_length, N, 1)
+
+        attention = self.softmax(energy)
+        # attention: (seq_length, N, 1)
+
+        # attention: (seq_length, N, 1), snk
+        # encoder_states: (seq_length, N, hidden_size*2), snl
+        # we want context_vector: (1, N, hidden_size*2), i.e knl
+        context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)
+
+        rnn_input = torch.cat((context_vector, embedding), dim=2)
+        # rnn_input: (1, N, hidden_size*2 + embedding_size)
+
+        outputs, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
+        # outputs shape: (1, N, hidden_size)
+
+        predictions = self.fc(outputs).squeeze(0)
+        # predictions: (N, output_size)
+
+        return predictions, hidden, cell
+
+
+class Seq2Seq(nn.Module):
+    def __init__(self, encoder, decoder):
+        super(Seq2Seq, self).__init__()
+        self.encoder = encoder
+        self.decoder = decoder
+
+    def forward(self, source, target, teacher_force_ratio=0.5):
+        batch_size = source.shape[1]
+        target_len = target.shape[0]
+        target_vocab_size = len(english.vocab)
+
+        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
+        encoder_states, hidden, cell = self.encoder(source)
+
+        # First input will be <SOS> token
+        x = target[0]
+
+        for t in range(1, target_len):
+            # At every time step use encoder_states and update hidden, cell
+            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
+
+            # Store prediction for current time step
+            outputs[t] = output
+
+            # Get the best word the Decoder predicted (index in the vocabulary)
+            best_guess = output.argmax(1)
+
+            # With probability of teacher_force_ratio we take the actual next word
+            # otherwise we take the word that the Decoder predicted it to be.
+            # Teacher Forcing is used so that the model gets used to seeing
+            # similar inputs at training and testing time, if teacher forcing is 1
+            # then inputs at test time might be completely different than what the
+            # network is used to. This was a long comment.
+            x = target[t] if random.random() < teacher_force_ratio else best_guess
+
+        return outputs
+
+
+### We're ready to define everything we need for training our Seq2Seq model ###
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+load_model = False
+save_model = True
+
+# Training hyperparameters
+num_epochs = 100
+learning_rate = 3e-4
+batch_size = 32
+
+# Model hyperparameters
+input_size_encoder = len(german.vocab)
+input_size_decoder = len(english.vocab)
+output_size = len(english.vocab)
+encoder_embedding_size = 300
+decoder_embedding_size = 300
+hidden_size = 1024
+num_layers = 1
+enc_dropout = 0.0
+dec_dropout = 0.0
+
+# Tensorboard to get nice loss plot
+writer = SummaryWriter(f"runs/loss_plot")
+step = 0
+
+train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
+    (train_data, valid_data, test_data),
+    batch_size=batch_size,
+    sort_within_batch=True,
+    sort_key=lambda x: len(x.src),
+    device=device,
+)
+
+encoder_net = Encoder(
+    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
+).to(device)
+
+decoder_net = Decoder(
+    input_size_decoder,
+    decoder_embedding_size,
+    hidden_size,
+    output_size,
+    num_layers,
+    dec_dropout,
+).to(device)
+
+model = Seq2Seq(encoder_net, decoder_net).to(device)
+optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+pad_idx = english.vocab.stoi["<pad>"]
+criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
+
+if load_model:
+    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
+
+sentence = (
+    "ein boot mit mehreren männern darauf wird von einem großen "
+    "pferdegespann ans ufer gezogen."
+)
+
+for epoch in range(num_epochs):
+    print(f"[Epoch {epoch} / {num_epochs}]")
+
+    if save_model:
+        checkpoint = {
+            "state_dict": model.state_dict(),
+            "optimizer": optimizer.state_dict(),
+        }
+        save_checkpoint(checkpoint)
+
+    model.eval()
+
+    translated_sentence = translate_sentence(
+        model, sentence, german, english, device, max_length=50
+    )
+
+    print(f"Translated example sentence: \n {translated_sentence}")
+
+    model.train()
+
+    for batch_idx, batch in enumerate(train_iterator):
+        # Get input and targets and get to cuda
+        inp_data = batch.src.to(device)
+        target = batch.trg.to(device)
+
+        # Forward prop
+        output = model(inp_data, target)
+
+        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
+        # doesn't take input in that form. For example if we have MNIST we want to have
+        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
+        # way that we have output_words * batch_size that we want to send in into
+        # our cost function, so we need to do some reshaping. While we're at it,
+        # let's also remove the start token
+        output = output[1:].reshape(-1, output.shape[2])
+        target = target[1:].reshape(-1)
+
+        optimizer.zero_grad()
+        loss = criterion(output, target)
+
+        # Back prop
+        loss.backward()
+
+        # Clip to avoid exploding gradient issues, makes sure grads are
+        # within a healthy range
+        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
+
+        # Gradient descent step
+        optimizer.step()
+
+        # Plot to tensorboard
+        writer.add_scalar("Training loss", loss, global_step=step)
+        step += 1
+
+model.eval()  # score in eval mode; only a slice, as the entire test data takes a while
+score = bleu(test_data[1:100], model, german, english, device)
+print(f"Bleu score {score * 100:.2f}")
--- a/ML/Pytorch/more_advanced/Seq2Seq_attention/utils.py
+++ b/ML/Pytorch/more_advanced/Seq2Seq_attention/utils.py
@@ -0,0 +1,79 @@
+import torch
+import spacy
+from torchtext.data.metrics import bleu_score
+import sys
+
+
+def translate_sentence(model, sentence, german, english, device, max_length=50):
+    """Greedily translate a German string or token list into English tokens."""
+    # Lower-case every token (which is what our vocab is). The german spacy model
+    # is only loaded when a raw string has to be tokenized.
+    if isinstance(sentence, str):
+        spacy_ger = spacy.load("de")
+        tokens = [token.text.lower() for token in spacy_ger(sentence)]
+    else:
+        tokens = [token.lower() for token in sentence]
+
+    # Add <SOS> and <EOS> in beginning and end respectively
+    tokens.insert(0, german.init_token)
+    tokens.append(german.eos_token)
+
+    # Go through each german token and convert to an index
+    text_to_indices = [german.vocab.stoi[token] for token in tokens]
+
+    # Convert to Tensor
+    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)
+
+    # Build encoder hidden, cell state
+    with torch.no_grad():
+        outputs_encoder, hiddens, cells = model.encoder(sentence_tensor)
+
+    outputs = [english.vocab.stoi["<sos>"]]
+    eos_idx = english.vocab.stoi["<eos>"]
+
+    for _ in range(max_length):
+        previous_word = torch.LongTensor([outputs[-1]]).to(device)
+
+        with torch.no_grad():
+            output, hiddens, cells = model.decoder(
+                previous_word, outputs_encoder, hiddens, cells
+            )
+            best_guess = output.argmax(1).item()
+
+        outputs.append(best_guess)
+
+        # Model predicts it's the end of the sentence
+        if best_guess == eos_idx:
+            break
+
+    translated_sentence = [english.vocab.itos[idx] for idx in outputs]
+
+    return translated_sentence[1:]  # remove start token
+
+
+def bleu(data, model, german, english, device):
+    """Translate the src of every example in data and return the corpus BLEU score."""
+    targets = []
+    outputs = []
+    for example in data:
+        src = vars(example)["src"]
+        trg = vars(example)["trg"]
+
+        prediction = translate_sentence(model, src, german, english, device)
+        if prediction and prediction[-1] == english.eos_token:
+            prediction = prediction[:-1]  # strip <eos>; it is absent if max_length was hit
+        targets.append([trg])
+        outputs.append(prediction)
+
+    return bleu_score(outputs, targets)
+
+
+def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
+    print("=> Saving checkpoint")
+    torch.save(state, filename)
+
+
+def load_checkpoint(checkpoint, model, optimizer):
+    print("=> Loading checkpoint")
+    model.load_state_dict(checkpoint["state_dict"])
+    optimizer.load_state_dict(checkpoint["optimizer"])
--- a/ML/Pytorch/more_advanced/image_captioning/README.md
+++ b/ML/Pytorch/more_advanced/image_captioning/README.md
@@ -0,0 +1,12 @@
+### Image Captioning
+
+Download the dataset used: https://www.kaggle.com/dataset/e1cd22253a9b23b073794872bf565648ddbe4f17e7fa9e74766ad3707141adeb
+Then place the images folder and captions.txt inside a folder named Flickr8k.
+
+train.py: For training the network
+
+model.py: creating the encoderCNN, decoderRNN and hooking them together
+
+get_loader.py: Loading the data, creating vocabulary
+
+utils.py: Load model, save model, printing few test cases downloaded online
--- a/ML/Pytorch/more_advanced/image_captioning/get_loader.py
+++ b/ML/Pytorch/more_advanced/image_captioning/get_loader.py
@@ -0,0 +1,142 @@
+import os  # when loading file paths
+import pandas as pd  # for lookup in annotation file
+import spacy  # for tokenizer
+import torch
+from torch.nn.utils.rnn import pad_sequence  # pad batch
+from torch.utils.data import DataLoader, Dataset
+from PIL import Image  # Load img
+import torchvision.transforms as transforms
+
+
+# We want to convert text -> numerical values
+# 1. We need a Vocabulary mapping each word to a index
+# 2. We need to setup a Pytorch dataset to load the data
+# 3. Setup padding of every batch (all examples should be
+#    of same seq_len and setup dataloader)
+# Note that loading the image is very easy compared to the text!
+
+# Download with: python -m spacy download en
+# The English pipeline is loaded once at import time; Vocabulary.tokenizer_eng
+# uses its tokenizer.
+spacy_eng = spacy.load("en")
+
+
+class Vocabulary:
+    def __init__(self, freq_threshold):
+        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
+        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
+        self.freq_threshold = freq_threshold
+
+    def __len__(self):
+        return len(self.itos)
+
+    @staticmethod
+    def tokenizer_eng(text):
+        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
+
+    def build_vocabulary(self, sentence_list):
+        frequencies = {}
+        idx = 4
+
+        for sentence in sentence_list:
+            for word in self.tokenizer_eng(sentence):
+                if word not in frequencies:
+                    frequencies[word] = 1
+
+                else:
+                    frequencies[word] += 1
+
+                if frequencies[word] == self.freq_threshold:
+                    self.stoi[word] = idx
+                    self.itos[idx] = word
+                    idx += 1
+
+    def numericalize(self, text):
+        tokenized_text = self.tokenizer_eng(text)
+
+        return [
+            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
+            for token in tokenized_text
+        ]
+
+
+class FlickrDataset(Dataset):
+    def __init__(self, root_dir, captions_file, transform=None, freq_threshold=5):
+        self.root_dir = root_dir
+        self.df = pd.read_csv(captions_file)
+        self.transform = transform
+
+        # Get img, caption columns
+        self.imgs = self.df["image"]
+        self.captions = self.df["caption"]
+
+        # Initialize vocabulary and build vocab
+        self.vocab = Vocabulary(freq_threshold)
+        self.vocab.build_vocabulary(self.captions.tolist())
+
+    def __len__(self):
+        return len(self.df)
+
+    def __getitem__(self, index):
+        caption = self.captions[index]
+        img_id = self.imgs[index]
+        img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")
+
+        if self.transform is not None:
+            img = self.transform(img)
+
+        numericalized_caption = [self.vocab.stoi["<SOS>"]]
+        numericalized_caption += self.vocab.numericalize(caption)
+        numericalized_caption.append(self.vocab.stoi["<EOS>"])
+
+        return img, torch.tensor(numericalized_caption)
+
+
+class MyCollate:
+    def __init__(self, pad_idx):
+        self.pad_idx = pad_idx
+
+    def __call__(self, batch):
+        imgs = [item[0].unsqueeze(0) for item in batch]
+        imgs = torch.cat(imgs, dim=0)
+        targets = [item[1] for item in batch]
+        targets = pad_sequence(targets, batch_first=False, padding_value=self.pad_idx)
+
+        return imgs, targets
+
+
+def get_loader(
+    root_folder,
+    annotation_file,
+    transform,
+    batch_size=32,
+    num_workers=8,
+    shuffle=True,
+    pin_memory=True,
+):
+    dataset = FlickrDataset(root_folder, annotation_file, transform=transform)
+
+    pad_idx = dataset.vocab.stoi["<PAD>"]
+
+    loader = DataLoader(
+        dataset=dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=shuffle,
+        pin_memory=pin_memory,
+        collate_fn=MyCollate(pad_idx=pad_idx),
+    )
+
+    return loader, dataset
+
+
+if __name__ == "__main__":
+    transform = transforms.Compose(
+        [transforms.Resize((224, 224)), transforms.ToTensor(),]
+    )
+
+    loader, dataset = get_loader(
+        "flickr8k/images/", "flickr8k/captions.txt", transform=transform
+    )
+
+    for idx, (imgs, captions) in enumerate(loader):
+        print(imgs.shape)
+        print(captions.shape)
--- a/ML/Pytorch/more_advanced/image_captioning/model.py
+++ b/ML/Pytorch/more_advanced/image_captioning/model.py
@@ -0,0 +1,66 @@
+import torch
+import torch.nn as nn
+import statistics
+import torchvision.models as models
+
+
+class EncoderCNN(nn.Module):
+    def __init__(self, embed_size, train_CNN=False):
+        super(EncoderCNN, self).__init__()
+        self.train_CNN = train_CNN
+        self.inception = models.inception_v3(pretrained=True, aux_logits=False)
+        self.inception.fc = nn.Linear(self.inception.fc.in_features, embed_size)
+        self.relu = nn.ReLU()
+        self.times = []
+        self.dropout = nn.Dropout(0.5)
+
+    def forward(self, images):
+        features = self.inception(images)
+        return self.dropout(self.relu(features))
+
+
+class DecoderRNN(nn.Module):
+    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
+        super(DecoderRNN, self).__init__()
+        self.embed = nn.Embedding(vocab_size, embed_size)
+        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
+        self.linear = nn.Linear(hidden_size, vocab_size)
+        self.dropout = nn.Dropout(0.5)
+
+    def forward(self, features, captions):
+        embeddings = self.dropout(self.embed(captions))
+        embeddings = torch.cat((features.unsqueeze(0), embeddings), dim=0)
+        hiddens, _ = self.lstm(embeddings)
+        outputs = self.linear(hiddens)
+        return outputs
+
+
+class CNNtoRNN(nn.Module):
+    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
+        super(CNNtoRNN, self).__init__()
+        self.encoderCNN = EncoderCNN(embed_size)
+        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)
+
+    def forward(self, images, captions):
+        features = self.encoderCNN(images)
+        outputs = self.decoderRNN(features, captions)
+        return outputs
+
+    def caption_image(self, image, vocabulary, max_length=50):
+        result_caption = []
+
+        with torch.no_grad():
+            x = self.encoderCNN(image).unsqueeze(0)
+            states = None
+
+            for _ in range(max_length):
+                hiddens, states = self.decoderRNN.lstm(x, states)
+                output = self.decoderRNN.linear(hiddens.squeeze(0))
+                predicted = output.argmax(1)
+                result_caption.append(predicted.item())
+                x = self.decoderRNN.embed(predicted).unsqueeze(0)
+
+                if vocabulary.itos[predicted.item()] == "<EOS>":
+                    break
+
+        return [vocabulary.itos[idx] for idx in result_caption]
--- a/ML/Pytorch/more_advanced/image_captioning/test_examples/boat.png
+++ b/ML/Pytorch/more_advanced/image_captioning/test_examples/boat.png
--- a/ML/Pytorch/more_advanced/image_captioning/test_examples/bus.png
+++ b/ML/Pytorch/more_advanced/image_captioning/test_examples/bus.png
--- a/ML/Pytorch/more_advanced/image_captioning/test_examples/child.jpg
+++ b/ML/Pytorch/more_advanced/image_captioning/test_examples/child.jpg
--- a/ML/Pytorch/more_advanced/image_captioning/test_examples/dog.jpg
+++ b/ML/Pytorch/more_advanced/image_captioning/test_examples/dog.jpg
--- a/ML/Pytorch/more_advanced/image_captioning/test_examples/horse.png
+++ b/ML/Pytorch/more_advanced/image_captioning/test_examples/horse.png
--- a/ML/Pytorch/more_advanced/image_captioning/train.py
+++ b/ML/Pytorch/more_advanced/image_captioning/train.py
@@ -0,0 +1,96 @@
+import torch
+from tqdm import tqdm
+import torch.nn as nn
+import torch.optim as optim
+import torchvision.transforms as transforms
+from torch.utils.tensorboard import SummaryWriter
+from utils import save_checkpoint, load_checkpoint, print_examples
+from get_loader import get_loader
+from model import CNNtoRNN
+
+
+def train():
+    transform = transforms.Compose(
+        [
+            transforms.Resize((356, 356)),
+            transforms.RandomCrop((299, 299)),
+            transforms.ToTensor(),
+            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+        ]
+    )
+
+    train_loader, dataset = get_loader(
+        root_folder="flickr8k/images",
+        annotation_file="flickr8k/captions.txt",
+        transform=transform,
+        num_workers=2,
+    )
+
+    torch.backends.cudnn.benchmark = True
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    load_model = False
+    save_model = False
+    train_CNN = False
+
+    # Hyperparameters
+    embed_size = 256
+    hidden_size = 256
+    vocab_size = len(dataset.vocab)
+    num_layers = 1
+    learning_rate = 3e-4
+    num_epochs = 100
+
+    # for tensorboard
+    writer = SummaryWriter("runs/flickr")
+    step = 0
+
+    # initialize model, loss etc
+    model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
+    criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi["<PAD>"])
+    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+    # Only finetune the CNN
+    for name, param in model.encoderCNN.inception.named_parameters():
+        if "fc.weight" in name or "fc.bias" in name:
+            param.requires_grad = True
+        else:
+            param.requires_grad = train_CNN
+
+    if load_model:
+        step = load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
+
+    model.train()
+
+    for epoch in range(num_epochs):
+        # Uncomment the line below to see a couple of test cases
+        # print_examples(model, device, dataset)
+
+        if save_model:
+            checkpoint = {
+                "state_dict": model.state_dict(),
+                "optimizer": optimizer.state_dict(),
+                "step": step,
+            }
+            save_checkpoint(checkpoint)
+
+        for idx, (imgs, captions) in tqdm(
+            enumerate(train_loader), total=len(train_loader), leave=False
+        ):
+            imgs = imgs.to(device)
+            captions = captions.to(device)
+
+            outputs = model(imgs, captions[:-1])
+            loss = criterion(
+                outputs.reshape(-1, outputs.shape[2]), captions.reshape(-1)
+            )
+
+            writer.add_scalar("Training loss", loss.item(), global_step=step)
+            step += 1
+
+            optimizer.zero_grad()
+            loss.backward(loss)
+            optimizer.step()
+
+
+# Start training only when this file is executed as a script, not on import
+if __name__ == "__main__":
+    train()
--- a/ML/Pytorch/more_advanced/image_captioning/utils.py
+++ b/ML/Pytorch/more_advanced/image_captioning/utils.py
@@ -0,0 +1,69 @@
+import torch
+import torchvision.transforms as transforms
+from PIL import Image
+
+
+def print_examples(model, device, dataset):
+    transform = transforms.Compose(
+        [
+            transforms.Resize((299, 299)),
+            transforms.ToTensor(),
+            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+        ]
+    )
+
+    model.eval()
+    test_img1 = transform(Image.open("test_examples/dog.jpg").convert("RGB")).unsqueeze(
+        0
+    )
+    print("Example 1 CORRECT: Dog on a beach by the ocean")
+    print(
+        "Example 1 OUTPUT: "
+        + " ".join(model.caption_image(test_img1.to(device), dataset.vocab))
+    )
+    test_img2 = transform(
+        Image.open("test_examples/child.jpg").convert("RGB")
+    ).unsqueeze(0)
+    print("Example 2 CORRECT: Child holding red frisbee outdoors")
+    print(
+        "Example 2 OUTPUT: "
+        + " ".join(model.caption_image(test_img2.to(device), dataset.vocab))
+    )
+    test_img3 = transform(Image.open("test_examples/bus.png").convert("RGB")).unsqueeze(
+        0
+    )
+    print("Example 3 CORRECT: Bus driving by parked cars")
+    print(
+        "Example 3 OUTPUT: "
+        + " ".join(model.caption_image(test_img3.to(device), dataset.vocab))
+    )
+    test_img4 = transform(
+        Image.open("test_examples/boat.png").convert("RGB")
+    ).unsqueeze(0)
+    print("Example 4 CORRECT: A small boat in the ocean")
+    print(
+        "Example 4 OUTPUT: "
+        + " ".join(model.caption_image(test_img4.to(device), dataset.vocab))
+    )
+    test_img5 = transform(
+        Image.open("test_examples/horse.png").convert("RGB")
+    ).unsqueeze(0)
+    print("Example 5 CORRECT: A cowboy riding a horse in the desert")
+    print(
+        "Example 5 OUTPUT: "
+        + " ".join(model.caption_image(test_img5.to(device), dataset.vocab))
+    )
+    model.train()
+
+
+def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
+    print("=> Saving checkpoint")
+    torch.save(state, filename)
+
+
+def load_checkpoint(checkpoint, model, optimizer):
+    print("=> Loading checkpoint")
+    model.load_state_dict(checkpoint["state_dict"])
+    optimizer.load_state_dict(checkpoint["optimizer"])
+    step = checkpoint["step"]
+    return step
--- a/ML/Pytorch/more_advanced/neuralstyle/annahathaway.png
+++ b/ML/Pytorch/more_advanced/neuralstyle/annahathaway.png
--- a/ML/Pytorch/more_advanced/neuralstyle/nst.py
+++ b/ML/Pytorch/more_advanced/neuralstyle/nst.py
@@ -0,0 +1,112 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from PIL import Image
+import torchvision.transforms as transforms
+import torchvision.models as models
+from torchvision.utils import save_image
+
+
+class VGG(nn.Module):
+    def __init__(self):
+        super(VGG, self).__init__()
+        # The first number x in convx_y gets added by 1 after it has gone
+        # through a maxpool, and the second y if we have several conv layers
+        # in between a max pool. These strings (0, 5, 10, ..) then correspond
+        # to conv1_1, conv2_1, conv3_1, conv4_1, conv5_1 mentioned in NST paper
+        self.chosen_features = ["0", "5", "10", "19", "28"]
+
+        # We don't need to run anything further than conv5_1 (the 28th module in vgg)
+        # Since remember, we dont actually care about the output of VGG: the only thing
+        # that is modified is the generated image (i.e, the input).
+        self.model = models.vgg19(pretrained=True).features[:29]
+
+    def forward(self, x):
+        # Store relevant features
+        features = []
+
+        # Go through each layer in model, if the layer is in the chosen_features,
+        # store it in features. At the end we'll just return all the activations
+        # for the specific layers we have in chosen_features
+        for layer_num, layer in enumerate(self.model):
+            x = layer(x)
+
+            if str(layer_num) in self.chosen_features:
+                features.append(x)
+
+        return features
+
+
+def load_image(image_name):
+    image = Image.open(image_name)
+    image = loader(image).unsqueeze(0)
+    return image.to(device)
+
+
+# Run on the GPU when one is available
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Both images are resized to imsize x imsize by the loader below
+imsize = 356
+
+# Here we may want to use the Normalization constants used in the original
+# VGG network (to get similar values net was originally trained on), but
+# I found it didn't matter too much so I didn't end up using it. If you
+# use it make sure to normalize back so the images don't look weird.
+loader = transforms.Compose(
+    [
+        transforms.Resize((imsize, imsize)),
+        transforms.ToTensor(),
+        # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    ]
+)
+
+original_img = load_image("annahathaway.png")
+style_img = load_image("style.jpg")
+
+# The generated image is initialized either as white noise or as a clone of
+# the original image. Clone seemed to work better for me.
+
+# generated = torch.randn(original_img.data.shape, device=device, requires_grad=True)
+generated = original_img.clone().requires_grad_(True)
+# eval() because the network is only used as a fixed feature extractor
+model = VGG().to(device).eval()
+
+# Hyperparameters
+total_steps = 6000
+learning_rate = 0.001
+alpha = 1
+beta = 0.01
+# Only `generated` is handed to the optimizer, so the image itself is what
+# gets updated.
+optimizer = optim.Adam([generated], lr=learning_rate)
+
+for step in range(total_steps):
+    # Obtain the convolution features in specifically chosen layers
+    generated_features = model(generated)
+    original_img_features = model(original_img)
+    style_features = model(style_img)
+
+    # Loss is 0 initially
+    style_loss = original_loss = 0
+
+    # iterate through all the features for the chosen layers
+    for gen_feature, orig_feature, style_feature in zip(
+        generated_features, original_img_features, style_features
+    ):
+
+        # batch_size will just be 1
+        batch_size, channel, height, width = gen_feature.shape
+        original_loss += torch.mean((gen_feature - orig_feature) ** 2)
+        # Compute Gram Matrix of generated
+        G = gen_feature.view(channel, height * width).mm(
+            gen_feature.view(channel, height * width).t()
+        )
+        # Compute Gram Matrix of Style
+        A = style_feature.view(channel, height * width).mm(
+            style_feature.view(channel, height * width).t()
+        )
+        style_loss += torch.mean((G - A) ** 2)
+
+    total_loss = alpha * original_loss + beta * style_loss
+    optimizer.zero_grad()
+    total_loss.backward()
+    optimizer.step()
+
+    if step % 200 == 0:
+        print(total_loss)
+        save_image(generated, "generated.png")
--- a/ML/Pytorch/more_advanced/neuralstyle/output/img1.png
+++ b/ML/Pytorch/more_advanced/neuralstyle/output/img1.png
--- a/ML/Pytorch/more_advanced/neuralstyle/output/img2.png
+++ b/ML/Pytorch/more_advanced/neuralstyle/output/img2.png
--- a/ML/Pytorch/more_advanced/neuralstyle/output/img3.png
+++ b/ML/Pytorch/more_advanced/neuralstyle/output/img3.png
--- a/ML/Pytorch/more_advanced/neuralstyle/output/img4.png
+++ b/ML/Pytorch/more_advanced/neuralstyle/output/img4.png
--- a/ML/Pytorch/more_advanced/neuralstyle/output/img5.png
+++ b/ML/Pytorch/more_advanced/neuralstyle/output/img5.png
--- a/ML/Pytorch/more_advanced/neuralstyle/output/img6.png
+++ b/ML/Pytorch/more_advanced/neuralstyle/output/img6.png
--- a/ML/Pytorch/more_advanced/neuralstyle/output/img7.png
+++ b/ML/Pytorch/more_advanced/neuralstyle/output/img7.png
--- a/ML/Pytorch/more_advanced/neuralstyle/output/img8.png
+++ b/ML/Pytorch/more_advanced/neuralstyle/output/img8.png
--- a/ML/Pytorch/more_advanced/neuralstyle/style.jpg
+++ b/ML/Pytorch/more_advanced/neuralstyle/style.jpg
--- a/ML/Pytorch/more_advanced/neuralstyle/styles/style1.jpg
+++ b/ML/Pytorch/more_advanced/neuralstyle/styles/style1.jpg
--- a/ML/Pytorch/more_advanced/neuralstyle/styles/style2.jpg
+++ b/ML/Pytorch/more_advanced/neuralstyle/styles/style2.jpg
--- a/ML/Pytorch/more_advanced/neuralstyle/styles/style3.jpg
+++ b/ML/Pytorch/more_advanced/neuralstyle/styles/style3.jpg
--- a/ML/Pytorch/more_advanced/neuralstyle/styles/style4.jpg
+++ b/ML/Pytorch/more_advanced/neuralstyle/styles/style4.jpg
--- a/ML/Pytorch/more_advanced/neuralstyle/styles/style5.jpg
+++ b/ML/Pytorch/more_advanced/neuralstyle/styles/style5.jpg
--- a/ML/Pytorch/more_advanced/neuralstyle/styles/style6.jpg
+++ b/ML/Pytorch/more_advanced/neuralstyle/styles/style6.jpg
--- a/ML/Pytorch/more_advanced/neuralstyle/styles/style7.jpg
+++ b/ML/Pytorch/more_advanced/neuralstyle/styles/style7.jpg
--- a/ML/Pytorch/more_advanced/neuralstyle/styles/style8.jpg
+++ b/ML/Pytorch/more_advanced/neuralstyle/styles/style8.jpg
--- a/ML/Pytorch/more_advanced/seq2seq_transformer/seq2seq_transformer.py
+++ b/ML/Pytorch/more_advanced/seq2seq_transformer/seq2seq_transformer.py
@@ -0,0 +1,255 @@
+"""
+Seq2Seq using Transformers on the Multi30k
+dataset. In this video I utilize Pytorch
+inbuilt Transformer modules, and have a
+separate implementation for Transformers
+from scratch. Training this model for a
+while (not too long) gives a BLEU score
+of ~35, and I think training for longer
+would give even better results.
+
+"""
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import spacy
+from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
+from torch.utils.tensorboard import SummaryWriter
+from torchtext.datasets import Multi30k
+from torchtext.data import Field, BucketIterator
+
+"""
+To install spacy languages do:
+python -m spacy download en
+python -m spacy download de
+"""
+# spaCy pipelines, loaded once at import time; tokenize_ger and tokenize_eng
+# use their tokenizers.
+spacy_ger = spacy.load("de")
+spacy_eng = spacy.load("en")
+
+
+def tokenize_ger(text):
+    return [tok.text for tok in spacy_ger.tokenizer(text)]
+
+
+def tokenize_eng(text):
+    return [tok.text for tok in spacy_eng.tokenizer(text)]
+
+
+# Source-language field: lower-cased tokens wrapped in <sos> ... <eos>
+german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")
+
+# Target-language field, configured the same way
+english = Field(
+    tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>"
+)
+
+# German (.de) is the source and English (.en) the target
+train_data, valid_data, test_data = Multi30k.splits(
+    exts=(".de", ".en"), fields=(german, english)
+)
+
+# Vocabularies are built from the training split only
+german.build_vocab(train_data, max_size=10000, min_freq=2)
+english.build_vocab(train_data, max_size=10000, min_freq=2)
+
+
+class Transformer(nn.Module):
+    def __init__(
+        self,
+        embedding_size,
+        src_vocab_size,
+        trg_vocab_size,
+        src_pad_idx,
+        num_heads,
+        num_encoder_layers,
+        num_decoder_layers,
+        forward_expansion,
+        dropout,
+        max_len,
+        device,
+    ):
+        super(Transformer, self).__init__()
+        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
+        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
+        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
+        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)
+
+        self.device = device
+        self.transformer = nn.Transformer(
+            embedding_size,
+            num_heads,
+            num_encoder_layers,
+            num_decoder_layers,
+            forward_expansion,
+            dropout,
+        )
+        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
+        self.dropout = nn.Dropout(dropout)
+        self.src_pad_idx = src_pad_idx
+
+    def make_src_mask(self, src):
+        src_mask = src.transpose(0, 1) == self.src_pad_idx
+
+        # (N, src_len)
+        return src_mask.to(self.device)
+
+    def forward(self, src, trg):
+        src_seq_length, N = src.shape
+        trg_seq_length, N = trg.shape
+
+        src_positions = (
+            torch.arange(0, src_seq_length)
+            .unsqueeze(1)
+            .expand(src_seq_length, N)
+            .to(self.device)
+        )
+
+        trg_positions = (
+            torch.arange(0, trg_seq_length)
+            .unsqueeze(1)
+            .expand(trg_seq_length, N)
+            .to(self.device)
+        )
+
+        embed_src = self.dropout(
+            (self.src_word_embedding(src) + self.src_position_embedding(src_positions))
+        )
+        embed_trg = self.dropout(
+            (self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions))
+        )
+
+        src_padding_mask = self.make_src_mask(src)
+        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
+            self.device
+        )
+
+        out = self.transformer(
+            embed_src,
+            embed_trg,
+            src_key_padding_mask=src_padding_mask,
+            tgt_mask=trg_mask,
+        )
+        out = self.fc_out(out)
+        return out
+
+
+# We're ready to define everything we need for training our Seq2Seq model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+load_model = True
+save_model = True
+
+# Training hyperparameters
+num_epochs = 10000
+learning_rate = 3e-4
+batch_size = 32
+
+# Model hyperparameters
+src_vocab_size = len(german.vocab)
+trg_vocab_size = len(english.vocab)
+embedding_size = 512
+num_heads = 8
+num_encoder_layers = 3
+num_decoder_layers = 3
+dropout = 0.10
+max_len = 100
+forward_expansion = 4
+src_pad_idx = english.vocab.stoi["<pad>"]
+
+# Tensorboard to get nice loss plot
+writer = SummaryWriter("runs/loss_plot")
+step = 0
+
+# Batches are sorted by source length within each batch
+train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
+    (train_data, valid_data, test_data),
+    batch_size=batch_size,
+    sort_within_batch=True,
+    sort_key=lambda x: len(x.src),
+    device=device,
+)
+
+model = Transformer(
+    embedding_size,
+    src_vocab_size,
+    trg_vocab_size,
+    src_pad_idx,
+    num_heads,
+    num_encoder_layers,
+    num_decoder_layers,
+    forward_expansion,
+    dropout,
+    max_len,
+    device,
+).to(device)
+
+optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+# Scales the learning rate by `factor` once the monitored value has stopped
+# improving for `patience` scheduler steps
+scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+    optimizer, factor=0.1, patience=10, verbose=True
+)
+
+# Target padding positions are excluded from the loss
+pad_idx = english.vocab.stoi["<pad>"]
+criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
+
+if load_model:
+    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
+
+# Fixed example sentence that is translated at the start of every epoch
+sentence = "ein pferd geht unter einer brücke neben einem boot."
+
+for epoch in range(num_epochs):
+    print(f"[Epoch {epoch} / {num_epochs}]")
+
+    if save_model:
+        checkpoint = {
+            "state_dict": model.state_dict(),
+            "optimizer": optimizer.state_dict(),
+        }
+        save_checkpoint(checkpoint)
+
+    model.eval()
+    translated_sentence = translate_sentence(
+        model, sentence, german, english, device, max_length=50
+    )
+
+    print(f"Translated example sentence: \n {translated_sentence}")
+    model.train()
+    losses = []
+
+    for batch_idx, batch in enumerate(train_iterator):
+        # Get input and targets and get to cuda
+        inp_data = batch.src.to(device)
+        target = batch.trg.to(device)
+
+        # Forward prop
+        output = model(inp_data, target[:-1, :])
+
+        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
+        # doesn't take input in that form. For example if we have MNIST we want to have
+        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
+        # way that we have output_words * batch_size that we want to send in into
+        # our cost function, so we need to do some reshapin.
+        # Let's also remove the start token while we're at it
+        output = output.reshape(-1, output.shape[2])
+        target = target[1:].reshape(-1)
+
+        optimizer.zero_grad()
+
+        loss = criterion(output, target)
+        losses.append(loss.item())
+
+        # Back prop
+        loss.backward()
+        # Clip to avoid exploding gradient issues, makes sure grads are
+        # within a healthy range
+        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
+
+        # Gradient descent step
+        optimizer.step()
+
+        # plot to tensorboard
+        writer.add_scalar("Training loss", loss, global_step=step)
+        step += 1
+
+    mean_loss = sum(losses) / len(losses)
+    scheduler.step(mean_loss)
+
+# running on entire test data takes a while
+score = bleu(test_data[1:100], model, german, english, device)
+print(f"Bleu score {score * 100:.2f}")
--- a/ML/Pytorch/more_advanced/seq2seq_transformer/utils.py
+++ b/ML/Pytorch/more_advanced/seq2seq_transformer/utils.py
@@ -0,0 +1,70 @@
+import torch
+import spacy
+from torchtext.data.metrics import bleu_score
+import sys
+
+
+def translate_sentence(model, sentence, german, english, device, max_length=50):
+    # Load german tokenizer
+    spacy_ger = spacy.load("de")
+
+    # Create tokens using spacy and everything in lower case (which is what our vocab is)
+    if type(sentence) == str:
+        tokens = [token.text.lower() for token in spacy_ger(sentence)]
+    else:
+        tokens = [token.lower() for token in sentence]
+
+    # Add <SOS> and <EOS> in beginning and end respectively
+    tokens.insert(0, german.init_token)
+    tokens.append(german.eos_token)
+
+    # Go through each german token and convert to an index
+    text_to_indices = [german.vocab.stoi[token] for token in tokens]
+
+    # Convert to Tensor
+    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)
+
+    outputs = [english.vocab.stoi["<sos>"]]
+    for i in range(max_length):
+        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)
+
+        with torch.no_grad():
+            output = model(sentence_tensor, trg_tensor)
+
+        best_guess = output.argmax(2)[-1, :].item()
+        outputs.append(best_guess)
+
+        if best_guess == english.vocab.stoi["<eos>"]:
+            break
+
+    translated_sentence = [english.vocab.itos[idx] for idx in outputs]
+    # remove start token
+    return translated_sentence[1:]
+
+
+def bleu(data, model, german, english, device):
+    targets = []
+    outputs = []
+
+    for example in data:
+        src = vars(example)["src"]
+        trg = vars(example)["trg"]
+
+        prediction = translate_sentence(model, src, german, english, device)
+        prediction = prediction[:-1]  # remove <eos> token
+
+        targets.append([trg])
+        outputs.append(prediction)
+
+    return bleu_score(outputs, targets)
+
+
+def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
+    print("=> Saving checkpoint")
+    torch.save(state, filename)
+
+
+def load_checkpoint(checkpoint, model, optimizer):
+    print("=> Loading checkpoint")
+    model.load_state_dict(checkpoint["state_dict"])
+    optimizer.load_state_dict(checkpoint["optimizer"])
--- a/ML/Pytorch/more_advanced/torchtext/mydata/test.csv
+++ b/ML/Pytorch/more_advanced/torchtext/mydata/test.csv
@@ -0,0 +1,4 @@
+name,quote,score
+Jocko,You must own everything in your world. There is no one else to blame.,1
+Bruce Lee,"Do not pray for an easy life, pray for the strength to endure a difficult one.",1
+Potato guy,"Stand tall, and rice like a potato!",0
--- a/ML/Pytorch/more_advanced/torchtext/mydata/test.json
+++ b/ML/Pytorch/more_advanced/torchtext/mydata/test.json
@@ -0,0 +1,3 @@
+{"name": "Jocko", "quote": "You must own everything in your world. There is no one else to blame.", "score":1}
+{"name": "Bruce", "quote": "Do not pray for an easy life, pray for the strength to endure a difficult one.", "score":1}
+{"name": "Random Potato", "quote": "Stand tall, and rice like a potato!", "score":0}
--- a/ML/Pytorch/more_advanced/torchtext/mydata/test.tsv
+++ b/ML/Pytorch/more_advanced/torchtext/mydata/test.tsv
@@ -0,0 +1,4 @@
+name	quote	score
+Jocko	You must own everything in your world. There is no one else to blame.	1
+Bruce Lee	Do not pray for an easy life, pray for the strength to endure a difficult one.	1
+Potato guy	Stand tall, and rice like a potato!	0
--- a/ML/Pytorch/more_advanced/torchtext/mydata/train.csv
+++ b/ML/Pytorch/more_advanced/torchtext/mydata/train.csv
@@ -0,0 +1,4 @@
+name,quote,score
+Jocko,You must own everything in your world. There is no one else to blame.,1
+Bruce Lee,"Do not pray for an easy life, pray for the strength to endure a difficult one.",1
+Potato guy,"Stand tall, and rice like a potato!",0
--- a/ML/Pytorch/more_advanced/torchtext/mydata/train.json
+++ b/ML/Pytorch/more_advanced/torchtext/mydata/train.json
@@ -0,0 +1,3 @@
+{"name": "Jocko", "quote": "You must own everything in your world. There is no one else to blame.", "score":1}
+{"name": "Bruce", "quote": "Do not pray for an easy life, pray for the strength to endure a difficult one.", "score":1}
+{"name": "Random Potato", "quote": "Stand tall, and rice like a potato!", "score":0}
--- a/ML/Pytorch/more_advanced/torchtext/mydata/train.tsv
+++ b/ML/Pytorch/more_advanced/torchtext/mydata/train.tsv
@@ -0,0 +1,4 @@
+name	quote	score
+Jocko	You must own everything in your world. There is no one else to blame.	1
+Bruce Lee	Do not pray for an easy life, pray for the strength to endure a difficult one.	1
+Potato guy	Stand tall, and rice like a potato!	0
--- a/ML/Pytorch/more_advanced/torchtext/torchtext_tutorial1.py
+++ b/ML/Pytorch/more_advanced/torchtext/torchtext_tutorial1.py
@@ -0,0 +1,111 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import spacy
+from torchtext.data import Field, TabularDataset, BucketIterator
+
+######### Loading from JSON/CSV/TSV files #########
+
+# STEPS:
+# 1. Specify how preprocessing should be done -> Fields
+# 2. Use Dataset to load the data -> TabularDataset (JSON/CSV/TSV Files)
+# 3. Construct an iterator to do batching & padding -> BucketIterator
+
+# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
+_device_name = "cuda" if torch.cuda.is_available() else "cpu"
+device = torch.device(_device_name)
+
+# English spaCy pipeline; install it first with: python -m spacy download en
+spacy_en = spacy.load("en")
+
+
+def tokenize(text):
+    """Return the token strings that the spaCy tokenizer produces for ``text``."""
+    spacy_tokens = spacy_en.tokenizer(text)
+    return [token.text for token in spacy_tokens]
+
+
+# How each column is preprocessed: quotes are tokenized, lower-cased and mapped
+# through a vocabulary; scores are not sequential and bypass the vocabulary.
+quote = Field(
+    sequential=True,
+    use_vocab=True,
+    tokenize=tokenize,
+    lower=True,
+)
+score = Field(sequential=False, use_vocab=False)
+
+# Column name in the file -> (attribute name on each batch, Field to apply)
+fields = {
+    "quote": ("q", quote),
+    "score": ("s", score),
+}
+
+train_data, test_data = TabularDataset.splits(
+    path="mydata",
+    train="train.json",
+    test="test.json",
+    format="json",
+    fields=fields,
+)
+
+# To load the CSV or TSV variants instead, point train/test at
+# train.csv/test.csv (or train.tsv/test.tsv) and pass format="csv"
+# (or format="tsv") with the same path and fields.
+
+quote.build_vocab(train_data, max_size=10000, min_freq=1, vectors="glove.6B.100d")
+
+train_iterator, test_iterator = BucketIterator.splits(
+    (train_data, test_data), batch_size=2, device=device
+)
+
+######### Training a simple LSTM on this toy data of ours #########
+class RNN_LSTM(nn.Module):
+    """Embedding -> multi-layer LSTM -> linear head giving one logit per sequence.
+
+    Args:
+        input_size: number of rows in the embedding table (vocabulary size).
+        embed_size: width of each embedding vector.
+        hidden_size: width of the LSTM hidden and cell states.
+        num_layers: number of stacked LSTM layers.
+    """
+
+    def __init__(self, input_size, embed_size, hidden_size, num_layers):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+
+        self.embedding = nn.Embedding(input_size, embed_size)
+        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers)
+        self.fc_out = nn.Linear(hidden_size, 1)
+
+    def forward(self, x):
+        """Return a ``(batch, 1)`` tensor of logits.
+
+        ``x`` holds token indices laid out as ``(seq_len, batch)``: the LSTM is
+        built without ``batch_first`` and dimension 1 is used as the batch size.
+        """
+        embedded = self.embedding(x)
+
+        # Zero initial hidden and cell states, allocated with the same device
+        # and dtype as the embedded input so the module does not depend on a
+        # module-level ``device`` global and keeps working after ``.to(...)``.
+        state_shape = (self.num_layers, x.size(1), self.hidden_size)
+        h0 = embedded.new_zeros(state_shape)
+        c0 = embedded.new_zeros(state_shape)
+
+        outputs, _ = self.rnn(embedded, (h0, c0))
+
+        # Classify from the output of the last time step only.
+        prediction = self.fc_out(outputs[-1, :, :])
+
+        return prediction
+
+
+# Hyperparameters
+input_size = len(quote.vocab)  # one embedding row per vocabulary entry
+embedding_size = 100  # same width as the 100-d GloVe vectors requested for the vocab
+hidden_size = 512
+num_layers = 2
+learning_rate = 0.005
+num_epochs = 10
+
+# Build the network and place it on the selected device
+model = RNN_LSTM(
+    input_size=input_size,
+    embed_size=embedding_size,
+    hidden_size=hidden_size,
+    num_layers=num_layers,
+).to(device)
+
+# (NOT COVERED IN YOUTUBE VIDEO): overwrite the freshly initialised embedding
+# table with the pretrained vectors attached to the vocabulary
+pretrained_embeddings = quote.vocab.vectors
+model.embedding.weight.data.copy_(pretrained_embeddings)
+
+# Binary classification on raw logits, optimised with Adam
+criterion = nn.BCEWithLogitsLoss()
+optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+# Train Network
+for epoch in range(num_epochs):
+    for batch_idx, batch in enumerate(train_iterator):
+        # Move the token indices and the labels to the training device
+        data = batch.q.to(device=device)
+        targets = batch.s.to(device=device)
+
+        # Forward pass: drop the trailing size-1 dimension so the logits line
+        # up with the labels, which are cast to the logits' type for the loss
+        scores = model(data)
+        logits = scores.squeeze(1)
+        labels = targets.type_as(scores)
+        loss = criterion(logits, labels)
+
+        # Backward pass: clear stale gradients, then backpropagate
+        optimizer.zero_grad()
+        loss.backward()
+
+        # Parameter update
+        optimizer.step()
--- a/ML/Pytorch/more_advanced/torchtext/torchtext_tutorial2.py
+++ b/ML/Pytorch/more_advanced/torchtext/torchtext_tutorial2.py
@@ -0,0 +1,45 @@
+import spacy
+from torchtext.datasets import Multi30k
+from torchtext.data import Field, BucketIterator
+
+"""
+To install spacy languages use:
+python -m spacy download en
+python -m spacy download de
+"""
+
+# Load the English and German spaCy pipelines whose tokenizers are used by the
+# tokenize_* helpers below (install commands are listed in the note above).
+spacy_eng = spacy.load("en")
+spacy_ger = spacy.load("de")
+
+
+def tokenize_eng(text):
+    """Return the token strings the English spaCy tokenizer yields for ``text``."""
+    english_tokens = spacy_eng.tokenizer(text)
+    return [token.text for token in english_tokens]
+
+
+def tokenize_ger(text):
+    """Return the token strings the German spaCy tokenizer yields for ``text``."""
+    german_tokens = spacy_ger.tokenizer(text)
+    return [token.text for token in german_tokens]
+
+
+# One Field per language: same settings, different spaCy tokenizer.
+english = Field(
+    sequential=True,
+    use_vocab=True,
+    tokenize=tokenize_eng,
+    lower=True,
+)
+german = Field(
+    sequential=True,
+    use_vocab=True,
+    tokenize=tokenize_ger,
+    lower=True,
+)
+
+# German (.de) is listed first and paired with the german Field,
+# English (.en) second and paired with the english Field.
+train_data, validation_data, test_data = Multi30k.splits(
+    exts=(".de", ".en"),
+    fields=(german, english),
+)
+
+# Vocabularies are built from the training split only.
+english.build_vocab(train_data, max_size=10000, min_freq=2)
+german.build_vocab(train_data, max_size=10000, min_freq=2)
+
+# NOTE(review): the device is hard-coded to "cuda" here.
+train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
+    (train_data, validation_data, test_data),
+    batch_size=64,
+    device="cuda",
+)
+
+for batch in train_iterator:
+    print(batch)
+
+# string to integer (stoi)
+print(f'Index of the word (the) is: {english.vocab.stoi["the"]}')
+
+# print integer to string (itos)
+print(f"Word of the index (1612) is: {english.vocab.itos[1612]}")
+print(f"Word of the index (0) is: {english.vocab.itos[0]}")
--- a/ML/Pytorch/more_advanced/torchtext/torchtext_tutorial3.py
+++ b/ML/Pytorch/more_advanced/torchtext/torchtext_tutorial3.py
@@ -0,0 +1,64 @@
+import spacy
+import pandas as pd
+from torchtext.data import Field, BucketIterator, TabularDataset
+from sklearn.model_selection import train_test_split
+
+### Load data from two text files where each row is a sentence ###
+# Read each file inside a context manager so the handle is closed right away
+# instead of being left for the garbage collector.
+with open("train_WMT_english.txt", encoding="utf8") as english_file:
+    english_txt = english_file.read().split("\n")
+with open("train_WMT_german.txt", encoding="utf8") as german_file:
+    german_txt = german_file.read().split("\n")
+
+# Keep lines 1..99 of each file (index 0 is skipped); the two columns are
+# paired row by row.
+raw_data = {
+    "English": english_txt[1:100],
+    "German": german_txt[1:100],
+}
+
+df = pd.DataFrame(raw_data, columns=["English", "German"])
+
+# create train and test set
+train, test = train_test_split(df, test_size=0.1)
+
+# Get train, test data to json and csv format which can be read by torchtext
+# (JSON is written as one record per line)
+train.to_json("train.json", orient="records", lines=True)
+test.to_json("test.json", orient="records", lines=True)
+
+train.to_csv("train.csv", index=False)
+test.to_csv("test.csv", index=False)
+
+### Now we're back to where we were in previous Tutorials ###
+
+"""
+To install spacy languages use:
+python -m spacy download en
+python -m spacy download de
+"""
+
+# Load the English and German spaCy pipelines whose tokenizers are used by the
+# tokenize_* helpers below (install commands are listed in the note above).
+spacy_eng = spacy.load("en")
+spacy_ger = spacy.load("de")
+
+
+def tokenize_eng(text):
+    """Return the token strings the English spaCy tokenizer yields for ``text``."""
+    english_tokens = spacy_eng.tokenizer(text)
+    return [token.text for token in english_tokens]
+
+
+def tokenize_ger(text):
+    """Return the token strings the German spaCy tokenizer yields for ``text``."""
+    german_tokens = spacy_ger.tokenizer(text)
+    return [token.text for token in german_tokens]
+
+
+# One Field per language: same settings, different spaCy tokenizer.
+english = Field(
+    sequential=True,
+    use_vocab=True,
+    tokenize=tokenize_eng,
+    lower=True,
+)
+german = Field(
+    sequential=True,
+    use_vocab=True,
+    tokenize=tokenize_ger,
+    lower=True,
+)
+
+# Column name in the file -> (attribute name on each batch, Field to apply)
+fields = {
+    "English": ("eng", english),
+    "German": ("ger", german),
+}
+
+train_data, test_data = TabularDataset.splits(
+    path="",
+    train="train.json",
+    test="test.json",
+    format="json",
+    fields=fields,
+)
+
+# Vocabularies are built from the training split only.
+english.build_vocab(train_data, max_size=10000, min_freq=2)
+german.build_vocab(train_data, max_size=10000, min_freq=2)
+
+# NOTE(review): the device is hard-coded to "cuda" here.
+train_iterator, test_iterator = BucketIterator.splits(
+    (train_data, test_data),
+    batch_size=32,
+    device="cuda",
+)
+
+for batch in train_iterator:
+    print(batch)
--- a/ML/Pytorch/more_advanced/transformer_from_scratch/transformer_from_scratch.py
+++ b/ML/Pytorch/more_advanced/transformer_from_scratch/transformer_from_scratch.py
@@ -0,0 +1,291 @@
+"""
+A from scratch implementation of Transformer network,
+following the paper Attention is all you need with a
+few minor differences. I tried to make it as clear as
+possible to understand and also went through the code
+on my youtube channel!
+
+
+"""
+
+import torch
+import torch.nn as nn
+
+
+class SelfAttention(nn.Module):
+    """Multi-head attention over separate value, key and query inputs.
+
+    The embedding dimension is split into ``heads`` chunks of ``head_dim``
+    features. One bias-free ``head_dim -> head_dim`` projection per role
+    (values / keys / queries) is applied to every chunk, so each projection's
+    weights are shared between the heads.
+    """
+
+    def __init__(self, embed_size, heads):
+        super().__init__()
+        self.embed_size = embed_size
+        self.heads = heads
+        self.head_dim = embed_size // heads
+
+        assert (
+            self.head_dim * heads == embed_size
+        ), "Embedding size needs to be divisible by heads"
+
+        per_head = self.head_dim
+        self.values = nn.Linear(per_head, per_head, bias=False)
+        self.keys = nn.Linear(per_head, per_head, bias=False)
+        self.queries = nn.Linear(per_head, per_head, bias=False)
+        self.fc_out = nn.Linear(heads * per_head, embed_size)
+
+    def forward(self, values, keys, query, mask):
+        """Attend from ``query`` over ``keys`` / ``values``.
+
+        The inputs are reshaped as ``(N, length, embed_size)`` tensors. ``mask``
+        may be ``None``; otherwise positions where ``mask == 0`` are excluded
+        and it must broadcast against ``(N, heads, query_len, key_len)``.
+        Returns a tensor of shape ``(N, query_len, embed_size)``.
+        """
+        batch = query.shape[0]
+        value_len = values.shape[1]
+        key_len = keys.shape[1]
+        query_len = query.shape[1]
+
+        def split_heads(tensor, length):
+            # (N, length, embed_size) -> (N, length, heads, head_dim)
+            return tensor.reshape(batch, length, self.heads, self.head_dim)
+
+        projected_values = self.values(split_heads(values, value_len))
+        projected_keys = self.keys(split_heads(keys, key_len))
+        projected_queries = self.queries(split_heads(query, query_len))
+
+        # Dot product of every query with every key, per example and per head:
+        # (N, query_len, heads, head_dim) x (N, key_len, heads, head_dim)
+        #   -> (N, heads, query_len, key_len)
+        scores = torch.einsum("nqhd,nkhd->nhqk", projected_queries, projected_keys)
+
+        # A very large negative score turns into a weight of 0 after softmax.
+        if mask is not None:
+            scores = scores.masked_fill(mask == 0, float("-1e20"))
+
+        # Softmax over the key axis so each query's weights sum to 1. Note that
+        # the scale is sqrt(embed_size), not sqrt(head_dim).
+        weights = torch.softmax(scores / (self.embed_size ** 0.5), dim=3)
+
+        # Weighted sum of the values:
+        # (N, heads, query_len, key_len) x (N, value_len, heads, head_dim)
+        #   -> (N, query_len, heads, head_dim)
+        mixed = torch.einsum("nhql,nlhd->nqhd", weights, projected_values)
+
+        # Concatenate the heads again: (N, query_len, heads * head_dim)
+        merged = mixed.reshape(batch, query_len, self.heads * self.head_dim)
+
+        # Final projection keeps the shape (N, query_len, embed_size)
+        return self.fc_out(merged)
+
+
+class TransformerBlock(nn.Module):
+    """Attention followed by a position-wise feed-forward network.
+
+    Each of the two sub-layers is wrapped as
+    ``dropout(layer_norm(sublayer_output + sublayer_input))``.
+    """
+
+    def __init__(self, embed_size, heads, dropout, forward_expansion):
+        super().__init__()
+        self.attention = SelfAttention(embed_size, heads)
+        self.norm1 = nn.LayerNorm(embed_size)
+        self.norm2 = nn.LayerNorm(embed_size)
+
+        # The feed-forward network widens to forward_expansion * embed_size
+        # features and projects back down to embed_size.
+        inner_size = forward_expansion * embed_size
+        self.feed_forward = nn.Sequential(
+            nn.Linear(embed_size, inner_size),
+            nn.ReLU(),
+            nn.Linear(inner_size, embed_size),
+        )
+
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, value, key, query, mask):
+        """Run attention and the feed-forward network; the residual path follows ``query``."""
+        attended = self.attention(value, key, query, mask)
+
+        # Residual connection around attention, then normalization and dropout
+        x = self.dropout(self.norm1(attended + query))
+
+        # Same wrapping around the feed-forward sub-layer
+        return self.dropout(self.norm2(self.feed_forward(x) + x))
+
+
+class Encoder(nn.Module):
+    """Token + learned position embeddings followed by a stack of TransformerBlocks.
+
+    Args:
+        src_vocab_size: number of rows in the word embedding table.
+        embed_size: embedding width used throughout the stack.
+        num_layers: number of TransformerBlock layers.
+        heads: number of attention heads per layer.
+        device: stored on the instance for backward compatibility; the forward
+            pass takes the device from its input instead.
+        forward_expansion: width multiplier of each block's feed-forward network.
+        dropout: dropout probability.
+        max_length: number of rows in the position embedding table, i.e. the
+            longest sequence that can be embedded.
+    """
+
+    def __init__(
+        self,
+        src_vocab_size,
+        embed_size,
+        num_layers,
+        heads,
+        device,
+        forward_expansion,
+        dropout,
+        max_length,
+    ):
+
+        super().__init__()
+        self.embed_size = embed_size
+        self.device = device
+        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
+        self.position_embedding = nn.Embedding(max_length, embed_size)
+
+        self.layers = nn.ModuleList(
+            [
+                TransformerBlock(
+                    embed_size,
+                    heads,
+                    dropout=dropout,
+                    forward_expansion=forward_expansion,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x, mask):
+        """Encode token indices ``x`` of shape ``(N, seq_length)``.
+
+        ``mask`` is passed unchanged to every layer's attention. Returns a
+        tensor of shape ``(N, seq_length, embed_size)``.
+        """
+        N, seq_length = x.shape
+
+        # Build the position indices directly on the input's device. Relying on
+        # the device given at construction time breaks as soon as the module is
+        # moved with .to(...) without that argument being kept in sync.
+        positions = torch.arange(0, seq_length, device=x.device).expand(N, seq_length)
+        out = self.dropout(
+            (self.word_embedding(x) + self.position_embedding(positions))
+        )
+
+        # In the Encoder the query, key, value are all the same, it's in the
+        # decoder this will change. This might look a bit odd in this case.
+        for layer in self.layers:
+            out = layer(out, out, out, mask)
+
+        return out
+
+
+class DecoderBlock(nn.Module):
+    """Self-attention over the target followed by a TransformerBlock.
+
+    The self-attention output (residual + norm + dropout) becomes the query of
+    the TransformerBlock, whose values and keys are supplied by the caller.
+    The ``device`` argument is accepted but not used by this class.
+    """
+
+    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
+        super().__init__()
+        self.norm = nn.LayerNorm(embed_size)
+        self.attention = SelfAttention(embed_size, heads=heads)
+        self.transformer_block = TransformerBlock(
+            embed_size,
+            heads,
+            dropout,
+            forward_expansion,
+        )
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x, value, key, src_mask, trg_mask):
+        """Apply ``trg_mask`` in the self-attention and ``src_mask`` in the block."""
+        self_attended = self.attention(x, x, x, trg_mask)
+        query = self.dropout(self.norm(self_attended + x))
+        return self.transformer_block(value, key, query, src_mask)
+
+
+class Decoder(nn.Module):
+    """Token + learned position embeddings, a stack of DecoderBlocks and a vocab projection.
+
+    Args:
+        trg_vocab_size: number of rows in the word embedding table and width of
+            the output projection.
+        embed_size: embedding width used throughout the stack.
+        num_layers: number of DecoderBlock layers.
+        heads: number of attention heads per layer.
+        forward_expansion: width multiplier of each block's feed-forward network.
+        dropout: dropout probability.
+        device: stored on the instance and forwarded to each DecoderBlock for
+            backward compatibility; the forward pass takes the device from its
+            input instead.
+        max_length: number of rows in the position embedding table, i.e. the
+            longest sequence that can be embedded.
+    """
+
+    def __init__(
+        self,
+        trg_vocab_size,
+        embed_size,
+        num_layers,
+        heads,
+        forward_expansion,
+        dropout,
+        device,
+        max_length,
+    ):
+        super().__init__()
+        self.device = device
+        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
+        self.position_embedding = nn.Embedding(max_length, embed_size)
+
+        self.layers = nn.ModuleList(
+            [
+                DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
+                for _ in range(num_layers)
+            ]
+        )
+        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x, enc_out, src_mask, trg_mask):
+        """Decode target indices ``x`` of shape ``(N, seq_length)``.
+
+        ``enc_out`` is used as both value and key in every layer. Returns
+        unnormalized scores of shape ``(N, seq_length, trg_vocab_size)``.
+        """
+        N, seq_length = x.shape
+
+        # Build the position indices directly on the input's device. Relying on
+        # the device given at construction time breaks as soon as the module is
+        # moved with .to(...) without that argument being kept in sync.
+        positions = torch.arange(0, seq_length, device=x.device).expand(N, seq_length)
+        x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))
+
+        for layer in self.layers:
+            x = layer(x, enc_out, enc_out, src_mask, trg_mask)
+
+        out = self.fc_out(x)
+
+        return out
+
+
+class Transformer(nn.Module):
+    """Encoder-decoder model wiring together ``Encoder`` and ``Decoder``.
+
+    Args:
+        src_vocab_size: source vocabulary size.
+        trg_vocab_size: target vocabulary size (width of the output scores).
+        src_pad_idx: source index treated as padding when building the source mask.
+        trg_pad_idx: stored on the instance; not read by the code in this class.
+        embed_size: embedding width.
+        num_layers: number of layers in both the encoder and the decoder.
+        forward_expansion: width multiplier of the feed-forward networks.
+        heads: number of attention heads.
+        dropout: dropout probability.
+        device: stored and forwarded to the sub-modules for backward
+            compatibility; the masks are built on the device of the input.
+        max_length: longest sequence the position embeddings can represent.
+    """
+
+    def __init__(
+        self,
+        src_vocab_size,
+        trg_vocab_size,
+        src_pad_idx,
+        trg_pad_idx,
+        embed_size=512,
+        num_layers=6,
+        forward_expansion=4,
+        heads=8,
+        dropout=0,
+        device="cpu",
+        max_length=100,
+    ):
+
+        super().__init__()
+
+        self.encoder = Encoder(
+            src_vocab_size,
+            embed_size,
+            num_layers,
+            heads,
+            device,
+            forward_expansion,
+            dropout,
+            max_length,
+        )
+
+        self.decoder = Decoder(
+            trg_vocab_size,
+            embed_size,
+            num_layers,
+            heads,
+            forward_expansion,
+            dropout,
+            device,
+            max_length,
+        )
+
+        self.src_pad_idx = src_pad_idx
+        self.trg_pad_idx = trg_pad_idx
+        self.device = device
+
+    def make_src_mask(self, src):
+        """Return a boolean ``(N, 1, 1, src_len)`` mask that is False at padding.
+
+        The comparison result already lives on ``src``'s device, so no transfer
+        to a separately stored device is needed.
+        """
+        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
+        # (N, 1, 1, src_len)
+        return src_mask
+
+    def make_trg_mask(self, trg):
+        """Return a lower-triangular ``(N, 1, trg_len, trg_len)`` mask of ones.
+
+        Row ``i`` is non-zero at columns ``0..i``. The mask is allocated on
+        ``trg``'s device so it always matches the tensors it is applied to.
+        """
+        N, trg_len = trg.shape
+        trg_mask = torch.tril(
+            torch.ones((trg_len, trg_len), device=trg.device)
+        ).expand(N, 1, trg_len, trg_len)
+
+        return trg_mask
+
+    def forward(self, src, trg):
+        """Encode ``src``, decode ``trg`` against it and return the decoder scores."""
+        src_mask = self.make_src_mask(src)
+        trg_mask = self.make_trg_mask(trg)
+        enc_src = self.encoder(src, src_mask)
+        out = self.decoder(trg, enc_src, src_mask, trg_mask)
+        return out
+
+
+if __name__ == "__main__":
+    # Smoke test: push one tiny batch through the model and print the output shape.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(device)
+
+    source_rows = [[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]
+    target_rows = [[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]
+    x = torch.tensor(source_rows).to(device)
+    trg = torch.tensor(target_rows).to(device)
+
+    src_pad_idx = 0
+    trg_pad_idx = 0
+    src_vocab_size = 10
+    trg_vocab_size = 10
+    model = Transformer(
+        src_vocab_size,
+        trg_vocab_size,
+        src_pad_idx,
+        trg_pad_idx,
+        device=device,
+    ).to(device)
+
+    # The decoder is fed the target without its last token.
+    out = model(x, trg[:, :-1])
+    print(out.shape)