Initial commit

Aladdin Persson
2021-01-30 21:49:15 +01:00
commit 65b8c80495
432 changed files with 1290844 additions and 0 deletions

(19 binary image files added; previews not shown. Sizes range from 13 KiB to 53 KiB.)

@@ -0,0 +1,31 @@
import cv2
import albumentations as A
import numpy as np
from utils import plot_examples
from PIL import Image
image = Image.open("images/elon.jpeg")
transform = A.Compose(
[
A.Resize(width=1920, height=1080),
A.RandomCrop(width=1280, height=720),
A.Rotate(limit=40, p=0.9, border_mode=cv2.BORDER_CONSTANT),
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.1),
A.RGBShift(r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.9),
A.OneOf([
A.Blur(blur_limit=3, p=0.5),
A.ColorJitter(p=0.5),
], p=1.0),
]
)
images_list = [image]
image = np.array(image)
for i in range(15):
augmentations = transform(image=image)
augmented_img = augmentations["image"]
images_list.append(augmented_img)
plot_examples(images_list)

@@ -0,0 +1,41 @@
import cv2
import albumentations as A
import numpy as np
from utils import plot_examples
from PIL import Image
image = cv2.imread("images/cat.jpg")
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
bboxes = [[13, 170, 224, 410]]
# Box format here is pascal_voc (x_min, y_min, x_max, y_max); yolo and coco are also supported
transform = A.Compose(
[
A.Resize(width=1920, height=1080),
A.RandomCrop(width=1280, height=720),
A.Rotate(limit=40, p=0.9, border_mode=cv2.BORDER_CONSTANT),
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.1),
A.RGBShift(r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.9),
A.OneOf([
A.Blur(blur_limit=3, p=0.5),
A.ColorJitter(p=0.5),
], p=1.0),
], bbox_params=A.BboxParams(format="pascal_voc", min_area=2048,
min_visibility=0.3, label_fields=[])
)
images_list = [image]
saved_bboxes = [bboxes[0]]
for i in range(15):
augmentations = transform(image=image, bboxes=bboxes)
augmented_img = augmentations["image"]
if len(augmentations["bboxes"]) == 0:
continue
images_list.append(augmented_img)
saved_bboxes.append(augmentations["bboxes"][0])
plot_examples(images_list, saved_bboxes)
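As a hedged aside (not part of this commit): the comment above notes that yolo and coco box formats are also supported. A hypothetical helper for converting the pascal_voc box to yolo coordinates, which are normalized by the image size, might look like this:

def pascal_voc_to_yolo(box, img_w, img_h):
    # yolo boxes are (x_center, y_center, width, height), each normalized to [0, 1]
    x_min, y_min, x_max, y_max = box
    return [
        (x_min + x_max) / 2 / img_w,
        (y_min + y_max) / 2 / img_h,
        (x_max - x_min) / img_w,
        (y_max - y_min) / img_h,
    ]

# With boxes in that layout, the pipeline would pass format="yolo" to A.BboxParams instead.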

@@ -0,0 +1,62 @@
import torch
import numpy as np
import cv2
from PIL import Image
import torch.nn as nn
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torch.utils.data import Dataset
import os
class ImageFolder(Dataset):
def __init__(self, root_dir, transform=None):
super(ImageFolder, self).__init__()
self.data = []
self.root_dir = root_dir
self.transform = transform
self.class_names = os.listdir(root_dir)
for index, name in enumerate(self.class_names):
files = os.listdir(os.path.join(root_dir, name))
self.data += list(zip(files, [index]*len(files)))
def __len__(self):
return len(self.data)
def __getitem__(self, index):
img_file, label = self.data[index]
root_and_dir = os.path.join(self.root_dir, self.class_names[label])
image = np.array(Image.open(os.path.join(root_and_dir, img_file)))
if self.transform is not None:
augmentations = self.transform(image=image)
image = augmentations["image"]
return image, label
transform = A.Compose(
[
A.Resize(width=1920, height=1080),
A.RandomCrop(width=1280, height=720),
A.Rotate(limit=40, p=0.9, border_mode=cv2.BORDER_CONSTANT),
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.1),
A.RGBShift(r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.9),
A.OneOf([
A.Blur(blur_limit=3, p=0.5),
A.ColorJitter(p=0.5),
], p=1.0),
A.Normalize(
mean=[0, 0, 0],
std=[1, 1, 1],
max_pixel_value=255,
),
ToTensorV2(),
]
)
dataset = ImageFolder(root_dir="cat_dogs", transform=transform)
for x,y in dataset:
print(x.shape)
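A short usage sketch (not part of this commit, assuming the same cat_dogs folder layout): the dataset can be wrapped in a standard DataLoader for batched training.

from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=8, shuffle=True)
for x, y in loader:
    print(x.shape)  # (8, 3, 720, 1280): RandomCrop gives 720x1280, ToTensorV2 makes it CxHxW
    break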

(4 binary image files added; previews not shown. Sizes range from 29 KiB to 274 KiB.)

@@ -0,0 +1,37 @@
import cv2
import albumentations as A
import numpy as np
from utils import plot_examples
from PIL import Image
image = Image.open("images/elon.jpeg")
mask = Image.open("images/mask.jpeg")
mask2 = Image.open("images/second_mask.jpeg")
transform = A.Compose(
[
A.Resize(width=1920, height=1080),
A.RandomCrop(width=1280, height=720),
A.Rotate(limit=40, p=0.9, border_mode=cv2.BORDER_CONSTANT),
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.1),
A.RGBShift(r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.9),
A.OneOf([
A.Blur(blur_limit=3, p=0.5),
A.ColorJitter(p=0.5),
], p=1.0),
]
)
images_list = [image]
image = np.array(image)
mask = np.array(mask)  # np.asarray(mask) would also work here
mask2 = np.array(mask2)
for i in range(4):
augmentations = transform(image=image, masks=[mask, mask2])
augmented_img = augmentations["image"]
augmented_masks = augmentations["masks"]
images_list.append(augmented_img)
images_list.append(augmented_masks[0])
images_list.append(augmented_masks[1])
plot_examples(images_list)

@@ -0,0 +1,36 @@
import random
import cv2
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import numpy as np
import albumentations as A
def visualize(image):
plt.figure(figsize=(10, 10))
plt.axis('off')
plt.imshow(image)
plt.show()
def plot_examples(images, bboxes=None):
fig = plt.figure(figsize=(15, 15))
columns = 4
rows = 5
for i in range(1, len(images)):
if bboxes is not None:
img = visualize_bbox(images[i - 1], bboxes[i - 1], class_name="Elon")
else:
img = images[i-1]
fig.add_subplot(rows, columns, i)
plt.imshow(img)
plt.show()
# From https://albumentations.ai/docs/examples/example_bboxes/
def visualize_bbox(img, bbox, class_name, color=(255, 0, 0), thickness=5):
"""Visualizes a single bounding box on the image"""
x_min, y_min, x_max, y_max = map(int, bbox)
cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, thickness)
return img

@@ -0,0 +1,11 @@
Animal,Label
cat.0.jpg,0
cat.1.jpg,0
cat.2.jpg,0
cat.3.jpg,0
cat.4.jpg,0
cat.5.jpg,0
cat.6.jpg,0
cat.7.jpg,0
dog.0.jpg,1
dog.1.jpg,1

(10 binary image files added; previews not shown. Sizes range from 13 KiB to 29 KiB.)

@@ -0,0 +1,131 @@
# Imports
import os
from typing import Union
import torch.nn.functional as F # All functions that don't have any parameters
import pandas as pd
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torchvision
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# from skimage import io  # not needed here: the features are read directly from the CSV via pandas
from torch.utils.data import (
Dataset,
DataLoader,
) # Gives easier dataset management and creates mini batches
# Create Fully Connected Network
class NN(nn.Module):
def __init__(self, input_size, num_classes):
super(NN, self).__init__()
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, num_classes)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
class SoloDataset(Dataset):
def __init__(self, csv_file, root_dir, transform=None):
self.annotations = pd.read_csv(csv_file)
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.annotations)
def __getitem__(self, index):
x_data = self.annotations.iloc[index, 0:11]
x_data = torch.tensor(x_data)
y_label = torch.tensor(int(self.annotations.iloc[index, 11]))
return (x_data.float(), y_label)
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
num_classes = 26
learning_rate = 1e-3
batch_size = 5
num_epochs = 30
input_size = 11
# Load Data
dataset = SoloDataset(
csv_file="power.csv", root_dir="test123", transform=transforms.ToTensor()
)
train_set, test_set = torch.utils.data.random_split(dataset, [2900, 57])
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=True)
# Model
model = NN(input_size=input_size, num_classes=num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
print(len(train_set))
print(len(test_set))
# Train Network
for epoch in range(num_epochs):
losses = []
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
losses.append(loss.item())
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
print(f"Cost at epoch {epoch} is {sum(losses) / len(losses)}")
# Check accuracy on training to see how good our model is
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct) / float(num_samples) * 100:.2f}"
)
model.train()
print("Checking accuracy on Training Set")
check_accuracy(train_loader, model)
print("Checking accuracy on Test Set")
check_accuracy(test_loader, model)

@@ -0,0 +1,130 @@
"""
Example of how to create a custom dataset in PyTorch. In this case
we have images of cats and dogs in a separate folder and a csv
file containing the name of the jpg file as well as the target
label (0 for cat, 1 for dog).
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-03 Initial coding
"""
# Imports
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torchvision.transforms as transforms # Transformations we can perform on our dataset
import torchvision
import os
import pandas as pd
from skimage import io
from torch.utils.data import (
Dataset,
DataLoader,
) # Gives easier dataset management and creates mini batches
class CatsAndDogsDataset(Dataset):
def __init__(self, csv_file, root_dir, transform=None):
self.annotations = pd.read_csv(csv_file)
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.annotations)
def __getitem__(self, index):
img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
image = io.imread(img_path)
y_label = torch.tensor(int(self.annotations.iloc[index, 1]))
if self.transform:
image = self.transform(image)
return (image, y_label)
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
in_channel = 3
num_classes = 2
learning_rate = 1e-3
batch_size = 32
num_epochs = 10
# Load Data
dataset = CatsAndDogsDataset(
csv_file="cats_dogs.csv",
root_dir="cats_dogs_resized",
transform=transforms.ToTensor(),
)
# Dataset is actually a lot larger ~25k images, just took out 10 pictures
# to upload to Github. It's enough to understand the structure and scale
# if you got more images.
train_set, test_set = torch.utils.data.random_split(dataset, [5, 5])
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=True)
# Model
model = torchvision.models.googlenet(pretrained=True)
model.to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
losses = []
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
losses.append(loss.item())
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
print(f"Cost at epoch {epoch} is {sum(losses)/len(losses)}")
# Check accuracy on training to see how good our model is
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
print("Checking accuracy on Training Set")
check_accuracy(train_loader, model)
print("Checking accuracy on Test Set")
check_accuracy(test_loader, model)

File diff suppressed because it is too large.

@@ -0,0 +1,142 @@
import os # when loading file paths
import pandas as pd # for lookup in annotation file
import spacy # for tokenizer
import torch
from torch.nn.utils.rnn import pad_sequence # pad batch
from torch.utils.data import DataLoader, Dataset
from PIL import Image # Load img
import torchvision.transforms as transforms
# We want to convert text -> numerical values
# 1. We need a Vocabulary mapping each word to an index
# 2. We need to setup a Pytorch dataset to load the data
# 3. Set up padding of every batch (all examples should be
#    of the same seq_len) and set up the dataloader
# Note that loading the image is very easy compared to the text!
# Download with: python -m spacy download en
spacy_eng = spacy.load("en")
class Vocabulary:
def __init__(self, freq_threshold):
self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
self.freq_threshold = freq_threshold
def __len__(self):
return len(self.itos)
@staticmethod
def tokenizer_eng(text):
return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
def build_vocabulary(self, sentence_list):
frequencies = {}
idx = 4
for sentence in sentence_list:
for word in self.tokenizer_eng(sentence):
if word not in frequencies:
frequencies[word] = 1
else:
frequencies[word] += 1
if frequencies[word] == self.freq_threshold:
self.stoi[word] = idx
self.itos[idx] = word
idx += 1
def numericalize(self, text):
tokenized_text = self.tokenizer_eng(text)
return [
self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
for token in tokenized_text
]
class FlickrDataset(Dataset):
def __init__(self, root_dir, captions_file, transform=None, freq_threshold=5):
self.root_dir = root_dir
self.df = pd.read_csv(captions_file)
self.transform = transform
# Get img, caption columns
self.imgs = self.df["image"]
self.captions = self.df["caption"]
# Initialize vocabulary and build vocab
self.vocab = Vocabulary(freq_threshold)
self.vocab.build_vocabulary(self.captions.tolist())
def __len__(self):
return len(self.df)
def __getitem__(self, index):
caption = self.captions[index]
img_id = self.imgs[index]
img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")
if self.transform is not None:
img = self.transform(img)
numericalized_caption = [self.vocab.stoi["<SOS>"]]
numericalized_caption += self.vocab.numericalize(caption)
numericalized_caption.append(self.vocab.stoi["<EOS>"])
return img, torch.tensor(numericalized_caption)
class MyCollate:
def __init__(self, pad_idx):
self.pad_idx = pad_idx
def __call__(self, batch):
imgs = [item[0].unsqueeze(0) for item in batch]
imgs = torch.cat(imgs, dim=0)
targets = [item[1] for item in batch]
targets = pad_sequence(targets, batch_first=False, padding_value=self.pad_idx)
return imgs, targets
def get_loader(
root_folder,
annotation_file,
transform,
batch_size=32,
num_workers=8,
shuffle=True,
pin_memory=True,
):
dataset = FlickrDataset(root_folder, annotation_file, transform=transform)
pad_idx = dataset.vocab.stoi["<PAD>"]
loader = DataLoader(
dataset=dataset,
batch_size=batch_size,
num_workers=num_workers,
shuffle=shuffle,
pin_memory=pin_memory,
collate_fn=MyCollate(pad_idx=pad_idx),
)
return loader, dataset
if __name__ == "__main__":
transform = transforms.Compose(
[transforms.Resize((224, 224)), transforms.ToTensor(),]
)
loader, dataset = get_loader(
"flickr8k/images/", "flickr8k/captions.txt", transform=transform
)
for idx, (imgs, captions) in enumerate(loader):
print(imgs.shape)
print(captions.shape)
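A minimal sketch (not in this commit) of what the pad_sequence call inside MyCollate does: captions of different lengths in one batch are stacked into a single (max_len, batch_size) tensor, with shorter captions filled by the <PAD> index.

import torch
from torch.nn.utils.rnn import pad_sequence

a = torch.tensor([1, 5, 6, 2])  # <SOS> w1 w2 <EOS>
b = torch.tensor([1, 7, 2])     # <SOS> w1 <EOS>
padded = pad_sequence([a, b], batch_first=False, padding_value=0)
print(padded.shape)  # torch.Size([4, 2]); the second column is padded with <PAD> = 0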

@@ -0,0 +1,125 @@
"""
Example code of a simple bidirectional LSTM on the MNIST dataset.
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-05-09 Initial coding
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
input_size = 28
sequence_length = 28
num_layers = 2
hidden_size = 256
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 2
# Create a bidirectional LSTM
class BRNN(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(BRNN, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.lstm = nn.LSTM(
input_size, hidden_size, num_layers, batch_first=True, bidirectional=True
)
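        # bidirectional=True concatenates the forward and backward hidden states,
        # which is why the linear layer below takes hidden_size * 2 input features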
self.fc = nn.Linear(hidden_size * 2, num_classes)
def forward(self, x):
h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
out, _ = self.lstm(x, (h0, c0))
out = self.fc(out[:, -1, :])
return out
# Load Data
train_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
test_dataset = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
# Initialize network
model = BRNN(input_size, hidden_size, num_layers, num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device).squeeze(1)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
if loader.dataset.train:
print("Checking accuracy on training data")
else:
print("Checking accuracy on test data")
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device).squeeze(1)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy \
{float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
check_accuracy(train_loader, model)
check_accuracy(test_loader, model)

@@ -0,0 +1,69 @@
"""
Example code of how to initialize weights for a simple CNN network.
Video explanation: https://youtu.be/xWQ-p_o0Uik
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-10 Initial coding
"""
# Imports
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.nn.functional as F # All functions that don't have any parameters
class CNN(nn.Module):
def __init__(self, in_channels, num_classes):
super(CNN, self).__init__()
self.conv1 = nn.Conv2d(
in_channels=in_channels,
out_channels=6,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
)
self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
self.conv2 = nn.Conv2d(
in_channels=6,
out_channels=16,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
)
self.fc1 = nn.Linear(16 * 7 * 7, num_classes)
self.initialize_weights()
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(x)
x = F.relu(self.conv2(x))
x = self.pool(x)
x = x.reshape(x.shape[0], -1)
x = self.fc1(x)
return x
def initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_uniform_(m.weight)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
nn.init.kaiming_uniform_(m.weight)
nn.init.constant_(m.bias, 0)
if __name__ == "__main__":
model = CNN(in_channels=3, num_classes=10)
for param in model.parameters():
print(param)
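An equivalent pattern (a sketch using the same Kaiming scheme, not part of this commit) applies a standalone function to every submodule via nn.Module.apply:

def init_weights(m):
    # same Kaiming-uniform scheme as initialize_weights above
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_uniform_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

model = CNN(in_channels=3, num_classes=10)
model.apply(init_weights)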

@@ -0,0 +1,54 @@
"""
Small code example of how to save and load a checkpoint of a model.
This example doesn't perform any training, so it would be quite useless.
In practice you would save the model as you train, and then load it before
continuing training at another point.
Video explanation of code & how to save and load model: https://youtu.be/g6kQl_EFn84
Got any questions leave a comment on youtube :)
Coded by Aladdin Persson <aladdin dot person at hotmail dot com>
- 2020-04-07 Initial programming
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, model, optimizer):
print("=> Loading checkpoint")
model.load_state_dict(checkpoint["state_dict"])
optimizer.load_state_dict(checkpoint["optimizer"])
def main():
# Initialize network
model = torchvision.models.vgg16(pretrained=False)
optimizer = optim.Adam(model.parameters())
checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
# Try save checkpoint
save_checkpoint(checkpoint)
# Try load checkpoint
load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
if __name__ == "__main__":
main()
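As the docstring notes, the checkpoint would normally be written during training. A hedged sketch (assuming the model and optimizer created in main(); the training step itself is elided):

for epoch in range(10):
    # ... forward pass, loss.backward() and optimizer.step() would run here ...
    if epoch % 3 == 0:  # save every third epoch, an arbitrary choice
        checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
        save_checkpoint(checkpoint)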

@@ -0,0 +1,107 @@
"""
Example code of how to use a learning rate scheduler, in this
case with a (very) small and simple feedforward network training on the MNIST
dataset. In this case the ReduceLROnPlateau scheduler is used, but it can
easily be changed to any of the other schedulers available.
Video explanation: https://youtu.be/P31hB37g4Ak
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-10 Initial programming
"""
# Imports
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
num_classes = 10
learning_rate = 0.1
batch_size = 128
num_epochs = 100
# Define a very simple model
model = nn.Sequential(nn.Linear(784, 50), nn.ReLU(), nn.Linear(50, 10)).to(device)
# Load Data
train_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Define Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
optimizer, factor=0.1, patience=5, verbose=True
)
# Train Network
for epoch in range(1, num_epochs):
losses = []
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.reshape(data.shape[0], -1)
data = data.to(device=device)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
losses.append(loss.item())
# backward
loss.backward()
# gradient descent or adam step
# scheduler.step(loss)
optimizer.step()
optimizer.zero_grad()
mean_loss = sum(losses) / len(losses)
# After each epoch do scheduler.step, note in this scheduler we need to send
# in loss for that epoch!
scheduler.step(mean_loss)
print(f"Cost at epoch {epoch} is {mean_loss}")
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
check_accuracy(train_loader, model)
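As the docstring says, the scheduler can easily be swapped. A hedged sketch (not part of this commit) using StepLR, which decays the learning rate by gamma every step_size epochs and whose step() takes no metric:

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
for epoch in range(1, num_epochs):
    # ... same inner training loop as above ...
    scheduler.step()  # called once per epoch, no loss argument needed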

@@ -0,0 +1,99 @@
# Imports
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import DataLoader # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Simple CNN
class CNN(nn.Module):
def __init__(self, in_channels=1, num_classes=10):
super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=420, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
self.conv2 = nn.Conv2d(in_channels=420, out_channels=1000, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
self.fc1 = nn.Linear(1000 * 7 * 7, num_classes)
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(x)
x = F.relu(self.conv2(x))
x = self.pool(x)
x = x.reshape(x.shape[0], -1)
x = self.fc1(x)
return x
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Hyperparameters
in_channel = 1
num_classes = 10
learning_rate = 0.001
batch_size = 100
num_epochs = 5
# Load Data
train_dataset = datasets.MNIST(root='dataset/', train=True, transform=transforms.ToTensor(), download=True)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = datasets.MNIST(root='dataset/', train=False, transform=transforms.ToTensor(), download=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
# Initialize network
model = CNN().to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Necessary for FP16
scaler = torch.cuda.amp.GradScaler()
# Train Network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# forward
with torch.cuda.amp.autocast():
scores = model(data)
loss = criterion(scores, targets)
# backward
optimizer.zero_grad()
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(f'Got {num_correct} / {num_samples} with accuracy {float(num_correct) / float(num_samples) * 100:.2f}')
model.train()
check_accuracy(train_loader, model)
check_accuracy(test_loader, model)
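A small hedged addition (not in this commit): autocast can also wrap evaluation on its own; the GradScaler is only needed when a scaled loss is backpropagated.

model.eval()
with torch.no_grad(), torch.cuda.amp.autocast():
    for x, y in test_loader:
        x = x.to(device=device)
        preds = model(x).argmax(dim=1)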

@@ -0,0 +1,123 @@
"""
Shows a small example of how to load a pretrained model (VGG16) from torchvision,
and modifies it to train on the CIFAR10 dataset. The same method generalizes
well to other datasets, but the modifications to the network may need to change.
Video explanation: https://youtu.be/U4bHxEhMGNk
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-08 Initial coding
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
num_classes = 10
learning_rate = 1e-3
batch_size = 1024
num_epochs = 5
# Simple Identity class that lets the input pass through without changes
class Identity(nn.Module):
def __init__(self):
super(Identity, self).__init__()
def forward(self, x):
return x
# Load pretrained model & modify it
model = torchvision.models.vgg16(pretrained=True)
# If you want to do finetuning then set requires_grad = False on the backbone.
# Remove the two lines below if you want to train the entire model
# and only want to load the pretrained weights as initialization.
for param in model.parameters():
param.requires_grad = False
model.avgpool = Identity()
model.classifier = nn.Sequential(
nn.Linear(512, 100), nn.ReLU(), nn.Linear(100, num_classes)
)
model.to(device)
# Load Data
train_dataset = datasets.CIFAR10(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
losses = []
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
losses.append(loss.item())
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
print(f"Cost at epoch {epoch} is {sum(losses)/len(losses):.5f}")
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
if loader.dataset.train:
print("Checking accuracy on training data")
else:
print("Checking accuracy on test data")
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
check_accuracy(train_loader, model)
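A related hedged sketch (not part of this commit): since the backbone is frozen above, the optimizer only needs the parameters that still require gradients, i.e. the new classifier head.

optimizer = optim.Adam(
    (p for p in model.parameters() if p.requires_grad), lr=learning_rate
)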

@@ -0,0 +1,41 @@
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
# Create a simple toy dataset example; normally this
# would be a custom Dataset class with __getitem__ etc.,
# which we have done in the custom dataset tutorials
x = torch.randn((1000, 3, 224, 224))
y = torch.randint(low=0, high=10, size=(1000, 1))
ds = TensorDataset(x, y)
loader = DataLoader(ds, batch_size=8)
model = nn.Sequential(
nn.Conv2d(3, 10, kernel_size=3, padding=1, stride=1),
nn.Flatten(),
nn.Linear(10*224*224, 10),
)
NUM_EPOCHS = 100
for epoch in range(NUM_EPOCHS):
loop = tqdm(loader)
for idx, (x, y) in enumerate(loop):
scores = model(x)
# here we would compute loss, backward, optimizer step etc.
# you know how it goes, but now you have a nice progress bar
# with tqdm
# then at the bottom, if you want additional info shown, you can
# add it here; loss and accuracy you would obviously compute,
# but for now we just set them to random values
loop.set_description(f"Epoch [{epoch}/{NUM_EPOCHS}]")
loop.set_postfix(loss=torch.rand(1).item(), acc=torch.rand(1).item())
# There you go. Hope it was useful :)

@@ -0,0 +1,172 @@
"""
Example code of a simple RNN, GRU, LSTM on the MNIST dataset.
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-05-09 Initial coding
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
input_size = 28
hidden_size = 256
num_layers = 2
num_classes = 10
sequence_length = 28
learning_rate = 0.005
batch_size = 64
num_epochs = 2
# Recurrent neural network (many-to-one)
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(RNN, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
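        # every time step's hidden state is kept and flattened in forward(),
        # hence the linear layer takes hidden_size * sequence_length input features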
self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
def forward(self, x):
# Set initial hidden and cell states
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
# Forward propagate LSTM
out, _ = self.rnn(x, h0)
out = out.reshape(out.shape[0], -1)
# Decode the hidden state of the last time step
out = self.fc(out)
return out
# Recurrent neural network with GRU (many-to-one)
class RNN_GRU(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(RNN_GRU, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
def forward(self, x):
# Set initial hidden and cell states
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
# Forward propagate LSTM
out, _ = self.gru(x, h0)
out = out.reshape(out.shape[0], -1)
# Decode the hidden state of the last time step
out = self.fc(out)
return out
# Recurrent neural network with LSTM (many-to-one)
class RNN_LSTM(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(RNN_LSTM, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
def forward(self, x):
# Set initial hidden and cell states
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
# Forward propagate LSTM
out, _ = self.lstm(
x, (h0, c0)
) # out: tensor of shape (batch_size, seq_length, hidden_size)
out = out.reshape(out.shape[0], -1)
# Decode the hidden state of the last time step
out = self.fc(out)
return out
# Load Data
train_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
test_dataset = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
# Initialize network
model = RNN_LSTM(input_size, hidden_size, num_layers, num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device).squeeze(1)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
if loader.dataset.train:
print("Checking accuracy on training data")
else:
print("Checking accuracy on test data")
num_correct = 0
num_samples = 0
# Set model to eval
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device).squeeze(1)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with \
accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
# Set model back to train
model.train()
check_accuracy(train_loader, model)
check_accuracy(test_loader, model)

@@ -0,0 +1,134 @@
"""
Example code of a simple CNN network training on MNIST dataset.
The code is intended to show how to create a CNN network as well
as how to initialize loss, optimizer, etc. in a simple way to get
training to work, with a function that checks accuracy as well.
Video explanation: https://youtu.be/wnK3uWv_WkU
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-08 Initial coding
"""
# Imports
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Simple CNN
class CNN(nn.Module):
def __init__(self, in_channels=1, num_classes=10):
super(CNN, self).__init__()
self.conv1 = nn.Conv2d(
            in_channels=in_channels,
out_channels=8,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
)
self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
self.conv2 = nn.Conv2d(
in_channels=8,
out_channels=16,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
)
self.fc1 = nn.Linear(16 * 7 * 7, num_classes)
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(x)
x = F.relu(self.conv2(x))
x = self.pool(x)
x = x.reshape(x.shape[0], -1)
x = self.fc1(x)
return x
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
in_channel = 1
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 5
# Load Data
train_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
# Initialize network
model = CNN().to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
if loader.dataset.train:
print("Checking accuracy on training data")
else:
print("Checking accuracy on test data")
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
check_accuracy(train_loader, model)
check_accuracy(test_loader, model)

@@ -0,0 +1,120 @@
"""
Working code of a simple Fully Connected (FC) network training on MNIST dataset.
The code is intended to show how to create a FC network as well
as how to initialize loss, optimizer, etc. in a simple way to get
training to work, with a function that checks accuracy as well.
Video explanation: https://youtu.be/Jy4wM2X21u0
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-08 Initial coding
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Create Fully Connected Network
class NN(nn.Module):
def __init__(self, input_size, num_classes):
super(NN, self).__init__()
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, num_classes)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
input_size = 784
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 1
# Load Data
train_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
# Initialize network
model = NN(input_size=input_size, num_classes=num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# Get to correct shape
data = data.reshape(data.shape[0], -1)
# forward
scores = model(data)
loss = criterion(scores, targets)
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
if loader.dataset.train:
print("Checking accuracy on training data")
else:
print("Checking accuracy on test data")
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
x = x.reshape(x.shape[0], -1)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
check_accuracy(train_loader, model)
check_accuracy(test_loader, model)

@@ -0,0 +1,28 @@
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_set = datasets.CIFAR10(root="ds/", transform=transforms.ToTensor(), download=True)
train_loader = DataLoader(dataset=train_set, batch_size=64, shuffle=True)
def get_mean_std(loader):
# var[X] = E[X**2] - E[X]**2
channels_sum, channels_sqrd_sum, num_batches = 0, 0, 0
for data, _ in tqdm(loader):
channels_sum += torch.mean(data, dim=[0, 2, 3])
channels_sqrd_sum += torch.mean(data ** 2, dim=[0, 2, 3])
num_batches += 1
mean = channels_sum / num_batches
std = (channels_sqrd_sum / num_batches - mean ** 2) ** 0.5
return mean, std
mean, std = get_mean_std(train_loader)
print(mean)
print(std)
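A hedged usage sketch (not part of this commit): the computed statistics would typically be plugged into transforms.Normalize for later training runs.

normalize_transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(mean=mean.tolist(), std=std.tolist())]
)
train_set = datasets.CIFAR10(root="ds/", transform=normalize_transform, download=True)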

@@ -0,0 +1,299 @@
"""
Walk through of a lot of different useful Tensor Operations, where we
go through what I think are four main parts in:
1. Initialization of a Tensor
2. Tensor Mathematical Operations and Comparison
3. Tensor Indexing
4. Tensor Reshaping
But also other things such as setting the device (GPU/CPU) and converting
between different types (int, float etc) and how to convert a tensor to a
numpy array and vice-versa.
"""
import torch
# ================================================================= #
# Initializing Tensor #
# ================================================================= #
device = "cuda" if torch.cuda.is_available() else "cpu" # Cuda to run on GPU!
# Initializing a Tensor in this case of shape 2x3 (2 rows, 3 columns)
my_tensor = torch.tensor(
[[1, 2, 3], [4, 5, 6]], dtype=torch.float32, device=device, requires_grad=True
)
# A few tensor attributes
print(
f"Information about tensor: {my_tensor}"
) # Prints data of the tensor, device and grad info
print(
"Type of Tensor {my_tensor.dtype}"
) # Prints dtype of the tensor (torch.float32, etc)
print(
f"Device Tensor is on {my_tensor.device}"
) # Prints cpu/cuda (followed by gpu number)
print(f"Shape of tensor {my_tensor.shape}") # Prints shape, in this case 2x3
print(f"Requires gradient: {my_tensor.requires_grad}") # Prints true/false
# Other common initialization methods (there exists a ton more)
x = torch.empty(size=(3, 3)) # Tensor of shape 3x3 with uninitialized data
x = torch.zeros((3, 3)) # Tensor of shape 3x3 with values of 0
x = torch.rand(
(3, 3)
) # Tensor of shape 3x3 with values from uniform distribution in interval [0,1)
x = torch.ones((3, 3)) # Tensor of shape 3x3 with values of 1
x = torch.eye(5, 5) # Returns identity matrix I, (I <-> Eye), matrix of shape 5x5
x = torch.arange(
start=0, end=5, step=1
) # Tensor [0, 1, 2, 3, 4], note, can also do: torch.arange(11)
x = torch.linspace(start=0.1, end=1, steps=10) # x = [0.1, 0.2, ..., 1]
x = torch.empty(size=(1, 5)).normal_(
mean=0, std=1
) # Normally distributed with mean=0, std=1
x = torch.empty(size=(1, 5)).uniform_(
0, 1
) # Values from a uniform distribution low=0, high=1
x = torch.diag(torch.ones(3)) # Diagonal matrix of shape 3x3
# How to make initialized tensors to other types (int, float, double)
# These will work even if you're on CPU or CUDA!
tensor = torch.arange(4) # [0, 1, 2, 3] Initialized as int64 by default
print(f"Converted Boolean: {tensor.bool()}") # Converted to Boolean: 1 if nonzero
print(f"Converted int16 {tensor.short()}") # Converted to int16
print(
f"Converted int64 {tensor.long()}"
) # Converted to int64 (This one is very important, used super often)
print(f"Converted float16 {tensor.half()}") # Converted to float16
print(
f"Converted float32 {tensor.float()}"
) # Converted to float32 (This one is very important, used super often)
print(f"Converted float64 {tensor.double()}") # Converted to float64
# Array to Tensor conversion and vice-versa
import numpy as np
np_array = np.zeros((5, 5))
tensor = torch.from_numpy(np_array)
np_array_again = (
tensor.numpy()
) # np_array_again will be same as np_array (perhaps with numerical round offs)
# =============================================================================== #
# Tensor Math & Comparison Operations #
# =============================================================================== #
x = torch.tensor([1, 2, 3])
y = torch.tensor([9, 8, 7])
# -- Addition --
z1 = torch.empty(3)
torch.add(x, y, out=z1) # This is one way
z2 = torch.add(x, y) # This is another way
z = x + y # This is my preferred way, simple and clean.
# -- Subtraction --
z = x - y # We can do similarly as the preferred way of addition
# -- Division (A bit clunky) --
z = torch.true_divide(x, y) # Will do element wise division if of equal shape
# -- Inplace Operations --
t = torch.zeros(3)
t.add_(x) # Whenever we have operation followed by _ it will mutate the tensor in place
t += x # Also inplace: t = t + x is not inplace, bit confusing.
# -- Exponentiation (Element wise if vector or matrices) --
z = x.pow(2) # z = [1, 4, 9]
z = x ** 2 # z = [1, 4, 9]
# -- Simple Comparison --
z = x > 0 # Returns [True, True, True]
z = x < 0 # Returns [False, False, False]
# -- Matrix Multiplication --
x1 = torch.rand((2, 5))
x2 = torch.rand((5, 3))
x3 = torch.mm(x1, x2) # Matrix multiplication of x1 and x2, out shape: 2x3
x3 = x1.mm(x2) # Similar as line above
# -- Matrix Exponentiation --
matrix_exp = torch.rand(5, 5)
print(
matrix_exp.matrix_power(3)
) # is same as matrix_exp (mm) matrix_exp (mm) matrix_exp
# -- Element wise Multiplication --
z = x * y # z = [9, 16, 21] = [1*9, 2*8, 3*7]
# -- Dot product --
z = torch.dot(x, y) # Dot product, in this case z = 1*9 + 2*8 + 3*7
# -- Batch Matrix Multiplication --
batch = 32
n = 10
m = 20
p = 30
tensor1 = torch.rand((batch, n, m))
tensor2 = torch.rand((batch, m, p))
out_bmm = torch.bmm(tensor1, tensor2) # Will be shape: (b x n x p)
# -- Example of broadcasting --
x1 = torch.rand((5, 5))
x2 = torch.ones((1, 5))
z = (
x1 - x2
) # Shape of z is 5x5: How? The 1x5 vector (x2) is subtracted for each row in the 5x5 (x1)
z = (
x1 ** x2
) # Shape of z is 5x5: How? Broadcasting! Element wise exponentiation for every row
# Other useful tensor operations
sum_x = torch.sum(
x, dim=0
) # Sum of x across dim=0 (which is the only dim in our case), sum_x = 6
values, indices = torch.max(x, dim=0) # Can also do x.max(dim=0)
values, indices = torch.min(x, dim=0) # Can also do x.min(dim=0)
abs_x = torch.abs(x) # Returns x where abs function has been applied to every element
z = torch.argmax(x, dim=0) # Gets index of the maximum value
z = torch.argmin(x, dim=0) # Gets index of the minimum value
mean_x = torch.mean(x.float(), dim=0) # mean requires x to be float
z = torch.eq(x, y) # Element wise comparison, in this case z = [False, False, False]
sorted_y, indices = torch.sort(y, dim=0, descending=False)
z = torch.clamp(x, min=0)
# All values < 0 set to 0 and values > 0 unchanged (this is exactly ReLU function)
# If you want to values over max_val to be clamped, do torch.clamp(x, min=min_val, max=max_val)
x = torch.tensor([1, 0, 1, 1, 1], dtype=torch.bool) # True/False values
z = torch.any(x) # will return True, can also do x.any() instead of torch.any(x)
z = torch.all(
x
) # will return False (since not all are True), can also do x.all() instead of torch.all()
# ============================================================= #
# Tensor Indexing #
# ============================================================= #
batch_size = 10
features = 25
x = torch.rand((batch_size, features))
# Get the first example's features
print(x[0].shape) # shape [25], this is same as doing x[0,:]
# Get the first feature for all examples
print(x[:, 0].shape) # shape [10]
# For example: Want to access third example in the batch and the first ten features
print(x[2, 0:10].shape) # shape: [10]
# For example we can use this to assign certain elements
x[0, 0] = 100
# Fancy Indexing
x = torch.arange(10)
indices = [2, 5, 8]
print(x[indices]) # x[indices] = [2, 5, 8]
x = torch.rand((3, 5))
rows = torch.tensor([1, 0])
cols = torch.tensor([4, 0])
print(x[rows, cols]) # Gets second row fifth column and first row first column
# More advanced indexing
x = torch.arange(10)
print(x[(x < 2) | (x > 8)]) # will be [0, 1, 9]
print(x[x.remainder(2) == 0]) # will be [0, 2, 4, 6, 8]
# Useful operations for indexing
print(
torch.where(x > 5, x, x * 2)
) # gives [0, 2, 4, 6, 8, 10, 6, 7, 8, 9], all values x > 5 yield x, else x*2
x = torch.tensor([0, 0, 1, 2, 2, 3, 4]).unique() # x = [0, 1, 2, 3, 4]
print(
x.ndimension()
) # The number of dimensions, in this case 1. if x.shape is 5x5x5 ndim would be 3
x = torch.arange(10)
print(
x.numel()
) # The number of elements in x (in this case it's trivial because it's just a vector)
# ============================================================= #
# Tensor Reshaping #
# ============================================================= #
x = torch.arange(9)
# Let's say we want to reshape it to be 3x3
x_3x3 = x.view(3, 3)
# We can also do (view and reshape are very similar)
# and the differences are in simple terms (I'm no expert at this),
# is that view acts on contiguous tensors meaning if the
# tensor is stored contiguously in memory or not, whereas
# for reshape it doesn't matter because it will copy the
# tensor to make it contiguously stored, which might come
# with some performance loss.
x_3x3 = x.reshape(3, 3)
# If we for example do:
y = x_3x3.t()
print(
y.is_contiguous()
) # This will return False and if we try to use view now, it won't work!
# y.view(9) would cause an error, reshape however won't
# This is because in memory it was stored [0, 1, 2, ... 8], whereas now it's [0, 3, 6, 1, 4, 7, 2, 5, 8]
# The jump is no longer 1 in memory for one element jump (matrices are stored as a contiguous block, and
# using pointers to construct these matrices). This is a bit complicated and I need to explore this more
# as well, at least you know it's a problem to be cautious of! A solution is to do the following
print(y.contiguous().view(9)) # Calling .contiguous() before view and it works
# Moving on to another operation, let's say we want to concatenate two tensors along a dimension
x1 = torch.rand(2, 5)
x2 = torch.rand(2, 5)
print(torch.cat((x1, x2), dim=0).shape) # Shape: 4x5
print(torch.cat((x1, x2), dim=1).shape) # Shape 2x10
# Let's say we want to unroll x1 into one long vector with 10 elements, we can do:
z = x1.view(-1) # And -1 will unroll everything
# If we instead have an additional dimension and we wish to keep those as is we can do:
batch = 64
x = torch.rand((batch, 2, 5))
z = x.view(
batch, -1
) # And z.shape would be 64x10, this is very useful stuff and is used all the time
# Let's say we want to switch x axis so that instead of 64x2x5 we have 64x5x2
# I.e we want dimension 0 to stay, dimension 1 to become dimension 2, dimension 2 to become dimension 1
# Basically you tell permute where you want the new dimensions to be, torch.transpose is a special case
# of permute (why?)
z = x.permute(0, 2, 1)
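# As a quick check of the relationship: transpose swaps exactly two dimensions,
# so it is a special case of permute where only those two dims are exchanged
print(torch.equal(x.transpose(1, 2), x.permute(0, 2, 1)))  # True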
# Splits x into 2 chunks along dim=1, which has size 2, so each chunk is 64x1x5
# (see the example below for chunking along the last dimension instead)
z = torch.chunk(x, chunks=2, dim=1)
print(z[0].shape)
print(z[1].shape)
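# For illustration: if we instead chunk along the last dimension (size 5, not evenly
# divisible by 2), the chunks are uneven
z = torch.chunk(x, chunks=2, dim=2)
print(z[0].shape)  # torch.Size([64, 2, 3])
print(z[1].shape)  # torch.Size([64, 2, 2])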
# Let's say we want to add an additional dimension
x = torch.arange(
10
) # Shape is [10], let's say we want to add an additional dimension so we have 1x10
print(x.unsqueeze(0).shape) # 1x10
print(x.unsqueeze(1).shape) # 10x1
# Let's say we have x which is 1x1x10 and we want to remove a dim so we have 1x10
x = torch.arange(10).unsqueeze(0).unsqueeze(1)
# Perhaps unsurprisingly
z = x.squeeze(1)  # can also do .squeeze(0), both return shape 1x10
# That was some essential Tensor operations, hopefully you found it useful!

View File

@@ -0,0 +1,142 @@
"""
Example code of how to use the TensorBoard in PyTorch.
This code uses a lot of different functions from TensorBoard
and tries to show them all in a compact way. It might not be
super clear exactly which call does what; for that I recommend
watching the YouTube video.
Video explanation: https://youtu.be/RLqsxWaQdHE
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-17 Initial coding
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
# Simple CNN
class CNN(nn.Module):
def __init__(self, in_channels=1, num_classes=10):
super(CNN, self).__init__()
self.conv1 = nn.Conv2d(
in_channels=in_channels, out_channels=8, kernel_size=3, stride=1, padding=1
)
self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
self.conv2 = nn.Conv2d(
in_channels=8, out_channels=16, kernel_size=3, stride=1, padding=1
)
self.fc1 = nn.Linear(16 * 7 * 7, num_classes)
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(x)
x = F.relu(self.conv2(x))
x = self.pool(x)
x = x.reshape(x.shape[0], -1)
x = self.fc1(x)
return x
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
in_channels = 1
num_classes = 10
num_epochs = 1
# Load Data
train_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
# To do hyperparameter search, include more batch_sizes you want to try
# and more learning rates!
batch_sizes = [256]
learning_rates = [0.001]
classes = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
for batch_size in batch_sizes:
for learning_rate in learning_rates:
step = 0
# Initialize network
model = CNN(in_channels=in_channels, num_classes=num_classes)
model.to(device)
model.train()
criterion = nn.CrossEntropyLoss()
train_loader = DataLoader(
dataset=train_dataset, batch_size=batch_size, shuffle=True
)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0)
writer = SummaryWriter(
f"runs/MNIST/MiniBatchSize {batch_size} LR {learning_rate}"
)
# Visualize model in TensorBoard
images, _ = next(iter(train_loader))
writer.add_graph(model, images.to(device))
writer.close()
for epoch in range(num_epochs):
losses = []
accuracies = []
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
losses.append(loss.item())
# backward
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Calculate 'running' training accuracy
features = data.reshape(data.shape[0], -1)
img_grid = torchvision.utils.make_grid(data)
_, predictions = scores.max(1)
num_correct = (predictions == targets).sum()
running_train_acc = float(num_correct) / float(data.shape[0])
accuracies.append(running_train_acc)
# Plot things to tensorboard
class_labels = [classes[label] for label in predictions]
writer.add_image("mnist_images", img_grid)
writer.add_histogram("fc1", model.fc1.weight)
writer.add_scalar("Training loss", loss, global_step=step)
writer.add_scalar(
"Training Accuracy", running_train_acc, global_step=step
)
if batch_idx == 230:
writer.add_embedding(
features,
metadata=class_labels,
label_img=data,
global_step=batch_idx,
)
step += 1
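        # add_hparams logs this (lr, batch size) combination together with the averaged
        # accuracy and loss, so runs can be compared in TensorBoard's HPARAMS tab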
writer.add_hparams(
{"lr": learning_rate, "bsize": batch_size},
{
"accuracy": sum(accuracies) / len(accuracies),
"loss": sum(losses) / len(losses),
},
)

View File

@@ -0,0 +1,155 @@
"""
Shows a small example of how to use transformations (perhaps unnecessarily many)
on CIFAR10 dataset and training on a small CNN toy network.
Video explanation: https://youtu.be/Zvd276j9sZ8
Got any questions leave a comment I'm pretty good at responding on youtube
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-09 Initial coding
"""
# Imports
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Simple CNN
class CNN(nn.Module):
def __init__(self, in_channels, num_classes):
super(CNN, self).__init__()
self.conv1 = nn.Conv2d(
in_channels=in_channels,
out_channels=8,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
)
self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
self.conv2 = nn.Conv2d(
in_channels=8,
out_channels=16,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
)
self.fc1 = nn.Linear(16 * 8 * 8, num_classes)
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(x)
x = F.relu(self.conv2(x))
x = self.pool(x)
x = x.reshape(x.shape[0], -1)
x = self.fc1(x)
return x
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
learning_rate = 1e-4
batch_size = 64
num_epochs = 5
# Initialize model & modify it (note: forward() above never uses self.classifier, so the line below has no effect)
model = CNN(in_channels=3, num_classes=10)
model.classifier = nn.Sequential(nn.Linear(512, 100), nn.ReLU(), nn.Linear(100, 10))
model.to(device)
# Load Data
my_transforms = transforms.Compose(
[ # Compose makes it possible to have many transforms
transforms.Resize((36, 36)), # Resizes (32,32) to (36,36)
transforms.RandomCrop((32, 32)), # Takes a random (32,32) crop
transforms.ColorJitter(brightness=0.5), # Change brightness of image
transforms.RandomRotation(
degrees=45
        ),  # Rotates by a random angle between -45 and 45 degrees
transforms.RandomHorizontalFlip(
p=0.5
), # Flips the image horizontally with probability 0.5
transforms.RandomVerticalFlip(
p=0.05
), # Flips image vertically with probability 0.05
transforms.RandomGrayscale(p=0.2), # Converts to grayscale with probability 0.2
transforms.ToTensor(), # Finally converts PIL image to tensor so we can train w. pytorch
transforms.Normalize(
mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]
), # Note: these values aren't optimal
]
)
train_dataset = datasets.CIFAR10(
root="dataset/", train=True, transform=my_transforms, download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
losses = []
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
losses.append(loss.item())
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
print(f"Cost at epoch {epoch} is {sum(losses)/len(losses):.5f}")
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
if loader.dataset.train:
print("Checking accuracy on training data")
else:
print("Checking accuracy on test data")
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
check_accuracy(train_loader, model)

View File

@@ -0,0 +1,15 @@
import random, torch, os, numpy as np
def seed_everything(seed=42):
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
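# Note: deterministic=True and benchmark=False trade some speed for reproducibility,
# and some CUDA ops may still be non-deterministic regardless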
seed_everything()
# Do training etc after running seed_everything

View File

@@ -0,0 +1,67 @@
"""
An implementation of LeNet CNN architecture.
Video explanation: https://youtu.be/fcOW-Zyb5Bo
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-05 Initial coding
"""
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
class LeNet(nn.Module):
def __init__(self):
super(LeNet, self).__init__()
self.relu = nn.ReLU()
self.pool = nn.AvgPool2d(kernel_size=(2, 2), stride=(2, 2))
self.conv1 = nn.Conv2d(
in_channels=1,
out_channels=6,
kernel_size=(5, 5),
stride=(1, 1),
padding=(0, 0),
)
self.conv2 = nn.Conv2d(
in_channels=6,
out_channels=16,
kernel_size=(5, 5),
stride=(1, 1),
padding=(0, 0),
)
self.conv3 = nn.Conv2d(
in_channels=16,
out_channels=120,
kernel_size=(5, 5),
stride=(1, 1),
padding=(0, 0),
)
self.linear1 = nn.Linear(120, 84)
self.linear2 = nn.Linear(84, 10)
def forward(self, x):
x = self.relu(self.conv1(x))
x = self.pool(x)
x = self.relu(self.conv2(x))
x = self.pool(x)
x = self.relu(
self.conv3(x)
) # num_examples x 120 x 1 x 1 --> num_examples x 120
x = x.reshape(x.shape[0], -1)
x = self.relu(self.linear1(x))
x = self.linear2(x)
return x
def test_lenet():
x = torch.randn(64, 1, 32, 32)
model = LeNet()
return model(x)
if __name__ == "__main__":
out = test_lenet()
print(out.shape)

View File

@@ -0,0 +1,166 @@
"""
An implementation of GoogLeNet / InceptionNet from scratch.
Video explanation: https://youtu.be/uQc4Fs7yx5I
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-07 Initial coding
"""
# Imports
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
class GoogLeNet(nn.Module):
def __init__(self, aux_logits=True, num_classes=1000):
super(GoogLeNet, self).__init__()
assert aux_logits == True or aux_logits == False
self.aux_logits = aux_logits
        # Write in_channels etc. explicitly in self.conv1; for the rest we write it
        # as compactly as possible, e.g. kernel_size=3 instead of (3,3)
self.conv1 = conv_block(
in_channels=3,
out_channels=64,
kernel_size=(7, 7),
stride=(2, 2),
padding=(3, 3),
)
self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.conv2 = conv_block(64, 192, kernel_size=3, stride=1, padding=1)
self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
# In this order: in_channels, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, out_1x1pool
self.inception3a = Inception_block(192, 64, 96, 128, 16, 32, 32)
self.inception3b = Inception_block(256, 128, 128, 192, 32, 96, 64)
self.maxpool3 = nn.MaxPool2d(kernel_size=(3, 3), stride=2, padding=1)
self.inception4a = Inception_block(480, 192, 96, 208, 16, 48, 64)
self.inception4b = Inception_block(512, 160, 112, 224, 24, 64, 64)
self.inception4c = Inception_block(512, 128, 128, 256, 24, 64, 64)
self.inception4d = Inception_block(512, 112, 144, 288, 32, 64, 64)
self.inception4e = Inception_block(528, 256, 160, 320, 32, 128, 128)
self.maxpool4 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.inception5a = Inception_block(832, 256, 160, 320, 32, 128, 128)
self.inception5b = Inception_block(832, 384, 192, 384, 48, 128, 128)
self.avgpool = nn.AvgPool2d(kernel_size=7, stride=1)
self.dropout = nn.Dropout(p=0.4)
        self.fc1 = nn.Linear(1024, num_classes)
if self.aux_logits:
self.aux1 = InceptionAux(512, num_classes)
self.aux2 = InceptionAux(528, num_classes)
else:
self.aux1 = self.aux2 = None
def forward(self, x):
x = self.conv1(x)
x = self.maxpool1(x)
x = self.conv2(x)
# x = self.conv3(x)
x = self.maxpool2(x)
x = self.inception3a(x)
x = self.inception3b(x)
x = self.maxpool3(x)
x = self.inception4a(x)
# Auxiliary Softmax classifier 1
if self.aux_logits and self.training:
aux1 = self.aux1(x)
x = self.inception4b(x)
x = self.inception4c(x)
x = self.inception4d(x)
# Auxiliary Softmax classifier 2
if self.aux_logits and self.training:
aux2 = self.aux2(x)
x = self.inception4e(x)
x = self.maxpool4(x)
x = self.inception5a(x)
x = self.inception5b(x)
x = self.avgpool(x)
x = x.reshape(x.shape[0], -1)
x = self.dropout(x)
x = self.fc1(x)
if self.aux_logits and self.training:
return aux1, aux2, x
else:
return x
class Inception_block(nn.Module):
def __init__(
self, in_channels, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, out_1x1pool
):
super(Inception_block, self).__init__()
self.branch1 = conv_block(in_channels, out_1x1, kernel_size=(1, 1))
self.branch2 = nn.Sequential(
conv_block(in_channels, red_3x3, kernel_size=(1, 1)),
conv_block(red_3x3, out_3x3, kernel_size=(3, 3), padding=(1, 1)),
)
self.branch3 = nn.Sequential(
conv_block(in_channels, red_5x5, kernel_size=(1, 1)),
conv_block(red_5x5, out_5x5, kernel_size=(5, 5), padding=(2, 2)),
)
self.branch4 = nn.Sequential(
nn.MaxPool2d(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
conv_block(in_channels, out_1x1pool, kernel_size=(1, 1)),
)
def forward(self, x):
return torch.cat(
[self.branch1(x), self.branch2(x), self.branch3(x), self.branch4(x)], 1
)
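    # Output channels after the concatenation: out_1x1 + out_3x3 + out_5x5 + out_1x1pool,
    # e.g. inception3a: 64 + 128 + 32 + 32 = 256, matching inception3b's in_channels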
class InceptionAux(nn.Module):
def __init__(self, in_channels, num_classes):
super(InceptionAux, self).__init__()
self.relu = nn.ReLU()
self.dropout = nn.Dropout(p=0.7)
self.pool = nn.AvgPool2d(kernel_size=5, stride=3)
self.conv = conv_block(in_channels, 128, kernel_size=1)
self.fc1 = nn.Linear(2048, 1024)
self.fc2 = nn.Linear(1024, num_classes)
def forward(self, x):
x = self.pool(x)
x = self.conv(x)
x = x.reshape(x.shape[0], -1)
x = self.relu(self.fc1(x))
x = self.dropout(x)
x = self.fc2(x)
return x
class conv_block(nn.Module):
def __init__(self, in_channels, out_channels, **kwargs):
super(conv_block, self).__init__()
self.relu = nn.ReLU()
self.conv = nn.Conv2d(in_channels, out_channels, **kwargs)
self.batchnorm = nn.BatchNorm2d(out_channels)
def forward(self, x):
return self.relu(self.batchnorm(self.conv(x)))
if __name__ == "__main__":
# N = 3 (Mini batch size)
x = torch.randn(3, 3, 224, 224)
model = GoogLeNet(aux_logits=True, num_classes=1000)
print(model(x)[2].shape)

View File

@@ -0,0 +1,163 @@
# -*- coding: utf-8 -*-
"""
From scratch implementation of the famous ResNet models.
The intuition for ResNet is simple and clear, but coding it
didn't feel super clear at first, even when reading PyTorch's own
implementation.
Video explanation:
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-12 Initial coding
"""
import torch
import torch.nn as nn
class block(nn.Module):
def __init__(
self, in_channels, intermediate_channels, identity_downsample=None, stride=1
):
super(block, self).__init__()
self.expansion = 4
self.conv1 = nn.Conv2d(
in_channels, intermediate_channels, kernel_size=1, stride=1, padding=0
)
self.bn1 = nn.BatchNorm2d(intermediate_channels)
self.conv2 = nn.Conv2d(
intermediate_channels,
intermediate_channels,
kernel_size=3,
stride=stride,
padding=1,
)
self.bn2 = nn.BatchNorm2d(intermediate_channels)
self.conv3 = nn.Conv2d(
intermediate_channels,
intermediate_channels * self.expansion,
kernel_size=1,
stride=1,
padding=0,
)
self.bn3 = nn.BatchNorm2d(intermediate_channels * self.expansion)
self.relu = nn.ReLU()
self.identity_downsample = identity_downsample
self.stride = stride
def forward(self, x):
identity = x.clone()
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = self.relu(x)
x = self.conv3(x)
x = self.bn3(x)
if self.identity_downsample is not None:
identity = self.identity_downsample(identity)
x += identity
x = self.relu(x)
return x
class ResNet(nn.Module):
def __init__(self, block, layers, image_channels, num_classes):
super(ResNet, self).__init__()
self.in_channels = 64
self.conv1 = nn.Conv2d(image_channels, 64, kernel_size=7, stride=2, padding=3)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU()
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Essentially the entire ResNet architecture is in these 4 lines below
self.layer1 = self._make_layer(
block, layers[0], intermediate_channels=64, stride=1
)
self.layer2 = self._make_layer(
block, layers[1], intermediate_channels=128, stride=2
)
self.layer3 = self._make_layer(
block, layers[2], intermediate_channels=256, stride=2
)
self.layer4 = self._make_layer(
block, layers[3], intermediate_channels=512, stride=2
)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * 4, num_classes)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.reshape(x.shape[0], -1)
x = self.fc(x)
return x
def _make_layer(self, block, num_residual_blocks, intermediate_channels, stride):
identity_downsample = None
layers = []
        # If we halve the spatial size, e.g. 56x56 -> 28x28 (stride=2), or the number of
        # channels changes, we need to adapt the identity (skip connection) so it can be
        # added to the output of the layers ahead
if stride != 1 or self.in_channels != intermediate_channels * 4:
identity_downsample = nn.Sequential(
nn.Conv2d(
self.in_channels,
intermediate_channels * 4,
kernel_size=1,
stride=stride,
),
nn.BatchNorm2d(intermediate_channels * 4),
)
layers.append(
block(self.in_channels, intermediate_channels, identity_downsample, stride)
)
# The expansion size is always 4 for ResNet 50,101,152
self.in_channels = intermediate_channels * 4
        # For example, in the first ResNet layer: 256 channels are mapped down to 64 in the
        # intermediate layers, then back up to 256. Hence no identity downsample is needed,
        # since stride = 1 and the number of channels stays the same.
for i in range(num_residual_blocks - 1):
layers.append(block(self.in_channels, intermediate_channels))
return nn.Sequential(*layers)
def ResNet50(img_channel=3, num_classes=1000):
return ResNet(block, [3, 4, 6, 3], img_channel, num_classes)
def ResNet101(img_channel=3, num_classes=1000):
return ResNet(block, [3, 4, 23, 3], img_channel, num_classes)
def ResNet152(img_channel=3, num_classes=1000):
return ResNet(block, [3, 8, 36, 3], img_channel, num_classes)
def test():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    net = ResNet101(img_channel=3, num_classes=1000).to(device)
    y = net(torch.randn(4, 3, 224, 224).to(device))
print(y.size())
test()

View File

@@ -0,0 +1,119 @@
"""
A from scratch implementation of the VGG architecture.
Video explanation: https://youtu.be/ACmuBbuXn20
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-05 Initial coding
"""
# Imports
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
VGG_types = {
"VGG11": [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
"VGG13": [64, 64, "M", 128, 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
"VGG16": [
64,
64,
"M",
128,
128,
"M",
256,
256,
256,
"M",
512,
512,
512,
"M",
512,
512,
512,
"M",
],
"VGG19": [
64,
64,
"M",
128,
128,
"M",
256,
256,
256,
256,
"M",
512,
512,
512,
512,
"M",
512,
512,
512,
512,
"M",
],
}
class VGG_net(nn.Module):
def __init__(self, in_channels=3, num_classes=1000):
super(VGG_net, self).__init__()
self.in_channels = in_channels
self.conv_layers = self.create_conv_layers(VGG_types["VGG16"])
self.fcs = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096),
nn.ReLU(),
nn.Dropout(p=0.5),
nn.Linear(4096, 4096),
nn.ReLU(),
nn.Dropout(p=0.5),
nn.Linear(4096, num_classes),
)
def forward(self, x):
x = self.conv_layers(x)
x = x.reshape(x.shape[0], -1)
x = self.fcs(x)
return x
def create_conv_layers(self, architecture):
layers = []
in_channels = self.in_channels
for x in architecture:
if type(x) == int:
out_channels = x
layers += [
nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
),
nn.BatchNorm2d(x),
nn.ReLU(),
]
in_channels = x
elif x == "M":
layers += [nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))]
return nn.Sequential(*layers)
if __name__ == "__main__":
device = "cuda" if torch.cuda.is_available() else "cpu"
model = VGG_net(in_channels=3, num_classes=1000).to(device)
print(model)
## N = 3 (Mini batch size)
# x = torch.randn(3, 3, 224, 224).to(device)
# print(model(x).shape)

View File

@@ -0,0 +1,107 @@
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
class Discriminator(nn.Module):
def __init__(self, in_features):
super().__init__()
self.disc = nn.Sequential(
nn.Linear(in_features, 128),
nn.LeakyReLU(0.01),
nn.Linear(128, 1),
nn.Sigmoid(),
)
def forward(self, x):
return self.disc(x)
class Generator(nn.Module):
def __init__(self, z_dim, img_dim):
super().__init__()
self.gen = nn.Sequential(
nn.Linear(z_dim, 256),
nn.LeakyReLU(0.01),
nn.Linear(256, img_dim),
            nn.Tanh(),  # inputs are normalized to [-1, 1], so make the outputs [-1, 1] as well
)
def forward(self, x):
return self.gen(x)
# Hyperparameters etc.
device = "cuda" if torch.cuda.is_available() else "cpu"
lr = 3e-4
z_dim = 64
image_dim = 28 * 28 * 1 # 784
batch_size = 32
num_epochs = 50
disc = Discriminator(image_dim).to(device)
gen = Generator(z_dim, image_dim).to(device)
fixed_noise = torch.randn((batch_size, z_dim)).to(device)
transforms = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,)),]
)
dataset = datasets.MNIST(root="dataset/", transform=transforms, download=True)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
opt_disc = optim.Adam(disc.parameters(), lr=lr)
opt_gen = optim.Adam(gen.parameters(), lr=lr)
criterion = nn.BCELoss()
writer_fake = SummaryWriter(f"logs/fake")
writer_real = SummaryWriter(f"logs/real")
step = 0
for epoch in range(num_epochs):
for batch_idx, (real, _) in enumerate(loader):
real = real.view(-1, 784).to(device)
batch_size = real.shape[0]
### Train Discriminator: max log(D(x)) + log(1 - D(G(z)))
noise = torch.randn(batch_size, z_dim).to(device)
fake = gen(noise)
disc_real = disc(real).view(-1)
lossD_real = criterion(disc_real, torch.ones_like(disc_real))
disc_fake = disc(fake).view(-1)
lossD_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
lossD = (lossD_real + lossD_fake) / 2
disc.zero_grad()
lossD.backward(retain_graph=True)
opt_disc.step()
### Train Generator: min log(1 - D(G(z))) <-> max log(D(G(z))
# where the second option of maximizing doesn't suffer from
# saturating gradients
output = disc(fake).view(-1)
lossG = criterion(output, torch.ones_like(output))
gen.zero_grad()
lossG.backward()
opt_gen.step()
if batch_idx == 0:
print(
f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(loader)} \
Loss D: {lossD:.4f}, loss G: {lossG:.4f}"
)
with torch.no_grad():
fake = gen(fixed_noise).reshape(-1, 1, 28, 28)
data = real.reshape(-1, 1, 28, 28)
img_grid_fake = torchvision.utils.make_grid(fake, normalize=True)
img_grid_real = torchvision.utils.make_grid(data, normalize=True)
writer_fake.add_image(
"Mnist Fake Images", img_grid_fake, global_step=step
)
writer_real.add_image(
"Mnist Real Images", img_grid_real, global_step=step
)
step += 1

View File

@@ -0,0 +1,96 @@
"""
Discriminator and Generator implementation from DCGAN paper
"""
import torch
import torch.nn as nn
class Discriminator(nn.Module):
def __init__(self, channels_img, features_d):
super(Discriminator, self).__init__()
self.disc = nn.Sequential(
# input: N x channels_img x 64 x 64
nn.Conv2d(
channels_img, features_d, kernel_size=4, stride=2, padding=1
),
nn.LeakyReLU(0.2),
# _block(in_channels, out_channels, kernel_size, stride, padding)
self._block(features_d, features_d * 2, 4, 2, 1),
self._block(features_d * 2, features_d * 4, 4, 2, 1),
self._block(features_d * 4, features_d * 8, 4, 2, 1),
# After all _block img output is 4x4 (Conv2d below makes into 1x1)
nn.Conv2d(features_d * 8, 1, kernel_size=4, stride=2, padding=0),
nn.Sigmoid(),
)
def _block(self, in_channels, out_channels, kernel_size, stride, padding):
return nn.Sequential(
nn.Conv2d(
in_channels,
out_channels,
kernel_size,
stride,
padding,
bias=False,
),
#nn.BatchNorm2d(out_channels),
nn.LeakyReLU(0.2),
)
def forward(self, x):
return self.disc(x)
class Generator(nn.Module):
def __init__(self, channels_noise, channels_img, features_g):
super(Generator, self).__init__()
self.net = nn.Sequential(
# Input: N x channels_noise x 1 x 1
self._block(channels_noise, features_g * 16, 4, 1, 0), # img: 4x4
self._block(features_g * 16, features_g * 8, 4, 2, 1), # img: 8x8
self._block(features_g * 8, features_g * 4, 4, 2, 1), # img: 16x16
self._block(features_g * 4, features_g * 2, 4, 2, 1), # img: 32x32
nn.ConvTranspose2d(
features_g * 2, channels_img, kernel_size=4, stride=2, padding=1
),
# Output: N x channels_img x 64 x 64
nn.Tanh(),
)
def _block(self, in_channels, out_channels, kernel_size, stride, padding):
return nn.Sequential(
nn.ConvTranspose2d(
in_channels,
out_channels,
kernel_size,
stride,
padding,
bias=False,
),
#nn.BatchNorm2d(out_channels),
nn.ReLU(),
)
def forward(self, x):
return self.net(x)
def initialize_weights(model):
# Initializes weights according to the DCGAN paper
for m in model.modules():
if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d)):
nn.init.normal_(m.weight.data, 0.0, 0.02)
def test():
N, in_channels, H, W = 8, 3, 64, 64
noise_dim = 100
x = torch.randn((N, in_channels, H, W))
disc = Discriminator(in_channels, 8)
assert disc(x).shape == (N, 1, 1, 1), "Discriminator test failed"
gen = Generator(noise_dim, in_channels, 8)
z = torch.randn((N, noise_dim, 1, 1))
assert gen(z).shape == (N, in_channels, H, W), "Generator test failed"
# test()

View File

@@ -0,0 +1,105 @@
"""
Training of DCGAN network on MNIST dataset with Discriminator
and Generator imported from models.py
"""
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from model import Discriminator, Generator, initialize_weights
# Hyperparameters etc.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LEARNING_RATE = 2e-4 # could also use two lrs, one for gen and one for disc
BATCH_SIZE = 128
IMAGE_SIZE = 64
CHANNELS_IMG = 1
NOISE_DIM = 100
NUM_EPOCHS = 5
FEATURES_DISC = 64
FEATURES_GEN = 64
transforms = transforms.Compose(
[
transforms.Resize(IMAGE_SIZE),
transforms.ToTensor(),
transforms.Normalize(
[0.5 for _ in range(CHANNELS_IMG)], [0.5 for _ in range(CHANNELS_IMG)]
),
]
)
# If you train on MNIST, remember to set channels_img to 1
dataset = datasets.MNIST(root="dataset/", train=True, transform=transforms,
download=True)
# comment out MNIST above and uncomment below to train on CelebA
#dataset = datasets.ImageFolder(root="celeb_dataset", transform=transforms)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
gen = Generator(NOISE_DIM, CHANNELS_IMG, FEATURES_GEN).to(device)
disc = Discriminator(CHANNELS_IMG, FEATURES_DISC).to(device)
initialize_weights(gen)
initialize_weights(disc)
opt_gen = optim.Adam(gen.parameters(), lr=LEARNING_RATE, betas=(0.5, 0.999))
opt_disc = optim.Adam(disc.parameters(), lr=LEARNING_RATE, betas=(0.5, 0.999))
criterion = nn.BCELoss()
fixed_noise = torch.randn(32, NOISE_DIM, 1, 1).to(device)
writer_real = SummaryWriter(f"logs/real")
writer_fake = SummaryWriter(f"logs/fake")
step = 0
gen.train()
disc.train()
for epoch in range(NUM_EPOCHS):
# Target labels not needed! <3 unsupervised
for batch_idx, (real, _) in enumerate(dataloader):
real = real.to(device)
noise = torch.randn(BATCH_SIZE, NOISE_DIM, 1, 1).to(device)
fake = gen(noise)
### Train Discriminator: max log(D(x)) + log(1 - D(G(z)))
disc_real = disc(real).reshape(-1)
loss_disc_real = criterion(disc_real, torch.ones_like(disc_real))
disc_fake = disc(fake.detach()).reshape(-1)
loss_disc_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
loss_disc = (loss_disc_real + loss_disc_fake) / 2
disc.zero_grad()
loss_disc.backward()
opt_disc.step()
### Train Generator: min log(1 - D(G(z))) <-> max log(D(G(z))
output = disc(fake).reshape(-1)
loss_gen = criterion(output, torch.ones_like(output))
gen.zero_grad()
loss_gen.backward()
opt_gen.step()
# Print losses occasionally and print to tensorboard
if batch_idx % 100 == 0:
print(
f"Epoch [{epoch}/{NUM_EPOCHS}] Batch {batch_idx}/{len(dataloader)} \
Loss D: {loss_disc:.4f}, loss G: {loss_gen:.4f}"
)
with torch.no_grad():
fake = gen(fixed_noise)
# take out (up to) 32 examples
img_grid_real = torchvision.utils.make_grid(
real[:32], normalize=True
)
img_grid_fake = torchvision.utils.make_grid(
fake[:32], normalize=True
)
writer_real.add_image("Real", img_grid_real, global_step=step)
writer_fake.add_image("Fake", img_grid_fake, global_step=step)
step += 1

View File

@@ -0,0 +1,98 @@
"""
Discriminator and Generator implementation from DCGAN paper,
with the Sigmoid() output removed from the Discriminator (and therefore
it should be called a critic)
"""
import torch
import torch.nn as nn
class Discriminator(nn.Module):
def __init__(self, channels_img, features_d):
super(Discriminator, self).__init__()
self.disc = nn.Sequential(
# input: N x channels_img x 64 x 64
nn.Conv2d(
channels_img, features_d, kernel_size=4, stride=2, padding=1
),
nn.LeakyReLU(0.2),
# _block(in_channels, out_channels, kernel_size, stride, padding)
self._block(features_d, features_d * 2, 4, 2, 1),
self._block(features_d * 2, features_d * 4, 4, 2, 1),
self._block(features_d * 4, features_d * 8, 4, 2, 1),
# After all _block img output is 4x4 (Conv2d below makes into 1x1)
nn.Conv2d(features_d * 8, 1, kernel_size=4, stride=2, padding=0),
)
def _block(self, in_channels, out_channels, kernel_size, stride, padding):
return nn.Sequential(
nn.Conv2d(
in_channels,
out_channels,
kernel_size,
stride,
padding,
bias=False,
),
nn.InstanceNorm2d(out_channels, affine=True),
nn.LeakyReLU(0.2),
)
def forward(self, x):
return self.disc(x)
class Generator(nn.Module):
def __init__(self, channels_noise, channels_img, features_g):
super(Generator, self).__init__()
self.net = nn.Sequential(
# Input: N x channels_noise x 1 x 1
self._block(channels_noise, features_g * 16, 4, 1, 0), # img: 4x4
self._block(features_g * 16, features_g * 8, 4, 2, 1), # img: 8x8
self._block(features_g * 8, features_g * 4, 4, 2, 1), # img: 16x16
self._block(features_g * 4, features_g * 2, 4, 2, 1), # img: 32x32
nn.ConvTranspose2d(
features_g * 2, channels_img, kernel_size=4, stride=2, padding=1
),
# Output: N x channels_img x 64 x 64
nn.Tanh(),
)
def _block(self, in_channels, out_channels, kernel_size, stride, padding):
return nn.Sequential(
nn.ConvTranspose2d(
in_channels,
out_channels,
kernel_size,
stride,
padding,
bias=False,
),
nn.BatchNorm2d(out_channels),
nn.ReLU(),
)
def forward(self, x):
return self.net(x)
def initialize_weights(model):
# Initializes weights according to the DCGAN paper
for m in model.modules():
if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d)):
nn.init.normal_(m.weight.data, 0.0, 0.02)
def test():
N, in_channels, H, W = 8, 3, 64, 64
noise_dim = 100
x = torch.randn((N, in_channels, H, W))
disc = Discriminator(in_channels, 8)
assert disc(x).shape == (N, 1, 1, 1), "Discriminator test failed"
gen = Generator(noise_dim, in_channels, 8)
z = torch.randn((N, noise_dim, 1, 1))
assert gen(z).shape == (N, in_channels, H, W), "Generator test failed"
# test()

View File

@@ -0,0 +1,114 @@
"""
Training of DCGAN network with WGAN loss
"""
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from model import Discriminator, Generator, initialize_weights
# Hyperparameters etc
device = "cuda" if torch.cuda.is_available() else "cpu"
LEARNING_RATE = 5e-5
BATCH_SIZE = 64
IMAGE_SIZE = 64
CHANNELS_IMG = 1
Z_DIM = 128
NUM_EPOCHS = 5
FEATURES_CRITIC = 64
FEATURES_GEN = 64
CRITIC_ITERATIONS = 5
WEIGHT_CLIP = 0.01
transforms = transforms.Compose(
[
transforms.Resize(IMAGE_SIZE),
transforms.ToTensor(),
transforms.Normalize(
[0.5 for _ in range(CHANNELS_IMG)], [0.5 for _ in range(CHANNELS_IMG)]
),
]
)
dataset = datasets.MNIST(root="dataset/", transform=transforms, download=True)
#comment mnist and uncomment below if you want to train on CelebA dataset
#dataset = datasets.ImageFolder(root="celeb_dataset", transform=transforms)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
# initialize gen and disc/critic
gen = Generator(Z_DIM, CHANNELS_IMG, FEATURES_GEN).to(device)
critic = Discriminator(CHANNELS_IMG, FEATURES_CRITIC).to(device)
initialize_weights(gen)
initialize_weights(critic)
# initialize optimizer
opt_gen = optim.RMSprop(gen.parameters(), lr=LEARNING_RATE)
opt_critic = optim.RMSprop(critic.parameters(), lr=LEARNING_RATE)
# for tensorboard plotting
fixed_noise = torch.randn(32, Z_DIM, 1, 1).to(device)
writer_real = SummaryWriter(f"logs/real")
writer_fake = SummaryWriter(f"logs/fake")
step = 0
gen.train()
critic.train()
for epoch in range(NUM_EPOCHS):
# Target labels not needed! <3 unsupervised
for batch_idx, (data, _) in enumerate(loader):
data = data.to(device)
cur_batch_size = data.shape[0]
# Train Critic: max E[critic(real)] - E[critic(fake)]
for _ in range(CRITIC_ITERATIONS):
noise = torch.randn(cur_batch_size, Z_DIM, 1, 1).to(device)
fake = gen(noise)
critic_real = critic(data).reshape(-1)
critic_fake = critic(fake).reshape(-1)
loss_critic = -(torch.mean(critic_real) - torch.mean(critic_fake))
critic.zero_grad()
loss_critic.backward(retain_graph=True)
opt_critic.step()
# clip critic weights between -0.01, 0.01
for p in critic.parameters():
p.data.clamp_(-WEIGHT_CLIP, WEIGHT_CLIP)
# Train Generator: max E[critic(gen_fake)] <-> min -E[critic(gen_fake)]
gen_fake = critic(fake).reshape(-1)
loss_gen = -torch.mean(gen_fake)
gen.zero_grad()
loss_gen.backward()
opt_gen.step()
# Print losses occasionally and print to tensorboard
if batch_idx % 100 == 0 and batch_idx > 0:
gen.eval()
critic.eval()
print(
f"Epoch [{epoch}/{NUM_EPOCHS}] Batch {batch_idx}/{len(loader)} \
Loss D: {loss_critic:.4f}, loss G: {loss_gen:.4f}"
)
with torch.no_grad():
fake = gen(noise)
# take out (up to) 32 examples
img_grid_real = torchvision.utils.make_grid(
data[:32], normalize=True
)
img_grid_fake = torchvision.utils.make_grid(
fake[:32], normalize=True
)
writer_real.add_image("Real", img_grid_real, global_step=step)
writer_fake.add_image("Fake", img_grid_fake, global_step=step)
step += 1
gen.train()
critic.train()

View File

@@ -0,0 +1,84 @@
"""
Discriminator and Generator implementation from DCGAN paper
"""
import torch
import torch.nn as nn
class Discriminator(nn.Module):
def __init__(self, channels_img, features_d):
super(Discriminator, self).__init__()
self.disc = nn.Sequential(
# input: N x channels_img x 64 x 64
nn.Conv2d(channels_img, features_d, kernel_size=4, stride=2, padding=1),
nn.LeakyReLU(0.2),
# _block(in_channels, out_channels, kernel_size, stride, padding)
self._block(features_d, features_d * 2, 4, 2, 1),
self._block(features_d * 2, features_d * 4, 4, 2, 1),
self._block(features_d * 4, features_d * 8, 4, 2, 1),
# After all _block img output is 4x4 (Conv2d below makes into 1x1)
nn.Conv2d(features_d * 8, 1, kernel_size=4, stride=2, padding=0),
)
def _block(self, in_channels, out_channels, kernel_size, stride, padding):
return nn.Sequential(
nn.Conv2d(
in_channels, out_channels, kernel_size, stride, padding, bias=False,
),
nn.InstanceNorm2d(out_channels, affine=True),
nn.LeakyReLU(0.2),
)
def forward(self, x):
return self.disc(x)
class Generator(nn.Module):
def __init__(self, channels_noise, channels_img, features_g):
super(Generator, self).__init__()
self.net = nn.Sequential(
# Input: N x channels_noise x 1 x 1
self._block(channels_noise, features_g * 16, 4, 1, 0), # img: 4x4
self._block(features_g * 16, features_g * 8, 4, 2, 1), # img: 8x8
self._block(features_g * 8, features_g * 4, 4, 2, 1), # img: 16x16
self._block(features_g * 4, features_g * 2, 4, 2, 1), # img: 32x32
nn.ConvTranspose2d(
features_g * 2, channels_img, kernel_size=4, stride=2, padding=1
),
# Output: N x channels_img x 64 x 64
nn.Tanh(),
)
def _block(self, in_channels, out_channels, kernel_size, stride, padding):
return nn.Sequential(
nn.ConvTranspose2d(
in_channels, out_channels, kernel_size, stride, padding, bias=False,
),
nn.BatchNorm2d(out_channels),
nn.ReLU(),
)
def forward(self, x):
return self.net(x)
def initialize_weights(model):
# Initializes weights according to the DCGAN paper
for m in model.modules():
if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d)):
nn.init.normal_(m.weight.data, 0.0, 0.02)
def test():
N, in_channels, H, W = 8, 3, 64, 64
noise_dim = 100
x = torch.randn((N, in_channels, H, W))
disc = Discriminator(in_channels, 8)
assert disc(x).shape == (N, 1, 1, 1), "Discriminator test failed"
gen = Generator(noise_dim, in_channels, 8)
z = torch.randn((N, noise_dim, 1, 1))
assert gen(z).shape == (N, in_channels, H, W), "Generator test failed"
# test()

View File

@@ -0,0 +1,111 @@
"""
Training of WGAN-GP
"""
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from utils import gradient_penalty, save_checkpoint, load_checkpoint
from model import Discriminator, Generator, initialize_weights
# Hyperparameters etc.
device = "cuda" if torch.cuda.is_available() else "cpu"
LEARNING_RATE = 1e-4
BATCH_SIZE = 64
IMAGE_SIZE = 64
CHANNELS_IMG = 1
Z_DIM = 100
NUM_EPOCHS = 100
FEATURES_CRITIC = 16
FEATURES_GEN = 16
CRITIC_ITERATIONS = 5
LAMBDA_GP = 10
transforms = transforms.Compose(
[
transforms.Resize(IMAGE_SIZE),
transforms.ToTensor(),
transforms.Normalize(
[0.5 for _ in range(CHANNELS_IMG)], [0.5 for _ in range(CHANNELS_IMG)]),
]
)
dataset = datasets.MNIST(root="dataset/", transform=transforms, download=True)
# comment mnist above and uncomment below for training on CelebA
#dataset = datasets.ImageFolder(root="celeb_dataset", transform=transforms)
loader = DataLoader(
dataset,
batch_size=BATCH_SIZE,
shuffle=True,
)
# initialize gen and disc, note: discriminator should be called critic,
# according to WGAN paper (since it no longer outputs between [0, 1])
gen = Generator(Z_DIM, CHANNELS_IMG, FEATURES_GEN).to(device)
critic = Discriminator(CHANNELS_IMG, FEATURES_CRITIC).to(device)
initialize_weights(gen)
initialize_weights(critic)
# initialize optimizer
opt_gen = optim.Adam(gen.parameters(), lr=LEARNING_RATE, betas=(0.0, 0.9))
opt_critic = optim.Adam(critic.parameters(), lr=LEARNING_RATE, betas=(0.0, 0.9))
# for tensorboard plotting
fixed_noise = torch.randn(32, Z_DIM, 1, 1).to(device)
writer_real = SummaryWriter(f"logs/GAN_MNIST/real")
writer_fake = SummaryWriter(f"logs/GAN_MNIST/fake")
step = 0
gen.train()
critic.train()
for epoch in range(NUM_EPOCHS):
# Target labels not needed! <3 unsupervised
for batch_idx, (real, _) in enumerate(loader):
real = real.to(device)
cur_batch_size = real.shape[0]
# Train Critic: max E[critic(real)] - E[critic(fake)]
# equivalent to minimizing the negative of that
for _ in range(CRITIC_ITERATIONS):
noise = torch.randn(cur_batch_size, Z_DIM, 1, 1).to(device)
fake = gen(noise)
critic_real = critic(real).reshape(-1)
critic_fake = critic(fake).reshape(-1)
gp = gradient_penalty(critic, real, fake, device=device)
loss_critic = (
-(torch.mean(critic_real) - torch.mean(critic_fake)) + LAMBDA_GP * gp
)
critic.zero_grad()
loss_critic.backward(retain_graph=True)
opt_critic.step()
# Train Generator: max E[critic(gen_fake)] <-> min -E[critic(gen_fake)]
gen_fake = critic(fake).reshape(-1)
loss_gen = -torch.mean(gen_fake)
gen.zero_grad()
loss_gen.backward()
opt_gen.step()
# Print losses occasionally and print to tensorboard
if batch_idx % 100 == 0 and batch_idx > 0:
print(
f"Epoch [{epoch}/{NUM_EPOCHS}] Batch {batch_idx}/{len(loader)} \
Loss D: {loss_critic:.4f}, loss G: {loss_gen:.4f}"
)
with torch.no_grad():
fake = gen(fixed_noise)
# take out (up to) 32 examples
img_grid_real = torchvision.utils.make_grid(real[:32], normalize=True)
img_grid_fake = torchvision.utils.make_grid(fake[:32], normalize=True)
writer_real.add_image("Real", img_grid_real, global_step=step)
writer_fake.add_image("Fake", img_grid_fake, global_step=step)
step += 1

View File

@@ -0,0 +1,35 @@
import torch
import torch.nn as nn
def gradient_penalty(critic, real, fake, device="cpu"):
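    # WGAN-GP penalty: E[(||grad of critic(x_hat) w.r.t. x_hat||_2 - 1)^2], where x_hat
    # is a random interpolation between real and fake images (Gulrajani et al., 2017)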
BATCH_SIZE, C, H, W = real.shape
alpha = torch.rand((BATCH_SIZE, 1, 1, 1)).repeat(1, C, H, W).to(device)
interpolated_images = real * alpha + fake * (1 - alpha)
# Calculate critic scores
mixed_scores = critic(interpolated_images)
# Take the gradient of the scores with respect to the images
gradient = torch.autograd.grad(
inputs=interpolated_images,
outputs=mixed_scores,
grad_outputs=torch.ones_like(mixed_scores),
create_graph=True,
retain_graph=True,
)[0]
gradient = gradient.view(gradient.shape[0], -1)
gradient_norm = gradient.norm(2, dim=1)
gradient_penalty = torch.mean((gradient_norm - 1) ** 2)
return gradient_penalty
def save_checkpoint(state, filename="celeba_wgan_gp.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, gen, disc):
print("=> Loading checkpoint")
gen.load_state_dict(checkpoint['gen'])
disc.load_state_dict(checkpoint['disc'])

View File

@@ -0,0 +1,205 @@
"""
Implementation of ProGAN generator and discriminator with the key
contributions from the paper. We have tried to make the implementation
compact, but a goal is also to keep it readable and understandable.
Specifically the key points implemented are:
1) Progressive growing (of model and layers)
2) Minibatch std on Discriminator
3) Normalization with PixelNorm
4) Equalized Learning Rate (here I cheated and only did it on Conv layers)
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from math import log2
"""
factors is used in the Discriminator and Generator to decide how much
the channels should be multiplied and expanded for each layer.
Specifically, in the first 5 layers the channels stay the same,
whereas as we increase the img_size (towards the later layers)
we decrease the number of channels by 1/2, 1/4, etc.
"""
factors = [1, 1, 1, 1, 1/2, 1/4, 1/4, 1/8, 1/16]
class WSConv2d(nn.Module):
"""
Weight scaled Conv2d (Equalized Learning Rate)
    Note that the input is scaled rather than the weights being changed;
    this gives the same result.
Inspired by:
https://github.com/nvnbny/progressive_growing_of_gans/blob/master/modelUtils.py
"""
def __init__(
self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, gain=2
):
super(WSConv2d, self).__init__()
self.conv = nn.Conv2d(
in_channels, out_channels, kernel_size, stride, padding
)
self.scale = (gain / (self.conv.weight[0].numel())) ** 0.5
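        # scale = sqrt(gain / fan_in) with fan_in = in_channels * kernel_size^2,
        # i.e. He-style scaling applied at runtime instead of baked into the weights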
# initialize conv layer
nn.init.normal_(self.conv.weight)
nn.init.zeros_(self.conv.bias)
def forward(self, x):
return self.conv(x * self.scale)
class PixelNorm(nn.Module):
def __init__(self):
super(PixelNorm, self).__init__()
self.epsilon = 1e-8
def forward(self, x):
return x / torch.sqrt(
torch.mean(x ** 2, dim=1, keepdim=True) + self.epsilon
)
class ConvBlock(nn.Module):
def __init__(self, in_channels, out_channels, use_pixelnorm=True):
super(ConvBlock, self).__init__()
self.use_pn = use_pixelnorm
self.conv1 = WSConv2d(in_channels, out_channels)
self.conv2 = WSConv2d(out_channels, out_channels)
self.leaky = nn.LeakyReLU(0.2)
self.pn = PixelNorm()
def forward(self, x):
x = self.leaky(self.conv1(x))
x = self.pn(x) if self.use_pn else x
x = self.leaky(self.conv2(x))
x = self.pn(x) if self.use_pn else x
return x
class Generator(nn.Module):
def __init__(self, z_dim, in_channels, img_size, img_channels=3):
super(Generator, self).__init__()
self.prog_blocks, self.rgb_layers = nn.ModuleList([]), nn.ModuleList([])
# initial takes 1x1 -> 4x4
self.initial = nn.Sequential(
nn.ConvTranspose2d(z_dim, in_channels, 4, 1, 0),
nn.LeakyReLU(0.2),
PixelNorm(),
)
# Create progression blocks and rgb layers
channels = in_channels
        # we need to double the image size log2(img_size/4) times, and the
        # +1 in the loop is for the initial 4x4 resolution
for idx in range(int(log2(img_size/4)) + 1):
conv_in = channels
conv_out = int(in_channels*factors[idx])
self.prog_blocks.append(ConvBlock(conv_in, conv_out))
self.rgb_layers.append(WSConv2d(conv_out, img_channels, kernel_size=1, stride=1, padding=0))
channels = conv_out
def fade_in(self, alpha, upscaled, generated):
#assert 0 <= alpha <= 1, "Alpha not between 0 and 1"
#assert upscaled.shape == generated.shape
return torch.tanh(alpha * generated + (1 - alpha) * upscaled)
def forward(self, x, alpha, steps):
upscaled = self.initial(x)
out = self.prog_blocks[0](upscaled)
if steps == 0:
return self.rgb_layers[0](out)
for step in range(1, steps+1):
upscaled = F.interpolate(out, scale_factor=2, mode="nearest")
out = self.prog_blocks[step](upscaled)
# The number of channels in upscale will stay the same, while
# out which has moved through prog_blocks might change. To ensure
# we can convert both to rgb we use different rgb_layers
# (steps-1) and steps for upscaled, out respectively
final_upscaled = self.rgb_layers[steps - 1](upscaled)
final_out = self.rgb_layers[steps](out)
return self.fade_in(alpha, final_upscaled, final_out)
class Discriminator(nn.Module):
def __init__(self, img_size, z_dim, in_channels, img_channels=3):
super(Discriminator, self).__init__()
self.prog_blocks, self.rgb_layers = nn.ModuleList([]), nn.ModuleList([])
# Create progression blocks and rgb layers
channels = in_channels
for idx in range(int(log2(img_size/4)) + 1):
conv_in = int(in_channels * factors[idx])
conv_out = channels
self.rgb_layers.append(WSConv2d(img_channels, conv_in, kernel_size=1, stride=1, padding=0))
self.prog_blocks.append(ConvBlock(conv_in, conv_out, use_pixelnorm=False))
channels = conv_in
self.avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)
# +1 to in_channels because we concatenate from minibatch std
self.conv = WSConv2d(in_channels + 1, z_dim, kernel_size=4, stride=1, padding=0)
self.linear = nn.Linear(z_dim, 1)
def fade_in(self, alpha, downscaled, out):
"""Used to fade in downscaled using avgpooling and output from CNN"""
#assert 0 <= alpha <= 1, "Alpha needs to be between [0, 1]"
#assert downscaled.shape == out.shape
return alpha * out + (1 - alpha) * downscaled
def minibatch_std(self, x):
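        # Adds one extra feature map containing the std over the batch (averaged over all
        # channels and pixels), so the shape goes N x C x H x W -> N x (C+1) x H x W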
batch_statistics = (
torch.std(x, dim=0)
.mean()
.repeat(x.shape[0], 1, x.shape[2], x.shape[3])
)
return torch.cat([x, batch_statistics], dim=1)
def forward(self, x, alpha, steps):
out = self.rgb_layers[steps](x) # convert from rgb as initial step
if steps == 0: # i.e, image is 4x4
out = self.minibatch_std(out)
out = self.conv(out)
return self.linear(out.view(-1, out.shape[1]))
# index steps which has the "reverse" fade_in
downscaled = self.rgb_layers[steps - 1](self.avg_pool(x))
out = self.avg_pool(self.prog_blocks[steps](out))
out = self.fade_in(alpha, downscaled, out)
for step in range(steps - 1, 0, -1):
downscaled = self.avg_pool(out)
out = self.prog_blocks[step](downscaled)
out = self.minibatch_std(out)
out = self.conv(out)
return self.linear(out.view(-1, out.shape[1]))
if __name__ == "__main__":
import time
Z_DIM = 100
IN_CHANNELS = 16
img_size = 512
num_steps = int(log2(img_size / 4))
x = torch.randn((5, Z_DIM, 1, 1))
gen = Generator(Z_DIM, IN_CHANNELS, img_size=img_size)
disc = Discriminator(img_size, Z_DIM, IN_CHANNELS)
start = time.time()
with torch.autograd.profiler.profile(use_cuda=True) as prof:
z = gen(x, alpha=0.5, steps=num_steps)
print(prof)
gen_time = time.time()-start
t = time.time()
out = disc(z, 0.01, num_steps)
disc_time = time.time()-t
print(gen_time, disc_time)
#print(disc(z, 0.01, num_steps).shape)

View File

@@ -0,0 +1,5 @@
def func(x=1, y=2, **kwargs):
print(x, y)
print(func(x=3, y=4))

View File

@@ -0,0 +1,165 @@
""" Training of ProGAN using WGAN-GP loss"""
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from utils import gradient_penalty, plot_to_tensorboard, save_checkpoint, load_checkpoint
from model import Discriminator, Generator
from math import log2
from tqdm import tqdm
import time
torch.backends.cudnn.benchmark = True
torch.manual_seed(0)
# Hyperparameters etc.
device = "cuda" if torch.cuda.is_available() else "cpu"
LEARNING_RATE = 1e-4
BATCH_SIZES = [128, 128, 64, 16, 8, 4, 2, 2, 1]
IMAGE_SIZE = 128
CHANNELS_IMG = 3
Z_DIM = 128
IN_CHANNELS = 128
CRITIC_ITERATIONS = 1
LAMBDA_GP = 10
NUM_STEPS = int(log2(IMAGE_SIZE / 4)) + 1
PROGRESSIVE_EPOCHS = [2 ** i for i in range(int(log2(IMAGE_SIZE / 4) + 1))]
PROGRESSIVE_EPOCHS = [8 for i in range(int(log2(IMAGE_SIZE / 4) + 1))]  # overrides the schedule above with a fixed 8 epochs per resolution
fixed_noise = torch.randn(8, Z_DIM, 1, 1).to(device)
NUM_WORKERS = 4
def get_loader(image_size):
transform = transforms.Compose(
[
transforms.Resize((image_size, image_size)),
transforms.ToTensor(),
transforms.Normalize(
[0.5 for _ in range(CHANNELS_IMG)],
[0.5 for _ in range(CHANNELS_IMG)],
),
]
)
batch_size = BATCH_SIZES[int(log2(image_size/4))]
dataset = datasets.ImageFolder(root="celeb_dataset", transform=transform)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
return loader, dataset
def train_fn(
critic,
gen,
loader,
dataset,
step,
alpha,
opt_critic,
opt_gen,
tensorboard_step,
writer,
):
start = time.time()
total_time = 0
training = tqdm(loader, leave=True)
for batch_idx, (real, _) in enumerate(training):
real = real.to(device)
cur_batch_size = real.shape[0]
model_start = time.time()
# Train Critic: max E[critic(real)] - E[critic(fake)]
# which is equivalent to minimizing the negative of the expression
for _ in range(CRITIC_ITERATIONS):
critic.zero_grad()
noise = torch.randn(cur_batch_size, Z_DIM, 1, 1).to(device)
fake = gen(noise, alpha, step)
critic_real = critic(real, alpha, step).reshape(-1)
critic_fake = critic(fake, alpha, step).reshape(-1)
gp = gradient_penalty(critic, real, fake, alpha, step, device=device)
loss_critic = (
-(torch.mean(critic_real) - torch.mean(critic_fake))
+ LAMBDA_GP * gp
)
loss_critic.backward(retain_graph=True)
opt_critic.step()
# Train Generator: max E[critic(gen_fake)] <-> min -E[critic(gen_fake)]
gen.zero_grad()
fake = gen(noise, alpha, step)
gen_fake = critic(fake, alpha, step).reshape(-1)
loss_gen = -torch.mean(gen_fake)
loss_gen.backward()
opt_gen.step()
# Update alpha and ensure less than 1
alpha += cur_batch_size / (
(PROGRESSIVE_EPOCHS[step]*0.5) * len(dataset) # - step
)
alpha = min(alpha, 1)
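        # With this schedule alpha ramps linearly from ~0 to 1 over roughly the first half
        # of this resolution's epochs, then stays at 1 (fully faded in)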
total_time += time.time()-model_start
if batch_idx % 300 == 0:
with torch.no_grad():
fixed_fakes = gen(fixed_noise, alpha, step)
plot_to_tensorboard(
writer, loss_critic, loss_gen, real, fixed_fakes, tensorboard_step
)
tensorboard_step += 1
print(f'Fraction spent on model training: {total_time/(time.time()-start)}')
return tensorboard_step, alpha
def main():
# initialize gen and disc, note: discriminator should be called critic,
# according to WGAN paper (since it no longer outputs between [0, 1])
gen = Generator(Z_DIM, IN_CHANNELS, img_size=IMAGE_SIZE, img_channels=CHANNELS_IMG).to(device)
critic = Discriminator(IMAGE_SIZE, Z_DIM, IN_CHANNELS, img_channels=CHANNELS_IMG).to(device)
    # initialize optimizer
opt_gen = optim.Adam(gen.parameters(), lr=LEARNING_RATE, betas=(0.0, 0.99))
opt_critic = optim.Adam(critic.parameters(), lr=LEARNING_RATE, betas=(0.0, 0.99))
# for tensorboard plotting
writer = SummaryWriter(f"logs/gan")
load_checkpoint(torch.load("celeba_wgan_gp.pth.tar"), gen, critic)
gen.train()
critic.train()
tensorboard_step = 0
for step, num_epochs in enumerate(PROGRESSIVE_EPOCHS):
alpha = 0.01
if step < 3:
continue
if step == 4:
print(f"Img size is: {4*2**step}")
loader, dataset = get_loader(4 * 2 ** step)
for epoch in range(num_epochs):
print(f"Epoch [{epoch+1}/{num_epochs}]")
tensorboard_step, alpha = train_fn(
critic,
gen,
loader,
dataset,
step,
alpha,
opt_critic,
opt_gen,
tensorboard_step,
writer,
)
checkpoint = {'gen': gen.state_dict(),
'critic': critic.state_dict(),
'opt_gen': opt_gen.state_dict(),
'opt_critic': opt_critic.state_dict()}
save_checkpoint(checkpoint)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,54 @@
import torch
import torchvision
import torch.nn as nn
# Log the critic loss and real/fake image grids to tensorboard
def plot_to_tensorboard(
writer, loss_critic, loss_gen, real, fake, tensorboard_step
):
writer.add_scalar("Loss Critic", loss_critic, global_step=tensorboard_step)
with torch.no_grad():
        # take out (up to) 8 examples
img_grid_real = torchvision.utils.make_grid(real[:8], normalize=True)
img_grid_fake = torchvision.utils.make_grid(fake[:8], normalize=True)
writer.add_image("Real", img_grid_real, global_step=tensorboard_step)
writer.add_image("Fake", img_grid_fake, global_step=tensorboard_step)
def gradient_penalty(critic, real, fake, alpha, train_step, device="cpu"):
BATCH_SIZE, C, H, W = real.shape
beta = torch.rand((BATCH_SIZE, 1, 1, 1)).repeat(1, C, H, W).to(device)
interpolated_images = real * beta + fake * (1 - beta)
# Calculate critic scores
mixed_scores = critic(interpolated_images, alpha, train_step)
# Take the gradient of the scores with respect to the images
gradient = torch.autograd.grad(
inputs=interpolated_images,
outputs=mixed_scores,
grad_outputs=torch.ones_like(mixed_scores),
create_graph=True,
retain_graph=True,
)[0]
gradient = gradient.view(gradient.shape[0], -1)
gradient_norm = gradient.norm(2, dim=1)
gradient_penalty = torch.mean((gradient_norm - 1) ** 2)
return gradient_penalty
def save_checkpoint(state, filename="celeba_wgan_gp.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, gen, disc, opt_gen=None, opt_disc=None):
print("=> Loading checkpoint")
gen.load_state_dict(checkpoint['gen'])
disc.load_state_dict(checkpoint['critic'])
    if opt_gen is not None and opt_disc is not None:
opt_gen.load_state_dict(checkpoint['opt_gen'])
opt_disc.load_state_dict(checkpoint['opt_critic'])

Binary file not shown.

After

Width:  |  Height:  |  Size: 101 KiB

View File

@@ -0,0 +1,29 @@
import os
from PIL import Image
from torch.utils.data import Dataset
import numpy as np
class CarvanaDataset(Dataset):
def __init__(self, image_dir, mask_dir, transform=None):
self.image_dir = image_dir
self.mask_dir = mask_dir
self.transform = transform
self.images = os.listdir(image_dir)
def __len__(self):
return len(self.images)
def __getitem__(self, index):
img_path = os.path.join(self.image_dir, self.images[index])
mask_path = os.path.join(self.mask_dir, self.images[index].replace(".jpg", "_mask.gif"))
image = np.array(Image.open(img_path).convert("RGB"))
mask = np.array(Image.open(mask_path).convert("L"), dtype=np.float32)
mask[mask == 255.0] = 1.0
if self.transform is not None:
augmentations = self.transform(image=image, mask=mask)
image = augmentations["image"]
mask = augmentations["mask"]
return image, mask

View File

@@ -0,0 +1,76 @@
import torch
import torch.nn as nn
import torchvision.transforms.functional as TF
class DoubleConv(nn.Module):
def __init__(self, in_channels, out_channels):
super(DoubleConv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
)
def forward(self, x):
return self.conv(x)
class UNET(nn.Module):
def __init__(
self, in_channels=3, out_channels=1, features=[64, 128, 256, 512],
):
super(UNET, self).__init__()
self.ups = nn.ModuleList()
self.downs = nn.ModuleList()
self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
# Down part of UNET
for feature in features:
self.downs.append(DoubleConv(in_channels, feature))
in_channels = feature
# Up part of UNET
for feature in reversed(features):
self.ups.append(
nn.ConvTranspose2d(
feature*2, feature, kernel_size=2, stride=2,
)
)
self.ups.append(DoubleConv(feature*2, feature))
self.bottleneck = DoubleConv(features[-1], features[-1]*2)
self.final_conv = nn.Conv2d(features[0], out_channels, kernel_size=1)
def forward(self, x):
skip_connections = []
for down in self.downs:
x = down(x)
skip_connections.append(x)
x = self.pool(x)
x = self.bottleneck(x)
skip_connections = skip_connections[::-1]
for idx in range(0, len(self.ups), 2):
x = self.ups[idx](x)
skip_connection = skip_connections[idx//2]
if x.shape != skip_connection.shape:
x = TF.resize(x, size=skip_connection.shape[2:])
concat_skip = torch.cat((skip_connection, x), dim=1)
x = self.ups[idx+1](concat_skip)
return self.final_conv(x)
def test():
x = torch.randn((3, 1, 161, 161))
model = UNET(in_channels=1, out_channels=1)
preds = model(x)
assert preds.shape == x.shape
if __name__ == "__main__":
test()

View File

@@ -0,0 +1,124 @@
import torch
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from model import UNET
from utils import (
load_checkpoint,
save_checkpoint,
get_loaders,
check_accuracy,
save_predictions_as_imgs,
)
# Hyperparameters etc.
LEARNING_RATE = 1e-4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16
NUM_EPOCHS = 3
NUM_WORKERS = 2
IMAGE_HEIGHT = 160 # 1280 originally
IMAGE_WIDTH = 240 # 1918 originally
PIN_MEMORY = True
LOAD_MODEL = True
TRAIN_IMG_DIR = "data/train_images/"
TRAIN_MASK_DIR = "data/train_masks/"
VAL_IMG_DIR = "data/val_images/"
VAL_MASK_DIR = "data/val_masks/"
def train_fn(loader, model, optimizer, loss_fn, scaler):
loop = tqdm(loader)
for batch_idx, (data, targets) in enumerate(loop):
data = data.to(device=DEVICE)
targets = targets.float().unsqueeze(1).to(device=DEVICE)
# forward
with torch.cuda.amp.autocast():
predictions = model(data)
loss = loss_fn(predictions, targets)
# backward
optimizer.zero_grad()
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
# update tqdm loop
loop.set_postfix(loss=loss.item())
def main():
train_transform = A.Compose(
[
A.Resize(height=IMAGE_HEIGHT, width=IMAGE_WIDTH),
A.Rotate(limit=35, p=1.0),
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.1),
A.Normalize(
mean=[0.0, 0.0, 0.0],
std=[1.0, 1.0, 1.0],
max_pixel_value=255.0,
),
ToTensorV2(),
],
)
val_transforms = A.Compose(
[
A.Resize(height=IMAGE_HEIGHT, width=IMAGE_WIDTH),
A.Normalize(
mean=[0.0, 0.0, 0.0],
std=[1.0, 1.0, 1.0],
max_pixel_value=255.0,
),
ToTensorV2(),
],
)
model = UNET(in_channels=3, out_channels=1).to(DEVICE)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
train_loader, val_loader = get_loaders(
TRAIN_IMG_DIR,
TRAIN_MASK_DIR,
VAL_IMG_DIR,
VAL_MASK_DIR,
BATCH_SIZE,
train_transform,
val_transforms,
NUM_WORKERS,
PIN_MEMORY,
)
if LOAD_MODEL:
load_checkpoint(torch.load("my_checkpoint.pth.tar"), model)
check_accuracy(val_loader, model, device=DEVICE)
scaler = torch.cuda.amp.GradScaler()
for epoch in range(NUM_EPOCHS):
train_fn(train_loader, model, optimizer, loss_fn, scaler)
# save model
checkpoint = {
"state_dict": model.state_dict(),
"optimizer":optimizer.state_dict(),
}
save_checkpoint(checkpoint)
# check accuracy
check_accuracy(val_loader, model, device=DEVICE)
# print some examples to a folder
save_predictions_as_imgs(
val_loader, model, folder="saved_images/", device=DEVICE
)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,93 @@
import torch
import torchvision
from dataset import CarvanaDataset
from torch.utils.data import DataLoader
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, model):
print("=> Loading checkpoint")
model.load_state_dict(checkpoint["state_dict"])
def get_loaders(
train_dir,
train_maskdir,
val_dir,
val_maskdir,
batch_size,
train_transform,
val_transform,
num_workers=4,
pin_memory=True,
):
train_ds = CarvanaDataset(
image_dir=train_dir,
mask_dir=train_maskdir,
transform=train_transform,
)
train_loader = DataLoader(
train_ds,
batch_size=batch_size,
num_workers=num_workers,
pin_memory=pin_memory,
shuffle=True,
)
val_ds = CarvanaDataset(
image_dir=val_dir,
mask_dir=val_maskdir,
transform=val_transform,
)
val_loader = DataLoader(
val_ds,
batch_size=batch_size,
num_workers=num_workers,
pin_memory=pin_memory,
shuffle=False,
)
return train_loader, val_loader
def check_accuracy(loader, model, device="cuda"):
num_correct = 0
num_pixels = 0
dice_score = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device)
y = y.to(device).unsqueeze(1)
preds = torch.sigmoid(model(x))
preds = (preds > 0.5).float()
num_correct += (preds == y).sum()
num_pixels += torch.numel(preds)
dice_score += (2 * (preds * y).sum()) / (
(preds + y).sum() + 1e-8
)
print(
f"Got {num_correct}/{num_pixels} with acc {num_correct/num_pixels*100:.2f}"
)
print(f"Dice score: {dice_score/len(loader)}")
model.train()
def save_predictions_as_imgs(
loader, model, folder="saved_images/", device="cuda"
):
model.eval()
for idx, (x, y) in enumerate(loader):
x = x.to(device=device)
with torch.no_grad():
preds = torch.sigmoid(model(x))
preds = (preds > 0.5).float()
torchvision.utils.save_image(
preds, f"{folder}/pred_{idx}.png"
)
torchvision.utils.save_image(y.unsqueeze(1), f"{folder}{idx}.png")
model.train()
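# Hedged toy check (not part of the original file): the dice score used above is
# 2*|P∩Y| / (|P|+|Y|) on binarized masks, e.g.:
if __name__ == "__main__":
    preds = torch.tensor([[1.0, 1.0, 0.0, 0.0]])
    y = torch.tensor([[1.0, 0.0, 0.0, 0.0]])
    dice = (2 * (preds * y).sum()) / ((preds + y).sum() + 1e-8)
    print(dice)  # 2*1 / (2+1) = 0.666...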

View File

@@ -0,0 +1,131 @@
"""
Example code of how to code GANs and more specifically DCGAN,
for more information about DCGANs read: https://arxiv.org/abs/1511.06434
We then train the DCGAN on the MNIST dataset (toy dataset of handwritten digits)
and then generate our own. You can apply this more generally on really any dataset
but MNIST is simple enough to get the overall idea.
Video explanation: https://youtu.be/5RYETbFFQ7s
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-20 Initial coding
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
from torch.utils.data import (
DataLoader,
)  # Gives easier dataset management and creates mini batches
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
from model_utils import (
Discriminator,
Generator,
) # Import our models we've defined (from DCGAN paper)
# Hyperparameters
lr = 0.0005
batch_size = 64
image_size = 64
channels_img = 1
channels_noise = 256
num_epochs = 10
# How many feature channels the Generator and Discriminator should use
features_d = 16
features_g = 16
my_transforms = transforms.Compose(
[
transforms.Resize(image_size),
transforms.ToTensor(),
transforms.Normalize((0.5,), (0.5,)),
]
)
dataset = datasets.MNIST(
root="dataset/", train=True, transform=my_transforms, download=True
)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Create discriminator and generator
netD = Discriminator(channels_img, features_d).to(device)
netG = Generator(channels_noise, channels_img, features_g).to(device)
# Setup Optimizer for G and D
optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(0.5, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(0.5, 0.999))
netG.train()
netD.train()
criterion = nn.BCELoss()
real_label = 1
fake_label = 0
fixed_noise = torch.randn(64, channels_noise, 1, 1).to(device)
writer_real = SummaryWriter(f"runs/GAN_MNIST/test_real")
writer_fake = SummaryWriter(f"runs/GAN_MNIST/test_fake")
step = 0
print("Starting Training...")
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(dataloader):
data = data.to(device)
batch_size = data.shape[0]
### Train Discriminator: max log(D(x)) + log(1 - D(G(z)))
netD.zero_grad()
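        # Label smoothing: targets of 0.9 for real and 0.1 for fake (below) are used
        # instead of hard 1/0 labels to stabilize discriminator training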
label = (torch.ones(batch_size) * 0.9).to(device)
output = netD(data).reshape(-1)
lossD_real = criterion(output, label)
D_x = output.mean().item()
noise = torch.randn(batch_size, channels_noise, 1, 1).to(device)
fake = netG(noise)
label = (torch.ones(batch_size) * 0.1).to(device)
output = netD(fake.detach()).reshape(-1)
lossD_fake = criterion(output, label)
lossD = lossD_real + lossD_fake
lossD.backward()
optimizerD.step()
### Train Generator: max log(D(G(z)))
netG.zero_grad()
label = torch.ones(batch_size).to(device)
output = netD(fake).reshape(-1)
lossG = criterion(output, label)
lossG.backward()
optimizerG.step()
        # Print losses occasionally and print to tensorboard
if batch_idx % 100 == 0:
step += 1
print(
f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(dataloader)} \
Loss D: {lossD:.4f}, loss G: {lossG:.4f} D(x): {D_x:.4f}"
)
with torch.no_grad():
fake = netG(fixed_noise)
img_grid_real = torchvision.utils.make_grid(data[:32], normalize=True)
img_grid_fake = torchvision.utils.make_grid(fake[:32], normalize=True)
writer_real.add_image(
"Mnist Real Images", img_grid_real, global_step=step
)
writer_fake.add_image(
"Mnist Fake Images", img_grid_fake, global_step=step
)

View File

@@ -0,0 +1,4 @@
### Generative Adversarial Network
DCGAN_mnist.py: main file for training the network
model_utils.py: Generator and discriminator implementation

View File

@@ -0,0 +1,76 @@
"""
Discriminator and Generator implementation from DCGAN paper
that we import in the main (DCGAN_mnist.py) file.
"""
import torch
import torch.nn as nn
class Discriminator(nn.Module):
def __init__(self, channels_img, features_d):
super(Discriminator, self).__init__()
self.net = nn.Sequential(
# N x channels_img x 64 x 64
nn.Conv2d(channels_img, features_d, kernel_size=4, stride=2, padding=1),
nn.LeakyReLU(0.2),
# N x features_d x 32 x 32
nn.Conv2d(features_d, features_d * 2, kernel_size=4, stride=2, padding=1),
nn.BatchNorm2d(features_d * 2),
nn.LeakyReLU(0.2),
nn.Conv2d(
features_d * 2, features_d * 4, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_d * 4),
nn.LeakyReLU(0.2),
nn.Conv2d(
features_d * 4, features_d * 8, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_d * 8),
nn.LeakyReLU(0.2),
# N x features_d*8 x 4 x 4
nn.Conv2d(features_d * 8, 1, kernel_size=4, stride=2, padding=0),
# N x 1 x 1 x 1
nn.Sigmoid(),
)
def forward(self, x):
return self.net(x)
class Generator(nn.Module):
def __init__(self, channels_noise, channels_img, features_g):
super(Generator, self).__init__()
self.net = nn.Sequential(
# N x channels_noise x 1 x 1
nn.ConvTranspose2d(
channels_noise, features_g * 16, kernel_size=4, stride=1, padding=0
),
nn.BatchNorm2d(features_g * 16),
nn.ReLU(),
# N x features_g*16 x 4 x 4
nn.ConvTranspose2d(
features_g * 16, features_g * 8, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_g * 8),
nn.ReLU(),
nn.ConvTranspose2d(
features_g * 8, features_g * 4, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_g * 4),
nn.ReLU(),
nn.ConvTranspose2d(
features_g * 4, features_g * 2, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_g * 2),
nn.ReLU(),
nn.ConvTranspose2d(
features_g * 2, channels_img, kernel_size=4, stride=2, padding=1
),
# N x channels_img x 64 x 64
nn.Tanh(),
)
def forward(self, x):
return self.net(x)
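# Hedged sanity check (not part of the original file): verify the shapes promised
# by the comments above with the hyperparameters used in DCGAN_mnist.py.
if __name__ == "__main__":
    N, channels_img, channels_noise = 8, 1, 256
    disc = Discriminator(channels_img, features_d=16)
    gen = Generator(channels_noise, channels_img, features_g=16)
    x = torch.randn(N, channels_img, 64, 64)
    z = torch.randn(N, channels_noise, 1, 1)
    assert disc(x).shape == (N, 1, 1, 1)
    assert gen(z).shape == (N, channels_img, 64, 64)
    print("Success, shapes match")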

View File

@@ -0,0 +1,242 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")
def tokenize_ger(text):
return [tok.text for tok in spacy_ger.tokenizer(text)]
def tokenize_eng(text):
return [tok.text for tok in spacy_eng.tokenizer(text)]
german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")
english = Field(
tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>"
)
train_data, valid_data, test_data = Multi30k.splits(
exts=(".de", ".en"), fields=(german, english)
)
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)
class Encoder(nn.Module):
def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
super(Encoder, self).__init__()
self.dropout = nn.Dropout(p)
self.hidden_size = hidden_size
self.num_layers = num_layers
self.embedding = nn.Embedding(input_size, embedding_size)
self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
def forward(self, x):
# x shape: (seq_length, N) where N is batch size
embedding = self.dropout(self.embedding(x))
# embedding shape: (seq_length, N, embedding_size)
outputs, (hidden, cell) = self.rnn(embedding)
# outputs shape: (seq_length, N, hidden_size)
return hidden, cell
class Decoder(nn.Module):
def __init__(
self, input_size, embedding_size, hidden_size, output_size, num_layers, p
):
super(Decoder, self).__init__()
self.dropout = nn.Dropout(p)
self.hidden_size = hidden_size
self.num_layers = num_layers
self.embedding = nn.Embedding(input_size, embedding_size)
self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
self.fc = nn.Linear(hidden_size, output_size)
def forward(self, x, hidden, cell):
        # x shape: (N) where N is the batch size; we want it to be (1, N) since
        # seq_length is 1 here because we send in a single word and not a sentence
x = x.unsqueeze(0)
embedding = self.dropout(self.embedding(x))
# embedding shape: (1, N, embedding_size)
outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
# outputs shape: (1, N, hidden_size)
predictions = self.fc(outputs)
# predictions shape: (1, N, length_target_vocabulary) to send it to
# loss function we want it to be (N, length_target_vocabulary) so we're
# just gonna remove the first dim
predictions = predictions.squeeze(0)
return predictions, hidden, cell
class Seq2Seq(nn.Module):
def __init__(self, encoder, decoder):
super(Seq2Seq, self).__init__()
self.encoder = encoder
self.decoder = decoder
def forward(self, source, target, teacher_force_ratio=0.5):
batch_size = source.shape[1]
target_len = target.shape[0]
target_vocab_size = len(english.vocab)
outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
hidden, cell = self.encoder(source)
# Grab the first input to the Decoder which will be <SOS> token
x = target[0]
for t in range(1, target_len):
# Use previous hidden, cell as context from encoder at start
output, hidden, cell = self.decoder(x, hidden, cell)
# Store next output prediction
outputs[t] = output
# Get the best word the Decoder predicted (index in the vocabulary)
best_guess = output.argmax(1)
# With probability of teacher_force_ratio we take the actual next word
# otherwise we take the word that the Decoder predicted it to be.
# Teacher Forcing is used so that the model gets used to seeing
# similar inputs at training and testing time, if teacher forcing is 1
# then inputs at test time might be completely different than what the
# network is used to. This was a long comment.
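            # e.g. with teacher_force_ratio=0.5, roughly half of the decoder inputs
            # come from the ground truth and half from the model's own predictions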
x = target[t] if random.random() < teacher_force_ratio else best_guess
return outputs
### We're ready to define everything we need for training our Seq2Seq model ###
# Training hyperparameters
num_epochs = 100
learning_rate = 0.001
batch_size = 64
# Model hyperparameters
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024 # Needs to be the same for both RNN's
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5
# Tensorboard to get nice loss plot
writer = SummaryWriter(f"runs/loss_plot")
step = 0
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
(train_data, valid_data, test_data),
batch_size=batch_size,
sort_within_batch=True,
sort_key=lambda x: len(x.src),
device=device,
)
encoder_net = Encoder(
input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)
decoder_net = Decoder(
input_size_decoder,
decoder_embedding_size,
hidden_size,
output_size,
num_layers,
dec_dropout,
).to(device)
model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
if load_model:
load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."
for epoch in range(num_epochs):
print(f"[Epoch {epoch} / {num_epochs}]")
checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
save_checkpoint(checkpoint)
model.eval()
translated_sentence = translate_sentence(
model, sentence, german, english, device, max_length=50
)
print(f"Translated example sentence: \n {translated_sentence}")
model.train()
for batch_idx, batch in enumerate(train_iterator):
# Get input and targets and get to cuda
inp_data = batch.src.to(device)
target = batch.trg.to(device)
# Forward prop
output = model(inp_data, target)
# Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
# doesn't take input in that form. For example if we have MNIST we want to have
# output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way: we have output_words * batch_size predictions that we want to send into
        # our cost function, so we need to do some reshaping. While we're at it,
        # let's also remove the start token.
output = output[1:].reshape(-1, output.shape[2])
target = target[1:].reshape(-1)
optimizer.zero_grad()
loss = criterion(output, target)
# Back prop
loss.backward()
# Clip to avoid exploding gradient issues, makes sure grads are
# within a healthy range
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
# Gradient descent step
optimizer.step()
# Plot to tensorboard
writer.add_scalar("Training loss", loss, global_step=step)
step += 1
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score*100:.2f}")

View File

@@ -0,0 +1,84 @@
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys
def translate_sentence(model, sentence, german, english, device, max_length=50):
# print(sentence)
# sys.exit()
# Load german tokenizer
spacy_ger = spacy.load("de")
    # Tokenize with spacy and lowercase everything (to match our vocab)
if type(sentence) == str:
tokens = [token.text.lower() for token in spacy_ger(sentence)]
else:
tokens = [token.lower() for token in sentence]
# print(tokens)
# sys.exit()
# Add <SOS> and <EOS> in beginning and end respectively
tokens.insert(0, german.init_token)
tokens.append(german.eos_token)
# Go through each german token and convert to an index
text_to_indices = [german.vocab.stoi[token] for token in tokens]
# Convert to Tensor
sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)
# Build encoder hidden, cell state
with torch.no_grad():
hidden, cell = model.encoder(sentence_tensor)
outputs = [english.vocab.stoi["<sos>"]]
for _ in range(max_length):
previous_word = torch.LongTensor([outputs[-1]]).to(device)
with torch.no_grad():
output, hidden, cell = model.decoder(previous_word, hidden, cell)
best_guess = output.argmax(1).item()
outputs.append(best_guess)
# Model predicts it's the end of the sentence
if output.argmax(1).item() == english.vocab.stoi["<eos>"]:
break
translated_sentence = [english.vocab.itos[idx] for idx in outputs]
# remove start token
return translated_sentence[1:]
def bleu(data, model, german, english, device):
targets = []
outputs = []
for example in data:
src = vars(example)["src"]
trg = vars(example)["trg"]
prediction = translate_sentence(model, src, german, english, device)
prediction = prediction[:-1] # remove <eos> token
targets.append([trg])
outputs.append(prediction)
return bleu_score(outputs, targets)
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, model, optimizer):
print("=> Loading checkpoint")
model.load_state_dict(checkpoint["state_dict"])
optimizer.load_state_dict(checkpoint["optimizer"])

View File

@@ -0,0 +1,279 @@
import random
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import spacy
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
"""
To install spacy languages do:
python -m spacy download en
python -m spacy download de
"""
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")
def tokenize_ger(text):
return [tok.text for tok in spacy_ger.tokenizer(text)]
def tokenize_eng(text):
return [tok.text for tok in spacy_eng.tokenizer(text)]
german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")
english = Field(
tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>"
)
train_data, valid_data, test_data = Multi30k.splits(
exts=(".de", ".en"), fields=(german, english)
)
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)
class Encoder(nn.Module):
def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
super(Encoder, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.embedding = nn.Embedding(input_size, embedding_size)
self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, bidirectional=True)
self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)
self.dropout = nn.Dropout(p)
def forward(self, x):
# x: (seq_length, N) where N is batch size
embedding = self.dropout(self.embedding(x))
# embedding shape: (seq_length, N, embedding_size)
encoder_states, (hidden, cell) = self.rnn(embedding)
# outputs shape: (seq_length, N, hidden_size)
# Use forward, backward cells and hidden through a linear layer
# so that it can be input to the decoder which is not bidirectional
# Also using index slicing ([idx:idx+1]) to keep the dimension
hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
cell = self.fc_cell(torch.cat((cell[0:1], cell[1:2]), dim=2))
return encoder_states, hidden, cell
class Decoder(nn.Module):
def __init__(
self, input_size, embedding_size, hidden_size, output_size, num_layers, p
):
super(Decoder, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.embedding = nn.Embedding(input_size, embedding_size)
self.rnn = nn.LSTM(hidden_size * 2 + embedding_size, hidden_size, num_layers)
self.energy = nn.Linear(hidden_size * 3, 1)
self.fc = nn.Linear(hidden_size, output_size)
self.dropout = nn.Dropout(p)
self.softmax = nn.Softmax(dim=0)
self.relu = nn.ReLU()
def forward(self, x, encoder_states, hidden, cell):
x = x.unsqueeze(0)
# x: (1, N) where N is the batch size
embedding = self.dropout(self.embedding(x))
# embedding shape: (1, N, embedding_size)
sequence_length = encoder_states.shape[0]
h_reshaped = hidden.repeat(sequence_length, 1, 1)
# h_reshaped: (seq_length, N, hidden_size*2)
energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))
# energy: (seq_length, N, 1)
attention = self.softmax(energy)
        # attention: (seq_length, N, 1), i.e. "snk" in the einsum below
        # encoder_states: (seq_length, N, hidden_size*2), i.e. "snl"
        # we want context_vector: (1, N, hidden_size*2), i.e. "knl"
context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)
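        # Hedged note (not in the original): the einsum above is the attention-weighted
        # sum over the sequence dimension; an equivalent explicit formulation would be
        #   torch.bmm(attention.permute(1, 2, 0), encoder_states.permute(1, 0, 2)).permute(1, 0, 2)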
rnn_input = torch.cat((context_vector, embedding), dim=2)
# rnn_input: (1, N, hidden_size*2 + embedding_size)
outputs, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
# outputs shape: (1, N, hidden_size)
predictions = self.fc(outputs).squeeze(0)
# predictions: (N, hidden_size)
return predictions, hidden, cell
class Seq2Seq(nn.Module):
def __init__(self, encoder, decoder):
super(Seq2Seq, self).__init__()
self.encoder = encoder
self.decoder = decoder
def forward(self, source, target, teacher_force_ratio=0.5):
batch_size = source.shape[1]
target_len = target.shape[0]
target_vocab_size = len(english.vocab)
outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
encoder_states, hidden, cell = self.encoder(source)
# First input will be <SOS> token
x = target[0]
for t in range(1, target_len):
# At every time step use encoder_states and update hidden, cell
output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
# Store prediction for current time step
outputs[t] = output
# Get the best word the Decoder predicted (index in the vocabulary)
best_guess = output.argmax(1)
# With probability of teacher_force_ratio we take the actual next word
# otherwise we take the word that the Decoder predicted it to be.
# Teacher Forcing is used so that the model gets used to seeing
# similar inputs at training and testing time, if teacher forcing is 1
# then inputs at test time might be completely different than what the
# network is used to. This was a long comment.
x = target[t] if random.random() < teacher_force_ratio else best_guess
return outputs
### We're ready to define everything we need for training our Seq2Seq model ###
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = False
save_model = True
# Training hyperparameters
num_epochs = 100
learning_rate = 3e-4
batch_size = 32
# Model hyperparameters
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 1
enc_dropout = 0.0
dec_dropout = 0.0
# Tensorboard to get nice loss plot
writer = SummaryWriter(f"runs/loss_plot")
step = 0
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
(train_data, valid_data, test_data),
batch_size=batch_size,
sort_within_batch=True,
sort_key=lambda x: len(x.src),
device=device,
)
encoder_net = Encoder(
input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)
decoder_net = Decoder(
input_size_decoder,
decoder_embedding_size,
hidden_size,
output_size,
num_layers,
dec_dropout,
).to(device)
model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
if load_model:
load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
sentence = (
    "ein boot mit mehreren männern darauf wird von einem großen "
    "pferdegespann ans ufer gezogen."
)
for epoch in range(num_epochs):
print(f"[Epoch {epoch} / {num_epochs}]")
if save_model:
checkpoint = {
"state_dict": model.state_dict(),
"optimizer": optimizer.state_dict(),
}
save_checkpoint(checkpoint)
model.eval()
translated_sentence = translate_sentence(
model, sentence, german, english, device, max_length=50
)
print(f"Translated example sentence: \n {translated_sentence}")
model.train()
for batch_idx, batch in enumerate(train_iterator):
# Get input and targets and get to cuda
inp_data = batch.src.to(device)
target = batch.trg.to(device)
# Forward prop
output = model(inp_data, target)
# Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
# doesn't take input in that form. For example if we have MNIST we want to have
# output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way: we have output_words * batch_size predictions that we want to send into
        # our cost function, so we need to do some reshaping. While we're at it,
        # let's also remove the start token.
output = output[1:].reshape(-1, output.shape[2])
target = target[1:].reshape(-1)
optimizer.zero_grad()
loss = criterion(output, target)
# Back prop
loss.backward()
# Clip to avoid exploding gradient issues, makes sure grads are
# within a healthy range
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
# Gradient descent step
optimizer.step()
# Plot to tensorboard
writer.add_scalar("Training loss", loss, global_step=step)
step += 1
# running on entire test data takes a while
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score * 100:.2f}")

View File

@@ -0,0 +1,79 @@
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys
def translate_sentence(model, sentence, german, english, device, max_length=50):
# Load german tokenizer
spacy_ger = spacy.load("de")
    # Tokenize with spacy and lowercase everything (to match our vocab)
if type(sentence) == str:
tokens = [token.text.lower() for token in spacy_ger(sentence)]
else:
tokens = [token.lower() for token in sentence]
# Add <SOS> and <EOS> in beginning and end respectively
tokens.insert(0, german.init_token)
tokens.append(german.eos_token)
# Go through each german token and convert to an index
text_to_indices = [german.vocab.stoi[token] for token in tokens]
# Convert to Tensor
sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)
# Build encoder hidden, cell state
with torch.no_grad():
outputs_encoder, hiddens, cells = model.encoder(sentence_tensor)
outputs = [english.vocab.stoi["<sos>"]]
for _ in range(max_length):
previous_word = torch.LongTensor([outputs[-1]]).to(device)
with torch.no_grad():
output, hiddens, cells = model.decoder(
previous_word, outputs_encoder, hiddens, cells
)
best_guess = output.argmax(1).item()
outputs.append(best_guess)
# Model predicts it's the end of the sentence
if output.argmax(1).item() == english.vocab.stoi["<eos>"]:
break
translated_sentence = [english.vocab.itos[idx] for idx in outputs]
# remove start token
return translated_sentence[1:]
def bleu(data, model, german, english, device):
targets = []
outputs = []
for example in data:
src = vars(example)["src"]
trg = vars(example)["trg"]
prediction = translate_sentence(model, src, german, english, device)
prediction = prediction[:-1] # remove <eos> token
targets.append([trg])
outputs.append(prediction)
return bleu_score(outputs, targets)
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, model, optimizer):
print("=> Loading checkpoint")
model.load_state_dict(checkpoint["state_dict"])
optimizer.load_state_dict(checkpoint["optimizer"])

View File

@@ -0,0 +1,12 @@
### Image Captioning
Download the dataset used: https://www.kaggle.com/dataset/e1cd22253a9b23b073794872bf565648ddbe4f17e7fa9e74766ad3707141adeb
Then set images folder, captions.txt inside a folder Flickr8k.
train.py: For training the network
model.py: creating the encoderCNN, decoderRNN and hooking them together
get_loader.py: Loading the data, creating vocabulary
utils.py: Load model, save model, print a few test cases downloaded online
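Expected folder layout (based on the paths used in train.py and get_loader.py): flickr8k/images/ for the images and flickr8k/captions.txt for the annotation file.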

View File

@@ -0,0 +1,142 @@
import os # when loading file paths
import pandas as pd # for lookup in annotation file
import spacy # for tokenizer
import torch
from torch.nn.utils.rnn import pad_sequence # pad batch
from torch.utils.data import DataLoader, Dataset
from PIL import Image # Load img
import torchvision.transforms as transforms
# We want to convert text -> numerical values
# 1. We need a Vocabulary mapping each word to an index
# 2. We need to set up a Pytorch dataset to load the data
# 3. Set up padding for every batch (all examples in a batch should have
#    the same seq_len) and set up the dataloader
# Note that loading the image is very easy compared to the text!
# Download with: python -m spacy download en
spacy_eng = spacy.load("en")
class Vocabulary:
def __init__(self, freq_threshold):
self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
self.freq_threshold = freq_threshold
def __len__(self):
return len(self.itos)
@staticmethod
def tokenizer_eng(text):
return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
def build_vocabulary(self, sentence_list):
frequencies = {}
idx = 4
for sentence in sentence_list:
for word in self.tokenizer_eng(sentence):
if word not in frequencies:
frequencies[word] = 1
else:
frequencies[word] += 1
if frequencies[word] == self.freq_threshold:
self.stoi[word] = idx
self.itos[idx] = word
idx += 1
def numericalize(self, text):
tokenized_text = self.tokenizer_eng(text)
return [
self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
for token in tokenized_text
]
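# Hedged toy example (not part of the original file) of how Vocabulary behaves:
#   v = Vocabulary(freq_threshold=1)
#   v.build_vocabulary(["a dog runs", "a dog sleeps"])
#   v.numericalize("a dog flies")  -> [4, 5, 3]; the unseen word "flies" maps to <UNK> (index 3)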
class FlickrDataset(Dataset):
def __init__(self, root_dir, captions_file, transform=None, freq_threshold=5):
self.root_dir = root_dir
self.df = pd.read_csv(captions_file)
self.transform = transform
# Get img, caption columns
self.imgs = self.df["image"]
self.captions = self.df["caption"]
# Initialize vocabulary and build vocab
self.vocab = Vocabulary(freq_threshold)
self.vocab.build_vocabulary(self.captions.tolist())
def __len__(self):
return len(self.df)
def __getitem__(self, index):
caption = self.captions[index]
img_id = self.imgs[index]
img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")
if self.transform is not None:
img = self.transform(img)
numericalized_caption = [self.vocab.stoi["<SOS>"]]
numericalized_caption += self.vocab.numericalize(caption)
numericalized_caption.append(self.vocab.stoi["<EOS>"])
return img, torch.tensor(numericalized_caption)
class MyCollate:
def __init__(self, pad_idx):
self.pad_idx = pad_idx
def __call__(self, batch):
imgs = [item[0].unsqueeze(0) for item in batch]
imgs = torch.cat(imgs, dim=0)
targets = [item[1] for item in batch]
targets = pad_sequence(targets, batch_first=False, padding_value=self.pad_idx)
return imgs, targets
def get_loader(
root_folder,
annotation_file,
transform,
batch_size=32,
num_workers=8,
shuffle=True,
pin_memory=True,
):
dataset = FlickrDataset(root_folder, annotation_file, transform=transform)
pad_idx = dataset.vocab.stoi["<PAD>"]
loader = DataLoader(
dataset=dataset,
batch_size=batch_size,
num_workers=num_workers,
shuffle=shuffle,
pin_memory=pin_memory,
collate_fn=MyCollate(pad_idx=pad_idx),
)
return loader, dataset
if __name__ == "__main__":
transform = transforms.Compose(
[transforms.Resize((224, 224)), transforms.ToTensor(),]
)
loader, dataset = get_loader(
"flickr8k/images/", "flickr8k/captions.txt", transform=transform
)
for idx, (imgs, captions) in enumerate(loader):
print(imgs.shape)
print(captions.shape)

View File

@@ -0,0 +1,66 @@
import torch
import torch.nn as nn
import statistics
import torchvision.models as models
class EncoderCNN(nn.Module):
def __init__(self, embed_size, train_CNN=False):
super(EncoderCNN, self).__init__()
self.train_CNN = train_CNN
self.inception = models.inception_v3(pretrained=True, aux_logits=False)
self.inception.fc = nn.Linear(self.inception.fc.in_features, embed_size)
self.relu = nn.ReLU()
self.times = []
self.dropout = nn.Dropout(0.5)
def forward(self, images):
features = self.inception(images)
return self.dropout(self.relu(features))
class DecoderRNN(nn.Module):
def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
super(DecoderRNN, self).__init__()
self.embed = nn.Embedding(vocab_size, embed_size)
self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
self.linear = nn.Linear(hidden_size, vocab_size)
self.dropout = nn.Dropout(0.5)
def forward(self, features, captions):
embeddings = self.dropout(self.embed(captions))
embeddings = torch.cat((features.unsqueeze(0), embeddings), dim=0)
hiddens, _ = self.lstm(embeddings)
outputs = self.linear(hiddens)
return outputs
class CNNtoRNN(nn.Module):
def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
super(CNNtoRNN, self).__init__()
self.encoderCNN = EncoderCNN(embed_size)
self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)
def forward(self, images, captions):
features = self.encoderCNN(images)
outputs = self.decoderRNN(features, captions)
return outputs
def caption_image(self, image, vocabulary, max_length=50):
result_caption = []
with torch.no_grad():
x = self.encoderCNN(image).unsqueeze(0)
states = None
for _ in range(max_length):
hiddens, states = self.decoderRNN.lstm(x, states)
output = self.decoderRNN.linear(hiddens.squeeze(0))
predicted = output.argmax(1)
result_caption.append(predicted.item())
x = self.decoderRNN.embed(predicted).unsqueeze(0)
if vocabulary.itos[predicted.item()] == "<EOS>":
break
return [vocabulary.itos[idx] for idx in result_caption]


View File

@@ -0,0 +1,96 @@
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
from utils import save_checkpoint, load_checkpoint, print_examples
from get_loader import get_loader
from model import CNNtoRNN
def train():
transform = transforms.Compose(
[
transforms.Resize((356, 356)),
transforms.RandomCrop((299, 299)),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
)
train_loader, dataset = get_loader(
root_folder="flickr8k/images",
annotation_file="flickr8k/captions.txt",
transform=transform,
num_workers=2,
)
torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = False
save_model = False
train_CNN = False
# Hyperparameters
embed_size = 256
hidden_size = 256
vocab_size = len(dataset.vocab)
num_layers = 1
learning_rate = 3e-4
num_epochs = 100
# for tensorboard
writer = SummaryWriter("runs/flickr")
step = 0
# initialize model, loss etc
model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi["<PAD>"])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # Only finetune the final fc layer of the CNN; freeze the rest unless train_CNN=True
for name, param in model.encoderCNN.inception.named_parameters():
if "fc.weight" in name or "fc.bias" in name:
param.requires_grad = True
else:
param.requires_grad = train_CNN
if load_model:
step = load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
model.train()
for epoch in range(num_epochs):
# Uncomment the line below to see a couple of test cases
# print_examples(model, device, dataset)
if save_model:
checkpoint = {
"state_dict": model.state_dict(),
"optimizer": optimizer.state_dict(),
"step": step,
}
save_checkpoint(checkpoint)
for idx, (imgs, captions) in tqdm(
enumerate(train_loader), total=len(train_loader), leave=False
):
imgs = imgs.to(device)
captions = captions.to(device)
outputs = model(imgs, captions[:-1])
loss = criterion(
outputs.reshape(-1, outputs.shape[2]), captions.reshape(-1)
)
writer.add_scalar("Training loss", loss.item(), global_step=step)
step += 1
optimizer.zero_grad()
            loss.backward()
optimizer.step()
if __name__ == "__main__":
train()

View File

@@ -0,0 +1,69 @@
import torch
import torchvision.transforms as transforms
from PIL import Image
def print_examples(model, device, dataset):
transform = transforms.Compose(
[
transforms.Resize((299, 299)),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
)
model.eval()
test_img1 = transform(Image.open("test_examples/dog.jpg").convert("RGB")).unsqueeze(
0
)
print("Example 1 CORRECT: Dog on a beach by the ocean")
print(
"Example 1 OUTPUT: "
+ " ".join(model.caption_image(test_img1.to(device), dataset.vocab))
)
test_img2 = transform(
Image.open("test_examples/child.jpg").convert("RGB")
).unsqueeze(0)
print("Example 2 CORRECT: Child holding red frisbee outdoors")
print(
"Example 2 OUTPUT: "
+ " ".join(model.caption_image(test_img2.to(device), dataset.vocab))
)
test_img3 = transform(Image.open("test_examples/bus.png").convert("RGB")).unsqueeze(
0
)
print("Example 3 CORRECT: Bus driving by parked cars")
print(
"Example 3 OUTPUT: "
+ " ".join(model.caption_image(test_img3.to(device), dataset.vocab))
)
test_img4 = transform(
Image.open("test_examples/boat.png").convert("RGB")
).unsqueeze(0)
print("Example 4 CORRECT: A small boat in the ocean")
print(
"Example 4 OUTPUT: "
+ " ".join(model.caption_image(test_img4.to(device), dataset.vocab))
)
test_img5 = transform(
Image.open("test_examples/horse.png").convert("RGB")
).unsqueeze(0)
print("Example 5 CORRECT: A cowboy riding a horse in the desert")
print(
"Example 5 OUTPUT: "
+ " ".join(model.caption_image(test_img5.to(device), dataset.vocab))
)
model.train()
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, model, optimizer):
print("=> Loading checkpoint")
model.load_state_dict(checkpoint["state_dict"])
optimizer.load_state_dict(checkpoint["optimizer"])
step = checkpoint["step"]
return step


View File

@@ -0,0 +1,112 @@
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.utils import save_image
class VGG(nn.Module):
def __init__(self):
super(VGG, self).__init__()
        # The first number x in convx_y increases by 1 after each maxpool, and the
        # second number y counts the conv layers between maxpools. The module indices
        # (0, 5, 10, 19, 28) then correspond to conv1_1, conv2_1, conv3_1, conv4_1,
        # conv5_1 mentioned in the NST paper
        self.chosen_features = ["0", "5", "10", "19", "28"]
        # We don't need to run anything further than conv5_1 (the 28th module in vgg)
        # since, remember, we don't actually care about the output of VGG: the only
        # thing that is modified is the generated image (i.e. the input).
self.model = models.vgg19(pretrained=True).features[:29]
def forward(self, x):
# Store relevant features
features = []
# Go through each layer in model, if the layer is in the chosen_features,
# store it in features. At the end we'll just return all the activations
# for the specific layers we have in chosen_features
for layer_num, layer in enumerate(self.model):
x = layer(x)
if str(layer_num) in self.chosen_features:
features.append(x)
return features
def load_image(image_name):
image = Image.open(image_name)
image = loader(image).unsqueeze(0)
return image.to(device)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
imsize = 356
# Here we may want to use the normalization constants of the original VGG network
# (to get values similar to what the net was originally trained on), but I found it
# didn't matter too much so I didn't end up using it. If you use it, make sure to
# de-normalize at the end so the saved images don't look weird.
loader = transforms.Compose(
[
transforms.Resize((imsize, imsize)),
transforms.ToTensor(),
# transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
)
original_img = load_image("annahathaway.png")
style_img = load_image("style.jpg")
# Initialize generated as white noise or as a clone of the original image.
# Clone seemed to work better for me.
# generated = torch.randn(original_img.data.shape, device=device, requires_grad=True)
generated = original_img.clone().requires_grad_(True)
model = VGG().to(device).eval()
# Hyperparameters
total_steps = 6000
learning_rate = 0.001
alpha = 1
beta = 0.01
optimizer = optim.Adam([generated], lr=learning_rate)
for step in range(total_steps):
# Obtain the convolution features in specifically chosen layers
generated_features = model(generated)
original_img_features = model(original_img)
style_features = model(style_img)
# Loss is 0 initially
style_loss = original_loss = 0
# iterate through all the features for the chosen layers
for gen_feature, orig_feature, style_feature in zip(
generated_features, original_img_features, style_features
):
# batch_size will just be 1
batch_size, channel, height, width = gen_feature.shape
original_loss += torch.mean((gen_feature - orig_feature) ** 2)
# Compute Gram Matrix of generated
G = gen_feature.view(channel, height * width).mm(
gen_feature.view(channel, height * width).t()
)
# Compute Gram Matrix of Style
A = style_feature.view(channel, height * width).mm(
style_feature.view(channel, height * width).t()
)
style_loss += torch.mean((G - A) ** 2)
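        # Note (hedged, not in the original): G and A are (channel x channel) Gram
        # matrices of feature correlations; matching them matches the style statistics
        # of the generated image to those of the style image.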
total_loss = alpha * original_loss + beta * style_loss
optimizer.zero_grad()
total_loss.backward()
optimizer.step()
if step % 200 == 0:
print(total_loss)
save_image(generated, "generated.png")

