Initial commit

Aladdin Persson
2021-01-30 21:49:15 +01:00
commit 65b8c80495
432 changed files with 1290844 additions and 0 deletions

(19 binary image files added; previews not shown. Sizes range from 13 KiB to 53 KiB.)

@@ -0,0 +1,31 @@
import cv2
import albumentations as A
import numpy as np
from utils import plot_examples
from PIL import Image
image = Image.open("images/elon.jpeg")
transform = A.Compose(
[
A.Resize(width=1920, height=1080),
A.RandomCrop(width=1280, height=720),
A.Rotate(limit=40, p=0.9, border_mode=cv2.BORDER_CONSTANT),
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.1),
A.RGBShift(r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.9),
A.OneOf([
A.Blur(blur_limit=3, p=0.5),
A.ColorJitter(p=0.5),
], p=1.0),
]
)
images_list = [image]
image = np.array(image)
for i in range(15):
augmentations = transform(image=image)
augmented_img = augmentations["image"]
images_list.append(augmented_img)
plot_examples(images_list)

@@ -0,0 +1,41 @@
import cv2
import albumentations as A
import numpy as np
from utils import plot_examples
from PIL import Image
image = cv2.imread("images/cat.jpg")
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
bboxes = [[13, 170, 224, 410]]
# Box format here is pascal_voc (x_min, y_min, x_max, y_max); yolo and coco are also supported
transform = A.Compose(
[
A.Resize(width=1920, height=1080),
A.RandomCrop(width=1280, height=720),
A.Rotate(limit=40, p=0.9, border_mode=cv2.BORDER_CONSTANT),
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.1),
A.RGBShift(r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.9),
A.OneOf([
A.Blur(blur_limit=3, p=0.5),
A.ColorJitter(p=0.5),
], p=1.0),
], bbox_params=A.BboxParams(format="pascal_voc", min_area=2048,
min_visibility=0.3, label_fields=[])
)
images_list = [image]
saved_bboxes = [bboxes[0]]
for i in range(15):
augmentations = transform(image=image, bboxes=bboxes)
augmented_img = augmentations["image"]
if len(augmentations["bboxes"]) == 0:
continue
images_list.append(augmented_img)
saved_bboxes.append(augmentations["bboxes"][0])
plot_examples(images_list, saved_bboxes)
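As a hedged aside (not part of this commit): the comment above notes that yolo and coco box formats are also supported. A hypothetical helper for converting the pascal_voc box to yolo coordinates, which are normalized by the image size, might look like this:

def pascal_voc_to_yolo(box, img_w, img_h):
    # yolo boxes are (x_center, y_center, width, height), each normalized to [0, 1]
    x_min, y_min, x_max, y_max = box
    return [
        (x_min + x_max) / 2 / img_w,
        (y_min + y_max) / 2 / img_h,
        (x_max - x_min) / img_w,
        (y_max - y_min) / img_h,
    ]

# With boxes in that layout, the pipeline would pass format="yolo" to A.BboxParams instead.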

@@ -0,0 +1,62 @@
import torch
import numpy as np
import cv2
from PIL import Image
import torch.nn as nn
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torch.utils.data import Dataset
import os
class ImageFolder(Dataset):
def __init__(self, root_dir, transform=None):
super(ImageFolder, self).__init__()
self.data = []
self.root_dir = root_dir
self.transform = transform
self.class_names = os.listdir(root_dir)
for index, name in enumerate(self.class_names):
files = os.listdir(os.path.join(root_dir, name))
self.data += list(zip(files, [index]*len(files)))
def __len__(self):
return len(self.data)
def __getitem__(self, index):
img_file, label = self.data[index]
root_and_dir = os.path.join(self.root_dir, self.class_names[label])
image = np.array(Image.open(os.path.join(root_and_dir, img_file)))
if self.transform is not None:
augmentations = self.transform(image=image)
image = augmentations["image"]
return image, label
transform = A.Compose(
[
A.Resize(width=1920, height=1080),
A.RandomCrop(width=1280, height=720),
A.Rotate(limit=40, p=0.9, border_mode=cv2.BORDER_CONSTANT),
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.1),
A.RGBShift(r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.9),
A.OneOf([
A.Blur(blur_limit=3, p=0.5),
A.ColorJitter(p=0.5),
], p=1.0),
A.Normalize(
mean=[0, 0, 0],
std=[1, 1, 1],
max_pixel_value=255,
),
ToTensorV2(),
]
)
dataset = ImageFolder(root_dir="cat_dogs", transform=transform)
for x,y in dataset:
print(x.shape)
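A short usage sketch (not part of this commit, assuming the same cat_dogs folder layout): the dataset can be wrapped in a standard DataLoader for batched training.

from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=8, shuffle=True)
for x, y in loader:
    print(x.shape)  # (8, 3, 720, 1280): RandomCrop gives 720x1280, ToTensorV2 makes it CxHxW
    break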

(4 binary image files added; previews not shown. Sizes range from 29 KiB to 274 KiB.)

@@ -0,0 +1,37 @@
import cv2
import albumentations as A
import numpy as np
from utils import plot_examples
from PIL import Image
image = Image.open("images/elon.jpeg")
mask = Image.open("images/mask.jpeg")
mask2 = Image.open("images/second_mask.jpeg")
transform = A.Compose(
[
A.Resize(width=1920, height=1080),
A.RandomCrop(width=1280, height=720),
A.Rotate(limit=40, p=0.9, border_mode=cv2.BORDER_CONSTANT),
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.1),
A.RGBShift(r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.9),
A.OneOf([
A.Blur(blur_limit=3, p=0.5),
A.ColorJitter(p=0.5),
], p=1.0),
]
)
images_list = [image]
image = np.array(image)
mask = np.array(mask)  # np.asarray(mask) would also work here
mask2 = np.array(mask2)
for i in range(4):
augmentations = transform(image=image, masks=[mask, mask2])
augmented_img = augmentations["image"]
augmented_masks = augmentations["masks"]
images_list.append(augmented_img)
images_list.append(augmented_masks[0])
images_list.append(augmented_masks[1])
plot_examples(images_list)

@@ -0,0 +1,36 @@
import random
import cv2
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import numpy as np
import albumentations as A
def visualize(image):
plt.figure(figsize=(10, 10))
plt.axis('off')
plt.imshow(image)
plt.show()
def plot_examples(images, bboxes=None):
fig = plt.figure(figsize=(15, 15))
columns = 4
rows = 5
for i in range(1, len(images)):
if bboxes is not None:
img = visualize_bbox(images[i - 1], bboxes[i - 1], class_name="Elon")
else:
img = images[i-1]
fig.add_subplot(rows, columns, i)
plt.imshow(img)
plt.show()
# From https://albumentations.ai/docs/examples/example_bboxes/
def visualize_bbox(img, bbox, class_name, color=(255, 0, 0), thickness=5):
"""Visualizes a single bounding box on the image"""
x_min, y_min, x_max, y_max = map(int, bbox)
cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, thickness)
return img

@@ -0,0 +1,11 @@
Animal,Label
cat.0.jpg,0
cat.1.jpg,0
cat.2.jpg,0
cat.3.jpg,0
cat.4.jpg,0
cat.5.jpg,0
cat.6.jpg,0
cat.7.jpg,0
dog.0.jpg,1
dog.1.jpg,1

(10 binary image files added; previews not shown. Sizes range from 13 KiB to 29 KiB.)

@@ -0,0 +1,131 @@
# Imports
import os
from typing import Union
import torch.nn.functional as F # All functions that don't have any parameters
import pandas as pd
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torchvision
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# from skimage import io  # not needed here: the features are read directly from the CSV via pandas
from torch.utils.data import (
Dataset,
DataLoader,
) # Gives easier dataset management and creates mini batches
# Create Fully Connected Network
class NN(nn.Module):
def __init__(self, input_size, num_classes):
super(NN, self).__init__()
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, num_classes)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
class SoloDataset(Dataset):
def __init__(self, csv_file, root_dir, transform=None):
self.annotations = pd.read_csv(csv_file)
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.annotations)
def __getitem__(self, index):
x_data = self.annotations.iloc[index, 0:11]
x_data = torch.tensor(x_data)
y_label = torch.tensor(int(self.annotations.iloc[index, 11]))
return (x_data.float(), y_label)
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
num_classes = 26
learning_rate = 1e-3
batch_size = 5
num_epochs = 30
input_size = 11
# Load Data
dataset = SoloDataset(
csv_file="power.csv", root_dir="test123", transform=transforms.ToTensor()
)
train_set, test_set = torch.utils.data.random_split(dataset, [2900, 57])
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=True)
# Model
model = NN(input_size=input_size, num_classes=num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
print(len(train_set))
print(len(test_set))
# Train Network
for epoch in range(num_epochs):
losses = []
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
losses.append(loss.item())
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
print(f"Cost at epoch {epoch} is {sum(losses) / len(losses)}")
# Check accuracy on training to see how good our model is
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct) / float(num_samples) * 100:.2f}"
)
model.train()
print("Checking accuracy on Training Set")
check_accuracy(train_loader, model)
print("Checking accuracy on Test Set")
check_accuracy(test_loader, model)

@@ -0,0 +1,130 @@
"""
Example of how to create a custom dataset in PyTorch. In this case
we have images of cats and dogs in a separate folder and a csv
file containing the name of the jpg file as well as the target
label (0 for cat, 1 for dog).
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-03 Initial coding
"""
# Imports
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torchvision.transforms as transforms # Transformations we can perform on our dataset
import torchvision
import os
import pandas as pd
from skimage import io
from torch.utils.data import (
Dataset,
DataLoader,
) # Gives easier dataset management and creates mini batches
class CatsAndDogsDataset(Dataset):
def __init__(self, csv_file, root_dir, transform=None):
self.annotations = pd.read_csv(csv_file)
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.annotations)
def __getitem__(self, index):
img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
image = io.imread(img_path)
y_label = torch.tensor(int(self.annotations.iloc[index, 1]))
if self.transform:
image = self.transform(image)
return (image, y_label)
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
in_channel = 3
num_classes = 2
learning_rate = 1e-3
batch_size = 32
num_epochs = 10
# Load Data
dataset = CatsAndDogsDataset(
csv_file="cats_dogs.csv",
root_dir="cats_dogs_resized",
transform=transforms.ToTensor(),
)
# Dataset is actually a lot larger ~25k images, just took out 10 pictures
# to upload to Github. It's enough to understand the structure and scale
# if you got more images.
train_set, test_set = torch.utils.data.random_split(dataset, [5, 5])
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=True)
# Model
model = torchvision.models.googlenet(pretrained=True)
model.to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
losses = []
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
losses.append(loss.item())
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
print(f"Cost at epoch {epoch} is {sum(losses)/len(losses)}")
# Check accuracy on training to see how good our model is
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
print("Checking accuracy on Training Set")
check_accuracy(train_loader, model)
print("Checking accuracy on Test Set")
check_accuracy(test_loader, model)

File diff suppressed because it is too large.

@@ -0,0 +1,142 @@
import os # when loading file paths
import pandas as pd # for lookup in annotation file
import spacy # for tokenizer
import torch
from torch.nn.utils.rnn import pad_sequence # pad batch
from torch.utils.data import DataLoader, Dataset
from PIL import Image # Load img
import torchvision.transforms as transforms
# We want to convert text -> numerical values
# 1. We need a Vocabulary mapping each word to an index
# 2. We need to setup a Pytorch dataset to load the data
# 3. Set up padding of every batch (all examples should be
#    of the same seq_len) and set up the dataloader
# Note that loading the image is very easy compared to the text!
# Download with: python -m spacy download en
spacy_eng = spacy.load("en")
class Vocabulary:
def __init__(self, freq_threshold):
self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
self.freq_threshold = freq_threshold
def __len__(self):
return len(self.itos)
@staticmethod
def tokenizer_eng(text):
return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
def build_vocabulary(self, sentence_list):
frequencies = {}
idx = 4
for sentence in sentence_list:
for word in self.tokenizer_eng(sentence):
if word not in frequencies:
frequencies[word] = 1
else:
frequencies[word] += 1
if frequencies[word] == self.freq_threshold:
self.stoi[word] = idx
self.itos[idx] = word
idx += 1
def numericalize(self, text):
tokenized_text = self.tokenizer_eng(text)
return [
self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
for token in tokenized_text
]
class FlickrDataset(Dataset):
def __init__(self, root_dir, captions_file, transform=None, freq_threshold=5):
self.root_dir = root_dir
self.df = pd.read_csv(captions_file)
self.transform = transform
# Get img, caption columns
self.imgs = self.df["image"]
self.captions = self.df["caption"]
# Initialize vocabulary and build vocab
self.vocab = Vocabulary(freq_threshold)
self.vocab.build_vocabulary(self.captions.tolist())
def __len__(self):
return len(self.df)
def __getitem__(self, index):
caption = self.captions[index]
img_id = self.imgs[index]
img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")
if self.transform is not None:
img = self.transform(img)
numericalized_caption = [self.vocab.stoi["<SOS>"]]
numericalized_caption += self.vocab.numericalize(caption)
numericalized_caption.append(self.vocab.stoi["<EOS>"])
return img, torch.tensor(numericalized_caption)
class MyCollate:
def __init__(self, pad_idx):
self.pad_idx = pad_idx
def __call__(self, batch):
imgs = [item[0].unsqueeze(0) for item in batch]
imgs = torch.cat(imgs, dim=0)
targets = [item[1] for item in batch]
targets = pad_sequence(targets, batch_first=False, padding_value=self.pad_idx)
return imgs, targets
def get_loader(
root_folder,
annotation_file,
transform,
batch_size=32,
num_workers=8,
shuffle=True,
pin_memory=True,
):
dataset = FlickrDataset(root_folder, annotation_file, transform=transform)
pad_idx = dataset.vocab.stoi["<PAD>"]
loader = DataLoader(
dataset=dataset,
batch_size=batch_size,
num_workers=num_workers,
shuffle=shuffle,
pin_memory=pin_memory,
collate_fn=MyCollate(pad_idx=pad_idx),
)
return loader, dataset
if __name__ == "__main__":
transform = transforms.Compose(
[transforms.Resize((224, 224)), transforms.ToTensor(),]
)
loader, dataset = get_loader(
"flickr8k/images/", "flickr8k/captions.txt", transform=transform
)
for idx, (imgs, captions) in enumerate(loader):
print(imgs.shape)
print(captions.shape)
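A minimal sketch (not in this commit) of what the pad_sequence call inside MyCollate does: captions of different lengths in one batch are stacked into a single (max_len, batch_size) tensor, with shorter captions filled by the <PAD> index.

import torch
from torch.nn.utils.rnn import pad_sequence

a = torch.tensor([1, 5, 6, 2])  # <SOS> w1 w2 <EOS>
b = torch.tensor([1, 7, 2])     # <SOS> w1 <EOS>
padded = pad_sequence([a, b], batch_first=False, padding_value=0)
print(padded.shape)  # torch.Size([4, 2]); the second column is padded with <PAD> = 0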

@@ -0,0 +1,125 @@
"""
Example code of a simple bidirectional LSTM on the MNIST dataset.
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-05-09 Initial coding
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
input_size = 28
sequence_length = 28
num_layers = 2
hidden_size = 256
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 2
# Create a bidirectional LSTM
class BRNN(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(BRNN, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.lstm = nn.LSTM(
input_size, hidden_size, num_layers, batch_first=True, bidirectional=True
)
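        # bidirectional=True concatenates the forward and backward hidden states,
        # which is why the linear layer below takes hidden_size * 2 input features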
self.fc = nn.Linear(hidden_size * 2, num_classes)
def forward(self, x):
h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
out, _ = self.lstm(x, (h0, c0))
out = self.fc(out[:, -1, :])
return out
# Load Data
train_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
test_dataset = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
# Initialize network
model = BRNN(input_size, hidden_size, num_layers, num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device).squeeze(1)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
if loader.dataset.train:
print("Checking accuracy on training data")
else:
print("Checking accuracy on test data")
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device).squeeze(1)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy \
{float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
check_accuracy(train_loader, model)
check_accuracy(test_loader, model)

@@ -0,0 +1,69 @@
"""
Example code of how to initialize weights for a simple CNN network.
Video explanation: https://youtu.be/xWQ-p_o0Uik
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-10 Initial coding
"""
# Imports
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.nn.functional as F # All functions that don't have any parameters
class CNN(nn.Module):
def __init__(self, in_channels, num_classes):
super(CNN, self).__init__()
self.conv1 = nn.Conv2d(
in_channels=in_channels,
out_channels=6,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
)
self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
self.conv2 = nn.Conv2d(
in_channels=6,
out_channels=16,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
)
self.fc1 = nn.Linear(16 * 7 * 7, num_classes)
self.initialize_weights()
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(x)
x = F.relu(self.conv2(x))
x = self.pool(x)
x = x.reshape(x.shape[0], -1)
x = self.fc1(x)
return x
def initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_uniform_(m.weight)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
nn.init.kaiming_uniform_(m.weight)
nn.init.constant_(m.bias, 0)
if __name__ == "__main__":
model = CNN(in_channels=3, num_classes=10)
for param in model.parameters():
print(param)
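An equivalent pattern (a sketch using the same Kaiming scheme, not part of this commit) applies a standalone function to every submodule via nn.Module.apply:

def init_weights(m):
    # same Kaiming-uniform scheme as initialize_weights above
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_uniform_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

model = CNN(in_channels=3, num_classes=10)
model.apply(init_weights)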

@@ -0,0 +1,54 @@
"""
Small code example of how to save and load a checkpoint of a model.
This example doesn't perform any training, so it would be quite useless.
In practice you would save the model as you train, and then load it before
continuing training at another point.
Video explanation of code & how to save and load model: https://youtu.be/g6kQl_EFn84
Got any questions leave a comment on youtube :)
Coded by Aladdin Persson <aladdin dot person at hotmail dot com>
- 2020-04-07 Initial programming
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, model, optimizer):
print("=> Loading checkpoint")
model.load_state_dict(checkpoint["state_dict"])
optimizer.load_state_dict(checkpoint["optimizer"])
def main():
# Initialize network
model = torchvision.models.vgg16(pretrained=False)
optimizer = optim.Adam(model.parameters())
checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
# Try save checkpoint
save_checkpoint(checkpoint)
# Try load checkpoint
load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
if __name__ == "__main__":
main()
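As the docstring notes, the checkpoint would normally be written during training. A hedged sketch (assuming the model and optimizer created in main(); the training step itself is elided):

for epoch in range(10):
    # ... forward pass, loss.backward() and optimizer.step() would run here ...
    if epoch % 3 == 0:  # save every third epoch, an arbitrary choice
        checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
        save_checkpoint(checkpoint)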

@@ -0,0 +1,107 @@
"""
Example code of how to use a learning rate scheduler, in this
case with a (very) small and simple feedforward network training on the MNIST
dataset. In this case the ReduceLROnPlateau scheduler is used, but it can
easily be changed to any of the other schedulers available.
Video explanation: https://youtu.be/P31hB37g4Ak
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-10 Initial programming
"""
# Imports
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
num_classes = 10
learning_rate = 0.1
batch_size = 128
num_epochs = 100
# Define a very simple model
model = nn.Sequential(nn.Linear(784, 50), nn.ReLU(), nn.Linear(50, 10)).to(device)
# Load Data
train_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Define Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
optimizer, factor=0.1, patience=5, verbose=True
)
# Train Network
for epoch in range(1, num_epochs):
losses = []
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.reshape(data.shape[0], -1)
data = data.to(device=device)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
losses.append(loss.item())
# backward
loss.backward()
# gradient descent or adam step
# scheduler.step(loss)
optimizer.step()
optimizer.zero_grad()
mean_loss = sum(losses) / len(losses)
# After each epoch do scheduler.step, note in this scheduler we need to send
# in loss for that epoch!
scheduler.step(mean_loss)
print(f"Cost at epoch {epoch} is {mean_loss}")
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
check_accuracy(train_loader, model)
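As the docstring says, the scheduler can easily be swapped. A hedged sketch (not part of this commit) using StepLR, which decays the learning rate by gamma every step_size epochs and whose step() takes no metric:

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
for epoch in range(1, num_epochs):
    # ... same inner training loop as above ...
    scheduler.step()  # called once per epoch, no loss argument needed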

@@ -0,0 +1,99 @@
# Imports
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import DataLoader # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Simple CNN
class CNN(nn.Module):
def __init__(self, in_channels=1, num_classes=10):
super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=420, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
self.conv2 = nn.Conv2d(in_channels=420, out_channels=1000, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
self.fc1 = nn.Linear(1000 * 7 * 7, num_classes)
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(x)
x = F.relu(self.conv2(x))
x = self.pool(x)
x = x.reshape(x.shape[0], -1)
x = self.fc1(x)
return x
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Hyperparameters
in_channel = 1
num_classes = 10
learning_rate = 0.001
batch_size = 100
num_epochs = 5
# Load Data
train_dataset = datasets.MNIST(root='dataset/', train=True, transform=transforms.ToTensor(), download=True)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = datasets.MNIST(root='dataset/', train=False, transform=transforms.ToTensor(), download=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
# Initialize network
model = CNN().to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Necessary for FP16
scaler = torch.cuda.amp.GradScaler()
# Train Network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# forward
with torch.cuda.amp.autocast():
scores = model(data)
loss = criterion(scores, targets)
# backward
optimizer.zero_grad()
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(f'Got {num_correct} / {num_samples} with accuracy {float(num_correct) / float(num_samples) * 100:.2f}')
model.train()
check_accuracy(train_loader, model)
check_accuracy(test_loader, model)
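A small hedged addition (not in this commit): autocast can also wrap evaluation on its own; the GradScaler is only needed when a scaled loss is backpropagated.

model.eval()
with torch.no_grad(), torch.cuda.amp.autocast():
    for x, y in test_loader:
        x = x.to(device=device)
        preds = model(x).argmax(dim=1)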

@@ -0,0 +1,123 @@
"""
Shows a small example of how to load a pretrained model (VGG16) from torchvision,
and modifies it to train on the CIFAR10 dataset. The same method generalizes
well to other datasets, but the modifications to the network may need to change.
Video explanation: https://youtu.be/U4bHxEhMGNk
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-08 Initial coding
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
num_classes = 10
learning_rate = 1e-3
batch_size = 1024
num_epochs = 5
# Simple Identity class that lets the input pass through without changes
class Identity(nn.Module):
def __init__(self):
super(Identity, self).__init__()
def forward(self, x):
return x
# Load pretrained model & modify it
model = torchvision.models.vgg16(pretrained=True)
# If you want to do finetuning then set requires_grad = False on the backbone.
# Remove the two lines below if you want to train the entire model
# and only want to load the pretrained weights as initialization.
for param in model.parameters():
param.requires_grad = False
model.avgpool = Identity()
model.classifier = nn.Sequential(
nn.Linear(512, 100), nn.ReLU(), nn.Linear(100, num_classes)
)
model.to(device)
# Load Data
train_dataset = datasets.CIFAR10(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
losses = []
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
losses.append(loss.item())
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
print(f"Cost at epoch {epoch} is {sum(losses)/len(losses):.5f}")
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
if loader.dataset.train:
print("Checking accuracy on training data")
else:
print("Checking accuracy on test data")
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
check_accuracy(train_loader, model)
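A related hedged sketch (not part of this commit): since the backbone is frozen above, the optimizer only needs the parameters that still require gradients, i.e. the new classifier head.

optimizer = optim.Adam(
    (p for p in model.parameters() if p.requires_grad), lr=learning_rate
)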

@@ -0,0 +1,41 @@
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
# Create a simple toy dataset example; normally this
# would be a custom Dataset class with __getitem__ etc.,
# which we have done in the custom dataset tutorials
x = torch.randn((1000, 3, 224, 224))
y = torch.randint(low=0, high=10, size=(1000, 1))
ds = TensorDataset(x, y)
loader = DataLoader(ds, batch_size=8)
model = nn.Sequential(
nn.Conv2d(3, 10, kernel_size=3, padding=1, stride=1),
nn.Flatten(),
nn.Linear(10*224*224, 10),
)
NUM_EPOCHS = 100
for epoch in range(NUM_EPOCHS):
loop = tqdm(loader)
for idx, (x, y) in enumerate(loop):
scores = model(x)
# here we would compute loss, backward, optimizer step etc.
# you know how it goes, but now you have a nice progress bar
# with tqdm
# then at the bottom, if you want additional info shown, you can
# add it here; loss and accuracy you would obviously compute,
# but for now we just set them to random values
loop.set_description(f"Epoch [{epoch}/{NUM_EPOCHS}]")
loop.set_postfix(loss=torch.rand(1).item(), acc=torch.rand(1).item())
# There you go. Hope it was useful :)

@@ -0,0 +1,172 @@
"""
Example code of a simple RNN, GRU, LSTM on the MNIST dataset.
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-05-09 Initial coding
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
input_size = 28
hidden_size = 256
num_layers = 2
num_classes = 10
sequence_length = 28
learning_rate = 0.005
batch_size = 64
num_epochs = 2
# Recurrent neural network (many-to-one)
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(RNN, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
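        # every time step's hidden state is kept and flattened in forward(),
        # hence the linear layer takes hidden_size * sequence_length input features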
self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
def forward(self, x):
# Set initial hidden and cell states
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
# Forward propagate LSTM
out, _ = self.rnn(x, h0)
out = out.reshape(out.shape[0], -1)
# Decode the hidden state of the last time step
out = self.fc(out)
return out
# Recurrent neural network with GRU (many-to-one)
class RNN_GRU(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(RNN_GRU, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
def forward(self, x):
# Set initial hidden and cell states
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
# Forward propagate LSTM
out, _ = self.gru(x, h0)
out = out.reshape(out.shape[0], -1)
# Decode the hidden state of the last time step
out = self.fc(out)
return out
# Recurrent neural network with LSTM (many-to-one)
class RNN_LSTM(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(RNN_LSTM, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
def forward(self, x):
# Set initial hidden and cell states
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
# Forward propagate LSTM
out, _ = self.lstm(
x, (h0, c0)
) # out: tensor of shape (batch_size, seq_length, hidden_size)
out = out.reshape(out.shape[0], -1)
# Decode the hidden state of the last time step
out = self.fc(out)
return out
# Load Data
train_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
test_dataset = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
# Initialize network
model = RNN_LSTM(input_size, hidden_size, num_layers, num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device).squeeze(1)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
if loader.dataset.train:
print("Checking accuracy on training data")
else:
print("Checking accuracy on test data")
num_correct = 0
num_samples = 0
# Set model to eval
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device).squeeze(1)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with \
accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
# Set model back to train
model.train()
check_accuracy(train_loader, model)
check_accuracy(test_loader, model)

@@ -0,0 +1,134 @@
"""
Example code of a simple CNN network training on MNIST dataset.
The code is intended to show how to create a CNN network as well
as how to initialize loss, optimizer, etc. in a simple way to get
training to work, with a function that checks accuracy as well.
Video explanation: https://youtu.be/wnK3uWv_WkU
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-08 Initial coding
"""
# Imports
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Simple CNN
class CNN(nn.Module):
def __init__(self, in_channels=1, num_classes=10):
super(CNN, self).__init__()
self.conv1 = nn.Conv2d(
            in_channels=in_channels,
out_channels=8,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
)
self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
self.conv2 = nn.Conv2d(
in_channels=8,
out_channels=16,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
)
self.fc1 = nn.Linear(16 * 7 * 7, num_classes)
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(x)
x = F.relu(self.conv2(x))
x = self.pool(x)
x = x.reshape(x.shape[0], -1)
x = self.fc1(x)
return x
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
in_channel = 1
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 5
# Load Data
train_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
# Initialize network
model = CNN().to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
if loader.dataset.train:
print("Checking accuracy on training data")
else:
print("Checking accuracy on test data")
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
check_accuracy(train_loader, model)
check_accuracy(test_loader, model)

@@ -0,0 +1,120 @@
"""
Working code of a simple Fully Connected (FC) network training on MNIST dataset.
The code is intended to show how to create a FC network as well
as how to initialize loss, optimizer, etc. in a simple way to get
training to work, with a function that checks accuracy as well.
Video explanation: https://youtu.be/Jy4wM2X21u0
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-08 Initial coding
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Create Fully Connected Network
class NN(nn.Module):
def __init__(self, input_size, num_classes):
super(NN, self).__init__()
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, num_classes)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
input_size = 784
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 1
# Load Data
train_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
# Initialize network
model = NN(input_size=input_size, num_classes=num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# Get to correct shape
data = data.reshape(data.shape[0], -1)
# forward
scores = model(data)
loss = criterion(scores, targets)
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
if loader.dataset.train:
print("Checking accuracy on training data")
else:
print("Checking accuracy on test data")
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
x = x.reshape(x.shape[0], -1)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
check_accuracy(train_loader, model)
check_accuracy(test_loader, model)

@@ -0,0 +1,28 @@
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_set = datasets.CIFAR10(root="ds/", transform=transforms.ToTensor(), download=True)
train_loader = DataLoader(dataset=train_set, batch_size=64, shuffle=True)
def get_mean_std(loader):
# var[X] = E[X**2] - E[X]**2
channels_sum, channels_sqrd_sum, num_batches = 0, 0, 0
for data, _ in tqdm(loader):
channels_sum += torch.mean(data, dim=[0, 2, 3])
channels_sqrd_sum += torch.mean(data ** 2, dim=[0, 2, 3])
num_batches += 1
mean = channels_sum / num_batches
std = (channels_sqrd_sum / num_batches - mean ** 2) ** 0.5
return mean, std
mean, std = get_mean_std(train_loader)
print(mean)
print(std)
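A hedged usage sketch (not part of this commit): the computed statistics would typically be plugged into transforms.Normalize for later training runs.

normalize_transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(mean=mean.tolist(), std=std.tolist())]
)
train_set = datasets.CIFAR10(root="ds/", transform=normalize_transform, download=True)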

@@ -0,0 +1,299 @@
"""
Walk through of a lot of different useful Tensor Operations, where we
go through what I think are four main parts in:
1. Initialization of a Tensor
2. Tensor Mathematical Operations and Comparison
3. Tensor Indexing
4. Tensor Reshaping
But also other things such as setting the device (GPU/CPU) and converting
between different types (int, float etc) and how to convert a tensor to a
numpy array and vice-versa.
"""
import torch
# ================================================================= #
# Initializing Tensor #
# ================================================================= #
device = "cuda" if torch.cuda.is_available() else "cpu" # Cuda to run on GPU!
# Initializing a Tensor in this case of shape 2x3 (2 rows, 3 columns)
my_tensor = torch.tensor(
[[1, 2, 3], [4, 5, 6]], dtype=torch.float32, device=device, requires_grad=True
)
# A few tensor attributes
print(
f"Information about tensor: {my_tensor}"
) # Prints data of the tensor, device and grad info
print(
"Type of Tensor {my_tensor.dtype}"
) # Prints dtype of the tensor (torch.float32, etc)
print(
f"Device Tensor is on {my_tensor.device}"
) # Prints cpu/cuda (followed by gpu number)
print(f"Shape of tensor {my_tensor.shape}") # Prints shape, in this case 2x3
print(f"Requires gradient: {my_tensor.requires_grad}") # Prints true/false
# Other common initialization methods (there exists a ton more)
x = torch.empty(size=(3, 3)) # Tensor of shape 3x3 with uninitialized data
x = torch.zeros((3, 3)) # Tensor of shape 3x3 with values of 0
x = torch.rand(
(3, 3)
) # Tensor of shape 3x3 with values from uniform distribution in interval [0,1)
x = torch.ones((3, 3)) # Tensor of shape 3x3 with values of 1
x = torch.eye(5, 5) # Returns identity matrix I, (I <-> Eye), matrix of shape 5x5
x = torch.arange(
start=0, end=5, step=1
) # Tensor [0, 1, 2, 3, 4], note, can also do: torch.arange(11)
x = torch.linspace(start=0.1, end=1, steps=10) # x = [0.1, 0.2, ..., 1]
x = torch.empty(size=(1, 5)).normal_(
mean=0, std=1
) # Normally distributed with mean=0, std=1
x = torch.empty(size=(1, 5)).uniform_(
0, 1
) # Values from a uniform distribution low=0, high=1
x = torch.diag(torch.ones(3)) # Diagonal matrix of shape 3x3
# How to make initialized tensors to other types (int, float, double)
# These will work even if you're on CPU or CUDA!
tensor = torch.arange(4) # [0, 1, 2, 3] Initialized as int64 by default
print(f"Converted Boolean: {tensor.bool()}") # Converted to Boolean: 1 if nonzero
print(f"Converted int16 {tensor.short()}") # Converted to int16
print(
f"Converted int64 {tensor.long()}"
) # Converted to int64 (This one is very important, used super often)
print(f"Converted float16 {tensor.half()}") # Converted to float16
print(
f"Converted float32 {tensor.float()}"
) # Converted to float32 (This one is very important, used super often)
print(f"Converted float64 {tensor.double()}") # Converted to float64
# Array to Tensor conversion and vice-versa
import numpy as np
np_array = np.zeros((5, 5))
tensor = torch.from_numpy(np_array)
np_array_again = (
tensor.numpy()
) # np_array_again will be same as np_array (perhaps with numerical round offs)
# =============================================================================== #
# Tensor Math & Comparison Operations #
# =============================================================================== #
x = torch.tensor([1, 2, 3])
y = torch.tensor([9, 8, 7])
# -- Addition --
z1 = torch.empty(3)
torch.add(x, y, out=z1) # This is one way
z2 = torch.add(x, y) # This is another way
z = x + y # This is my preferred way, simple and clean.
# -- Subtraction --
z = x - y # We can do similarly as the preferred way of addition
# -- Division (A bit clunky) --
z = torch.true_divide(x, y) # Will do element wise division if of equal shape
# -- Inplace Operations --
t = torch.zeros(3)
t.add_(x) # Whenever we have operation followed by _ it will mutate the tensor in place
t += x # Also inplace: t = t + x is not inplace, bit confusing.
# -- Exponentiation (Element wise if vector or matrices) --
z = x.pow(2) # z = [1, 4, 9]
z = x ** 2 # z = [1, 4, 9]
# -- Simple Comparison --
z = x > 0 # Returns [True, True, True]
z = x < 0 # Returns [False, False, False]
# -- Matrix Multiplication --
x1 = torch.rand((2, 5))
x2 = torch.rand((5, 3))
x3 = torch.mm(x1, x2) # Matrix multiplication of x1 and x2, out shape: 2x3
x3 = x1.mm(x2) # Similar as line above
# -- Matrix Exponentiation --
matrix_exp = torch.rand(5, 5)
print(
matrix_exp.matrix_power(3)
) # is same as matrix_exp (mm) matrix_exp (mm) matrix_exp
# -- Element wise Multiplication --
z = x * y # z = [9, 16, 21] = [1*9, 2*8, 3*7]
# -- Dot product --
z = torch.dot(x, y) # Dot product, in this case z = 1*9 + 2*8 + 3*7
# -- Batch Matrix Multiplication --
batch = 32
n = 10
m = 20
p = 30
tensor1 = torch.rand((batch, n, m))
tensor2 = torch.rand((batch, m, p))
out_bmm = torch.bmm(tensor1, tensor2) # Will be shape: (b x n x p)
# -- Example of broadcasting --
x1 = torch.rand((5, 5))
x2 = torch.ones((1, 5))
z = (
x1 - x2
) # Shape of z is 5x5: How? The 1x5 vector (x2) is subtracted for each row in the 5x5 (x1)
z = (
x1 ** x2
) # Shape of z is 5x5: How? Broadcasting! Element wise exponentiation for every row
# Other useful tensor operations
sum_x = torch.sum(
x, dim=0
) # Sum of x across dim=0 (which is the only dim in our case), sum_x = 6
values, indices = torch.max(x, dim=0) # Can also do x.max(dim=0)
values, indices = torch.min(x, dim=0) # Can also do x.min(dim=0)
abs_x = torch.abs(x) # Returns x where abs function has been applied to every element
z = torch.argmax(x, dim=0) # Gets index of the maximum value
z = torch.argmin(x, dim=0) # Gets index of the minimum value
mean_x = torch.mean(x.float(), dim=0) # mean requires x to be float
z = torch.eq(x, y) # Element wise comparison, in this case z = [False, False, False]
sorted_y, indices = torch.sort(y, dim=0, descending=False)
z = torch.clamp(x, min=0)
# All values < 0 set to 0 and values > 0 unchanged (this is exactly ReLU function)
# If you want to values over max_val to be clamped, do torch.clamp(x, min=min_val, max=max_val)
x = torch.tensor([1, 0, 1, 1, 1], dtype=torch.bool) # True/False values
z = torch.any(x) # will return True, can also do x.any() instead of torch.any(x)
z = torch.all(
x
) # will return False (since not all are True), can also do x.all() instead of torch.all()
# ============================================================= #
# Tensor Indexing #
# ============================================================= #
batch_size = 10
features = 25
x = torch.rand((batch_size, features))
# Get the first example's features
print(x[0].shape) # shape [25], this is same as doing x[0,:]
# Get the first feature for all examples
print(x[:, 0].shape) # shape [10]
# For example: Want to access third example in the batch and the first ten features
print(x[2, 0:10].shape) # shape: [10]
# For example we can use this to assign certain elements
x[0, 0] = 100
# Fancy Indexing
x = torch.arange(10)
indices = [2, 5, 8]
print(x[indices]) # x[indices] = [2, 5, 8]
x = torch.rand((3, 5))
rows = torch.tensor([1, 0])
cols = torch.tensor([4, 0])
print(x[rows, cols]) # Gets second row fifth column and first row first column
# More advanced indexing
x = torch.arange(10)
print(x[(x < 2) | (x > 8)]) # will be [0, 1, 9]
print(x[x.remainder(2) == 0]) # will be [0, 2, 4, 6, 8]
# Useful operations for indexing
print(
torch.where(x > 5, x, x * 2)
) # gives [0, 2, 4, 6, 8, 10, 6, 7, 8, 9], all values x > 5 yield x, else x*2
x = torch.tensor([0, 0, 1, 2, 2, 3, 4]).unique() # x = [0, 1, 2, 3, 4]
print(
x.ndimension()
) # The number of dimensions, in this case 1. if x.shape is 5x5x5 ndim would be 3
x = torch.arange(10)
print(
x.numel()
) # The number of elements in x (in this case it's trivial because it's just a vector)
# ============================================================= #
# Tensor Reshaping #
# ============================================================= #
x = torch.arange(9)
# Let's say we want to reshape it to be 3x3
x_3x3 = x.view(3, 3)
# We can also do (view and reshape are very similar)
# and the differences are in simple terms (I'm no expert at this),
# is that view acts on contiguous tensors meaning if the
# tensor is stored contiguously in memory or not, whereas
# for reshape it doesn't matter because it will copy the
# tensor to make it contiguously stored, which might come
# with some performance loss.
x_3x3 = x.reshape(3, 3)
# If we for example do:
y = x_3x3.t()
print(
y.is_contiguous()
) # This will return False and if we try to use view now, it won't work!
# y.view(9) would cause an error, reshape however won't
# This is because in memory it was stored [0, 1, 2, ... 8], whereas now it's [0, 3, 6, 1, 4, 7, 2, 5, 8]
# The jump is no longer 1 in memory for one element jump (matrices are stored as a contiguous block, and
# using pointers to construct these matrices). This is a bit complicated and I need to explore this more
# as well, at least you know it's a problem to be cautious of! A solution is to do the following
print(y.contiguous().view(9)) # Calling .contiguous() before view and it works
# Moving on to another operation, let's say we want to concatenate two tensors along a dimension
x1 = torch.rand(2, 5)
x2 = torch.rand(2, 5)
print(torch.cat((x1, x2), dim=0).shape) # Shape: 4x5
print(torch.cat((x1, x2), dim=1).shape) # Shape 2x10
# Let's say we want to unroll x1 into one long vector with 10 elements, we can do:
z = x1.view(-1) # And -1 will unroll everything
# If we instead have an additional dimension and we wish to keep those as is we can do:
batch = 64
x = torch.rand((batch, 2, 5))
z = x.view(
batch, -1
) # And z.shape would be 64x10, this is very useful stuff and is used all the time
# Let's say we want to switch x axis so that instead of 64x2x5 we have 64x5x2
# I.e we want dimension 0 to stay, dimension 1 to become dimension 2, dimension 2 to become dimension 1
# Basically you tell permute where you want the new dimensions to be, torch.transpose is a special case
# of permute (why?)
z = x.permute(0, 2, 1)
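# As a quick check of the relationship: transpose swaps exactly two dimensions,
# so it is a special case of permute where only those two dims are exchanged
print(torch.equal(x.transpose(1, 2), x.permute(0, 2, 1)))  # True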
# Splits x into 2 chunks along dim=1, which has size 2, so each chunk is 64x1x5
# (see the example below for chunking along the last dimension instead)
z = torch.chunk(x, chunks=2, dim=1)
print(z[0].shape)
print(z[1].shape)
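# For illustration: if we instead chunk along the last dimension (size 5, not evenly
# divisible by 2), the chunks are uneven
z = torch.chunk(x, chunks=2, dim=2)
print(z[0].shape)  # torch.Size([64, 2, 3])
print(z[1].shape)  # torch.Size([64, 2, 2])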
# Let's say we want to add an additional dimension
x = torch.arange(
10
) # Shape is [10], let's say we want to add an additional dimension so we have 1x10
print(x.unsqueeze(0).shape) # 1x10
print(x.unsqueeze(1).shape) # 10x1
# Let's say we have x which is 1x1x10 and we want to remove a dim so we have 1x10
x = torch.arange(10).unsqueeze(0).unsqueeze(1)
# Perhaps unsurprisingly
z = x.squeeze(1)  # can also do .squeeze(0), both return shape 1x10
# That was some essential Tensor operations, hopefully you found it useful!

View File

@@ -0,0 +1,142 @@
"""
Example code of how to use the TensorBoard in PyTorch.
This code uses a lot of different functions from TensorBoard
and tries to show them all in a compact way. It might not be
super clear exactly which call does what; for that I recommend
watching the YouTube video.
Video explanation: https://youtu.be/RLqsxWaQdHE
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-17 Initial coding
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
# Simple CNN
class CNN(nn.Module):
def __init__(self, in_channels=1, num_classes=10):
super(CNN, self).__init__()
self.conv1 = nn.Conv2d(
in_channels=in_channels, out_channels=8, kernel_size=3, stride=1, padding=1
)
self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
self.conv2 = nn.Conv2d(
in_channels=8, out_channels=16, kernel_size=3, stride=1, padding=1
)
self.fc1 = nn.Linear(16 * 7 * 7, num_classes)
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(x)
x = F.relu(self.conv2(x))
x = self.pool(x)
x = x.reshape(x.shape[0], -1)
x = self.fc1(x)
return x
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
in_channels = 1
num_classes = 10
num_epochs = 1
# Load Data
train_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
# To do hyperparameter search, include more batch_sizes you want to try
# and more learning rates!
batch_sizes = [256]
learning_rates = [0.001]
classes = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
for batch_size in batch_sizes:
for learning_rate in learning_rates:
step = 0
# Initialize network
model = CNN(in_channels=in_channels, num_classes=num_classes)
model.to(device)
model.train()
criterion = nn.CrossEntropyLoss()
train_loader = DataLoader(
dataset=train_dataset, batch_size=batch_size, shuffle=True
)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0)
writer = SummaryWriter(
f"runs/MNIST/MiniBatchSize {batch_size} LR {learning_rate}"
)
# Visualize model in TensorBoard
images, _ = next(iter(train_loader))
writer.add_graph(model, images.to(device))
writer.close()
for epoch in range(num_epochs):
losses = []
accuracies = []
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
losses.append(loss.item())
# backward
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Calculate 'running' training accuracy
features = data.reshape(data.shape[0], -1)
img_grid = torchvision.utils.make_grid(data)
_, predictions = scores.max(1)
num_correct = (predictions == targets).sum()
running_train_acc = float(num_correct) / float(data.shape[0])
accuracies.append(running_train_acc)
# Plot things to tensorboard
class_labels = [classes[label] for label in predictions]
writer.add_image("mnist_images", img_grid)
writer.add_histogram("fc1", model.fc1.weight)
writer.add_scalar("Training loss", loss, global_step=step)
writer.add_scalar(
"Training Accuracy", running_train_acc, global_step=step
)
if batch_idx == 230:
writer.add_embedding(
features,
metadata=class_labels,
label_img=data,
global_step=batch_idx,
)
step += 1
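        # add_hparams logs this (lr, batch size) combination together with the averaged
        # accuracy and loss, so runs can be compared in TensorBoard's HPARAMS tab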
writer.add_hparams(
{"lr": learning_rate, "bsize": batch_size},
{
"accuracy": sum(accuracies) / len(accuracies),
"loss": sum(losses) / len(losses),
},
)

View File

@@ -0,0 +1,155 @@
"""
Shows a small example of how to use transformations (perhaps unnecessarily many)
on CIFAR10 dataset and training on a small CNN toy network.
Video explanation: https://youtu.be/Zvd276j9sZ8
Got any questions leave a comment I'm pretty good at responding on youtube
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-09 Initial coding
"""
# Imports
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
# Simple CNN
class CNN(nn.Module):
def __init__(self, in_channels, num_classes):
super(CNN, self).__init__()
self.conv1 = nn.Conv2d(
in_channels=in_channels,
out_channels=8,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
)
self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
self.conv2 = nn.Conv2d(
in_channels=8,
out_channels=16,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
)
self.fc1 = nn.Linear(16 * 8 * 8, num_classes)
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(x)
x = F.relu(self.conv2(x))
x = self.pool(x)
x = x.reshape(x.shape[0], -1)
x = self.fc1(x)
return x
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
learning_rate = 1e-4
batch_size = 64
num_epochs = 5
# Initialize model & modify it (note: forward() above never uses self.classifier, so the line below has no effect)
model = CNN(in_channels=3, num_classes=10)
model.classifier = nn.Sequential(nn.Linear(512, 100), nn.ReLU(), nn.Linear(100, 10))
model.to(device)
# Load Data
my_transforms = transforms.Compose(
[ # Compose makes it possible to have many transforms
transforms.Resize((36, 36)), # Resizes (32,32) to (36,36)
transforms.RandomCrop((32, 32)), # Takes a random (32,32) crop
transforms.ColorJitter(brightness=0.5), # Change brightness of image
transforms.RandomRotation(
degrees=45
        ),  # Rotates by a random angle between -45 and 45 degrees
transforms.RandomHorizontalFlip(
p=0.5
), # Flips the image horizontally with probability 0.5
transforms.RandomVerticalFlip(
p=0.05
), # Flips image vertically with probability 0.05
transforms.RandomGrayscale(p=0.2), # Converts to grayscale with probability 0.2
transforms.ToTensor(), # Finally converts PIL image to tensor so we can train w. pytorch
transforms.Normalize(
mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]
), # Note: these values aren't optimal
]
)
train_dataset = datasets.CIFAR10(
root="dataset/", train=True, transform=my_transforms, download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
losses = []
for batch_idx, (data, targets) in enumerate(train_loader):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
losses.append(loss.item())
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
print(f"Cost at epoch {epoch} is {sum(losses)/len(losses):.5f}")
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
if loader.dataset.train:
print("Checking accuracy on training data")
else:
print("Checking accuracy on test data")
num_correct = 0
num_samples = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}"
)
model.train()
check_accuracy(train_loader, model)

View File

@@ -0,0 +1,15 @@
import random, torch, os, numpy as np
def seed_everything(seed=42):
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
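# Note: deterministic=True and benchmark=False trade some speed for reproducibility,
# and some CUDA ops may still be non-deterministic regardless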
seed_everything()
# Do training etc after running seed_everything

View File

@@ -0,0 +1,67 @@
"""
An implementation of LeNet CNN architecture.
Video explanation: https://youtu.be/fcOW-Zyb5Bo
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-05 Initial coding
"""
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
class LeNet(nn.Module):
def __init__(self):
super(LeNet, self).__init__()
self.relu = nn.ReLU()
self.pool = nn.AvgPool2d(kernel_size=(2, 2), stride=(2, 2))
self.conv1 = nn.Conv2d(
in_channels=1,
out_channels=6,
kernel_size=(5, 5),
stride=(1, 1),
padding=(0, 0),
)
self.conv2 = nn.Conv2d(
in_channels=6,
out_channels=16,
kernel_size=(5, 5),
stride=(1, 1),
padding=(0, 0),
)
self.conv3 = nn.Conv2d(
in_channels=16,
out_channels=120,
kernel_size=(5, 5),
stride=(1, 1),
padding=(0, 0),
)
self.linear1 = nn.Linear(120, 84)
self.linear2 = nn.Linear(84, 10)
def forward(self, x):
x = self.relu(self.conv1(x))
x = self.pool(x)
x = self.relu(self.conv2(x))
x = self.pool(x)
x = self.relu(
self.conv3(x)
) # num_examples x 120 x 1 x 1 --> num_examples x 120
x = x.reshape(x.shape[0], -1)
x = self.relu(self.linear1(x))
x = self.linear2(x)
return x
def test_lenet():
x = torch.randn(64, 1, 32, 32)
model = LeNet()
return model(x)
if __name__ == "__main__":
out = test_lenet()
print(out.shape)

View File

@@ -0,0 +1,166 @@
"""
An implementation of GoogLeNet / InceptionNet from scratch.
Video explanation: https://youtu.be/uQc4Fs7yx5I
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-07 Initial coding
"""
# Imports
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
class GoogLeNet(nn.Module):
def __init__(self, aux_logits=True, num_classes=1000):
super(GoogLeNet, self).__init__()
assert aux_logits == True or aux_logits == False
self.aux_logits = aux_logits
        # Write in_channels etc. explicitly in self.conv1; for the rest we write it
        # as compactly as possible, e.g. kernel_size=3 instead of (3,3)
self.conv1 = conv_block(
in_channels=3,
out_channels=64,
kernel_size=(7, 7),
stride=(2, 2),
padding=(3, 3),
)
self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.conv2 = conv_block(64, 192, kernel_size=3, stride=1, padding=1)
self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
# In this order: in_channels, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, out_1x1pool
self.inception3a = Inception_block(192, 64, 96, 128, 16, 32, 32)
self.inception3b = Inception_block(256, 128, 128, 192, 32, 96, 64)
self.maxpool3 = nn.MaxPool2d(kernel_size=(3, 3), stride=2, padding=1)
self.inception4a = Inception_block(480, 192, 96, 208, 16, 48, 64)
self.inception4b = Inception_block(512, 160, 112, 224, 24, 64, 64)
self.inception4c = Inception_block(512, 128, 128, 256, 24, 64, 64)
self.inception4d = Inception_block(512, 112, 144, 288, 32, 64, 64)
self.inception4e = Inception_block(528, 256, 160, 320, 32, 128, 128)
self.maxpool4 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.inception5a = Inception_block(832, 256, 160, 320, 32, 128, 128)
self.inception5b = Inception_block(832, 384, 192, 384, 48, 128, 128)
self.avgpool = nn.AvgPool2d(kernel_size=7, stride=1)
self.dropout = nn.Dropout(p=0.4)
        self.fc1 = nn.Linear(1024, num_classes)
if self.aux_logits:
self.aux1 = InceptionAux(512, num_classes)
self.aux2 = InceptionAux(528, num_classes)
else:
self.aux1 = self.aux2 = None
def forward(self, x):
x = self.conv1(x)
x = self.maxpool1(x)
x = self.conv2(x)
# x = self.conv3(x)
x = self.maxpool2(x)
x = self.inception3a(x)
x = self.inception3b(x)
x = self.maxpool3(x)
x = self.inception4a(x)
# Auxiliary Softmax classifier 1
if self.aux_logits and self.training:
aux1 = self.aux1(x)
x = self.inception4b(x)
x = self.inception4c(x)
x = self.inception4d(x)
# Auxiliary Softmax classifier 2
if self.aux_logits and self.training:
aux2 = self.aux2(x)
x = self.inception4e(x)
x = self.maxpool4(x)
x = self.inception5a(x)
x = self.inception5b(x)
x = self.avgpool(x)
x = x.reshape(x.shape[0], -1)
x = self.dropout(x)
x = self.fc1(x)
if self.aux_logits and self.training:
return aux1, aux2, x
else:
return x
class Inception_block(nn.Module):
def __init__(
self, in_channels, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, out_1x1pool
):
super(Inception_block, self).__init__()
self.branch1 = conv_block(in_channels, out_1x1, kernel_size=(1, 1))
self.branch2 = nn.Sequential(
conv_block(in_channels, red_3x3, kernel_size=(1, 1)),
conv_block(red_3x3, out_3x3, kernel_size=(3, 3), padding=(1, 1)),
)
self.branch3 = nn.Sequential(
conv_block(in_channels, red_5x5, kernel_size=(1, 1)),
conv_block(red_5x5, out_5x5, kernel_size=(5, 5), padding=(2, 2)),
)
self.branch4 = nn.Sequential(
nn.MaxPool2d(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
conv_block(in_channels, out_1x1pool, kernel_size=(1, 1)),
)
def forward(self, x):
return torch.cat(
[self.branch1(x), self.branch2(x), self.branch3(x), self.branch4(x)], 1
)
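    # Output channels after the concatenation: out_1x1 + out_3x3 + out_5x5 + out_1x1pool,
    # e.g. inception3a: 64 + 128 + 32 + 32 = 256, matching inception3b's in_channels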
class InceptionAux(nn.Module):
def __init__(self, in_channels, num_classes):
super(InceptionAux, self).__init__()
self.relu = nn.ReLU()
self.dropout = nn.Dropout(p=0.7)
self.pool = nn.AvgPool2d(kernel_size=5, stride=3)
self.conv = conv_block(in_channels, 128, kernel_size=1)
self.fc1 = nn.Linear(2048, 1024)
self.fc2 = nn.Linear(1024, num_classes)
def forward(self, x):
x = self.pool(x)
x = self.conv(x)
x = x.reshape(x.shape[0], -1)
x = self.relu(self.fc1(x))
x = self.dropout(x)
x = self.fc2(x)
return x
class conv_block(nn.Module):
def __init__(self, in_channels, out_channels, **kwargs):
super(conv_block, self).__init__()
self.relu = nn.ReLU()
self.conv = nn.Conv2d(in_channels, out_channels, **kwargs)
self.batchnorm = nn.BatchNorm2d(out_channels)
def forward(self, x):
return self.relu(self.batchnorm(self.conv(x)))
if __name__ == "__main__":
# N = 3 (Mini batch size)
x = torch.randn(3, 3, 224, 224)
model = GoogLeNet(aux_logits=True, num_classes=1000)
print(model(x)[2].shape)

View File

@@ -0,0 +1,163 @@
# -*- coding: utf-8 -*-
"""
From scratch implementation of the famous ResNet models.
The intuition for ResNet is simple and clear, but coding it
didn't feel super clear at first, even when reading PyTorch's own
implementation.
Video explanation:
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-12 Initial coding
"""
import torch
import torch.nn as nn
class block(nn.Module):
def __init__(
self, in_channels, intermediate_channels, identity_downsample=None, stride=1
):
super(block, self).__init__()
self.expansion = 4
self.conv1 = nn.Conv2d(
in_channels, intermediate_channels, kernel_size=1, stride=1, padding=0
)
self.bn1 = nn.BatchNorm2d(intermediate_channels)
self.conv2 = nn.Conv2d(
intermediate_channels,
intermediate_channels,
kernel_size=3,
stride=stride,
padding=1,
)
self.bn2 = nn.BatchNorm2d(intermediate_channels)
self.conv3 = nn.Conv2d(
intermediate_channels,
intermediate_channels * self.expansion,
kernel_size=1,
stride=1,
padding=0,
)
self.bn3 = nn.BatchNorm2d(intermediate_channels * self.expansion)
self.relu = nn.ReLU()
self.identity_downsample = identity_downsample
self.stride = stride
def forward(self, x):
identity = x.clone()
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = self.relu(x)
x = self.conv3(x)
x = self.bn3(x)
if self.identity_downsample is not None:
identity = self.identity_downsample(identity)
x += identity
x = self.relu(x)
return x
class ResNet(nn.Module):
def __init__(self, block, layers, image_channels, num_classes):
super(ResNet, self).__init__()
self.in_channels = 64
self.conv1 = nn.Conv2d(image_channels, 64, kernel_size=7, stride=2, padding=3)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU()
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Essentially the entire ResNet architecture is in these 4 lines below
self.layer1 = self._make_layer(
block, layers[0], intermediate_channels=64, stride=1
)
self.layer2 = self._make_layer(
block, layers[1], intermediate_channels=128, stride=2
)
self.layer3 = self._make_layer(
block, layers[2], intermediate_channels=256, stride=2
)
self.layer4 = self._make_layer(
block, layers[3], intermediate_channels=512, stride=2
)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * 4, num_classes)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.reshape(x.shape[0], -1)
x = self.fc(x)
return x
def _make_layer(self, block, num_residual_blocks, intermediate_channels, stride):
identity_downsample = None
layers = []
        # If we halve the spatial size, e.g. 56x56 -> 28x28 (stride=2), or the number of
        # channels changes, we need to adapt the identity (skip connection) so it can be
        # added to the output of the layers ahead
if stride != 1 or self.in_channels != intermediate_channels * 4:
identity_downsample = nn.Sequential(
nn.Conv2d(
self.in_channels,
intermediate_channels * 4,
kernel_size=1,
stride=stride,
),
nn.BatchNorm2d(intermediate_channels * 4),
)
layers.append(
block(self.in_channels, intermediate_channels, identity_downsample, stride)
)
# The expansion size is always 4 for ResNet 50,101,152
self.in_channels = intermediate_channels * 4
        # For example, in the first ResNet layer: 256 channels are mapped down to 64 in the
        # intermediate layers, then back up to 256. Hence no identity downsample is needed,
        # since stride = 1 and the number of channels stays the same.
for i in range(num_residual_blocks - 1):
layers.append(block(self.in_channels, intermediate_channels))
return nn.Sequential(*layers)
def ResNet50(img_channel=3, num_classes=1000):
return ResNet(block, [3, 4, 6, 3], img_channel, num_classes)
def ResNet101(img_channel=3, num_classes=1000):
return ResNet(block, [3, 4, 23, 3], img_channel, num_classes)
def ResNet152(img_channel=3, num_classes=1000):
return ResNet(block, [3, 8, 36, 3], img_channel, num_classes)
def test():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    net = ResNet101(img_channel=3, num_classes=1000).to(device)
    y = net(torch.randn(4, 3, 224, 224).to(device))
print(y.size())
test()

View File

@@ -0,0 +1,119 @@
"""
A from scratch implementation of the VGG architecture.
Video explanation: https://youtu.be/ACmuBbuXn20
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-05 Initial coding
"""
# Imports
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
VGG_types = {
"VGG11": [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
"VGG13": [64, 64, "M", 128, 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
"VGG16": [
64,
64,
"M",
128,
128,
"M",
256,
256,
256,
"M",
512,
512,
512,
"M",
512,
512,
512,
"M",
],
"VGG19": [
64,
64,
"M",
128,
128,
"M",
256,
256,
256,
256,
"M",
512,
512,
512,
512,
"M",
512,
512,
512,
512,
"M",
],
}
class VGG_net(nn.Module):
def __init__(self, in_channels=3, num_classes=1000):
super(VGG_net, self).__init__()
self.in_channels = in_channels
self.conv_layers = self.create_conv_layers(VGG_types["VGG16"])
self.fcs = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096),
nn.ReLU(),
nn.Dropout(p=0.5),
nn.Linear(4096, 4096),
nn.ReLU(),
nn.Dropout(p=0.5),
nn.Linear(4096, num_classes),
)
def forward(self, x):
x = self.conv_layers(x)
x = x.reshape(x.shape[0], -1)
x = self.fcs(x)
return x
def create_conv_layers(self, architecture):
layers = []
in_channels = self.in_channels
for x in architecture:
if type(x) == int:
out_channels = x
layers += [
nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
),
nn.BatchNorm2d(x),
nn.ReLU(),
]
in_channels = x
elif x == "M":
layers += [nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))]
return nn.Sequential(*layers)
if __name__ == "__main__":
device = "cuda" if torch.cuda.is_available() else "cpu"
model = VGG_net(in_channels=3, num_classes=1000).to(device)
print(model)
## N = 3 (Mini batch size)
# x = torch.randn(3, 3, 224, 224).to(device)
# print(model(x).shape)

View File

@@ -0,0 +1,107 @@
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
class Discriminator(nn.Module):
def __init__(self, in_features):
super().__init__()
self.disc = nn.Sequential(
nn.Linear(in_features, 128),
nn.LeakyReLU(0.01),
nn.Linear(128, 1),
nn.Sigmoid(),
)
def forward(self, x):
return self.disc(x)
class Generator(nn.Module):
def __init__(self, z_dim, img_dim):
super().__init__()
self.gen = nn.Sequential(
nn.Linear(z_dim, 256),
nn.LeakyReLU(0.01),
nn.Linear(256, img_dim),
            nn.Tanh(),  # inputs are normalized to [-1, 1], so make the outputs [-1, 1] as well
)
def forward(self, x):
return self.gen(x)
# Hyperparameters etc.
device = "cuda" if torch.cuda.is_available() else "cpu"
lr = 3e-4
z_dim = 64
image_dim = 28 * 28 * 1 # 784
batch_size = 32
num_epochs = 50
disc = Discriminator(image_dim).to(device)
gen = Generator(z_dim, image_dim).to(device)
fixed_noise = torch.randn((batch_size, z_dim)).to(device)
transforms = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,)),]
)
dataset = datasets.MNIST(root="dataset/", transform=transforms, download=True)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
opt_disc = optim.Adam(disc.parameters(), lr=lr)
opt_gen = optim.Adam(gen.parameters(), lr=lr)
criterion = nn.BCELoss()
writer_fake = SummaryWriter(f"logs/fake")
writer_real = SummaryWriter(f"logs/real")
step = 0
for epoch in range(num_epochs):
for batch_idx, (real, _) in enumerate(loader):
real = real.view(-1, 784).to(device)
batch_size = real.shape[0]
### Train Discriminator: max log(D(x)) + log(1 - D(G(z)))
noise = torch.randn(batch_size, z_dim).to(device)
fake = gen(noise)
disc_real = disc(real).view(-1)
lossD_real = criterion(disc_real, torch.ones_like(disc_real))
disc_fake = disc(fake).view(-1)
lossD_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
lossD = (lossD_real + lossD_fake) / 2
disc.zero_grad()
lossD.backward(retain_graph=True)
opt_disc.step()
### Train Generator: min log(1 - D(G(z))) <-> max log(D(G(z))
# where the second option of maximizing doesn't suffer from
# saturating gradients
output = disc(fake).view(-1)
lossG = criterion(output, torch.ones_like(output))
gen.zero_grad()
lossG.backward()
opt_gen.step()
if batch_idx == 0:
print(
f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(loader)} \
Loss D: {lossD:.4f}, loss G: {lossG:.4f}"
)
with torch.no_grad():
fake = gen(fixed_noise).reshape(-1, 1, 28, 28)
data = real.reshape(-1, 1, 28, 28)
img_grid_fake = torchvision.utils.make_grid(fake, normalize=True)
img_grid_real = torchvision.utils.make_grid(data, normalize=True)
writer_fake.add_image(
"Mnist Fake Images", img_grid_fake, global_step=step
)
writer_real.add_image(
"Mnist Real Images", img_grid_real, global_step=step
)
step += 1

View File

@@ -0,0 +1,96 @@
"""
Discriminator and Generator implementation from DCGAN paper
"""
import torch
import torch.nn as nn
class Discriminator(nn.Module):
def __init__(self, channels_img, features_d):
super(Discriminator, self).__init__()
self.disc = nn.Sequential(
# input: N x channels_img x 64 x 64
nn.Conv2d(
channels_img, features_d, kernel_size=4, stride=2, padding=1
),
nn.LeakyReLU(0.2),
# _block(in_channels, out_channels, kernel_size, stride, padding)
self._block(features_d, features_d * 2, 4, 2, 1),
self._block(features_d * 2, features_d * 4, 4, 2, 1),
self._block(features_d * 4, features_d * 8, 4, 2, 1),
# After all _block img output is 4x4 (Conv2d below makes into 1x1)
nn.Conv2d(features_d * 8, 1, kernel_size=4, stride=2, padding=0),
nn.Sigmoid(),
)
def _block(self, in_channels, out_channels, kernel_size, stride, padding):
return nn.Sequential(
nn.Conv2d(
in_channels,
out_channels,
kernel_size,
stride,
padding,
bias=False,
),
#nn.BatchNorm2d(out_channels),
nn.LeakyReLU(0.2),
)
def forward(self, x):
return self.disc(x)
class Generator(nn.Module):
def __init__(self, channels_noise, channels_img, features_g):
super(Generator, self).__init__()
self.net = nn.Sequential(
# Input: N x channels_noise x 1 x 1
self._block(channels_noise, features_g * 16, 4, 1, 0), # img: 4x4
self._block(features_g * 16, features_g * 8, 4, 2, 1), # img: 8x8
self._block(features_g * 8, features_g * 4, 4, 2, 1), # img: 16x16
self._block(features_g * 4, features_g * 2, 4, 2, 1), # img: 32x32
nn.ConvTranspose2d(
features_g * 2, channels_img, kernel_size=4, stride=2, padding=1
),
# Output: N x channels_img x 64 x 64
nn.Tanh(),
)
def _block(self, in_channels, out_channels, kernel_size, stride, padding):
return nn.Sequential(
nn.ConvTranspose2d(
in_channels,
out_channels,
kernel_size,
stride,
padding,
bias=False,
),
#nn.BatchNorm2d(out_channels),
nn.ReLU(),
)
def forward(self, x):
return self.net(x)
def initialize_weights(model):
# Initializes weights according to the DCGAN paper
for m in model.modules():
if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d)):
nn.init.normal_(m.weight.data, 0.0, 0.02)
def test():
N, in_channels, H, W = 8, 3, 64, 64
noise_dim = 100
x = torch.randn((N, in_channels, H, W))
disc = Discriminator(in_channels, 8)
assert disc(x).shape == (N, 1, 1, 1), "Discriminator test failed"
gen = Generator(noise_dim, in_channels, 8)
z = torch.randn((N, noise_dim, 1, 1))
assert gen(z).shape == (N, in_channels, H, W), "Generator test failed"
# test()

View File

@@ -0,0 +1,105 @@
"""
Training of DCGAN network on MNIST dataset with Discriminator
and Generator imported from models.py
"""
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from model import Discriminator, Generator, initialize_weights
# Hyperparameters etc.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LEARNING_RATE = 2e-4 # could also use two lrs, one for gen and one for disc
BATCH_SIZE = 128
IMAGE_SIZE = 64
CHANNELS_IMG = 1
NOISE_DIM = 100
NUM_EPOCHS = 5
FEATURES_DISC = 64
FEATURES_GEN = 64
transforms = transforms.Compose(
[
transforms.Resize(IMAGE_SIZE),
transforms.ToTensor(),
transforms.Normalize(
[0.5 for _ in range(CHANNELS_IMG)], [0.5 for _ in range(CHANNELS_IMG)]
),
]
)
# If you train on MNIST, remember to set channels_img to 1
dataset = datasets.MNIST(root="dataset/", train=True, transform=transforms,
download=True)
# comment out MNIST above and uncomment below to train on CelebA
#dataset = datasets.ImageFolder(root="celeb_dataset", transform=transforms)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
gen = Generator(NOISE_DIM, CHANNELS_IMG, FEATURES_GEN).to(device)
disc = Discriminator(CHANNELS_IMG, FEATURES_DISC).to(device)
initialize_weights(gen)
initialize_weights(disc)
opt_gen = optim.Adam(gen.parameters(), lr=LEARNING_RATE, betas=(0.5, 0.999))
opt_disc = optim.Adam(disc.parameters(), lr=LEARNING_RATE, betas=(0.5, 0.999))
criterion = nn.BCELoss()
fixed_noise = torch.randn(32, NOISE_DIM, 1, 1).to(device)
writer_real = SummaryWriter(f"logs/real")
writer_fake = SummaryWriter(f"logs/fake")
step = 0
gen.train()
disc.train()
for epoch in range(NUM_EPOCHS):
# Target labels not needed! <3 unsupervised
for batch_idx, (real, _) in enumerate(dataloader):
real = real.to(device)
noise = torch.randn(BATCH_SIZE, NOISE_DIM, 1, 1).to(device)
fake = gen(noise)
### Train Discriminator: max log(D(x)) + log(1 - D(G(z)))
disc_real = disc(real).reshape(-1)
loss_disc_real = criterion(disc_real, torch.ones_like(disc_real))
disc_fake = disc(fake.detach()).reshape(-1)
loss_disc_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
loss_disc = (loss_disc_real + loss_disc_fake) / 2
disc.zero_grad()
loss_disc.backward()
opt_disc.step()
### Train Generator: min log(1 - D(G(z))) <-> max log(D(G(z))
output = disc(fake).reshape(-1)
loss_gen = criterion(output, torch.ones_like(output))
gen.zero_grad()
loss_gen.backward()
opt_gen.step()
# Print losses occasionally and print to tensorboard
if batch_idx % 100 == 0:
print(
f"Epoch [{epoch}/{NUM_EPOCHS}] Batch {batch_idx}/{len(dataloader)} \
Loss D: {loss_disc:.4f}, loss G: {loss_gen:.4f}"
)
with torch.no_grad():
fake = gen(fixed_noise)
# take out (up to) 32 examples
img_grid_real = torchvision.utils.make_grid(
real[:32], normalize=True
)
img_grid_fake = torchvision.utils.make_grid(
fake[:32], normalize=True
)
writer_real.add_image("Real", img_grid_real, global_step=step)
writer_fake.add_image("Fake", img_grid_fake, global_step=step)
step += 1

View File

@@ -0,0 +1,98 @@
"""
Discriminator and Generator implementation from DCGAN paper,
with the Sigmoid() output removed from the Discriminator (and therefore
it should be called a critic)
"""
import torch
import torch.nn as nn
class Discriminator(nn.Module):
def __init__(self, channels_img, features_d):
super(Discriminator, self).__init__()
self.disc = nn.Sequential(
# input: N x channels_img x 64 x 64
nn.Conv2d(
channels_img, features_d, kernel_size=4, stride=2, padding=1
),
nn.LeakyReLU(0.2),
# _block(in_channels, out_channels, kernel_size, stride, padding)
self._block(features_d, features_d * 2, 4, 2, 1),
self._block(features_d * 2, features_d * 4, 4, 2, 1),
self._block(features_d * 4, features_d * 8, 4, 2, 1),
# After all _block img output is 4x4 (Conv2d below makes into 1x1)
nn.Conv2d(features_d * 8, 1, kernel_size=4, stride=2, padding=0),
)
def _block(self, in_channels, out_channels, kernel_size, stride, padding):
return nn.Sequential(
nn.Conv2d(
in_channels,
out_channels,
kernel_size,
stride,
padding,
bias=False,
),
nn.InstanceNorm2d(out_channels, affine=True),
nn.LeakyReLU(0.2),
)
def forward(self, x):
return self.disc(x)
class Generator(nn.Module):
def __init__(self, channels_noise, channels_img, features_g):
super(Generator, self).__init__()
self.net = nn.Sequential(
# Input: N x channels_noise x 1 x 1
self._block(channels_noise, features_g * 16, 4, 1, 0), # img: 4x4
self._block(features_g * 16, features_g * 8, 4, 2, 1), # img: 8x8
self._block(features_g * 8, features_g * 4, 4, 2, 1), # img: 16x16
self._block(features_g * 4, features_g * 2, 4, 2, 1), # img: 32x32
nn.ConvTranspose2d(
features_g * 2, channels_img, kernel_size=4, stride=2, padding=1
),
# Output: N x channels_img x 64 x 64
nn.Tanh(),
)
def _block(self, in_channels, out_channels, kernel_size, stride, padding):
return nn.Sequential(
nn.ConvTranspose2d(
in_channels,
out_channels,
kernel_size,
stride,
padding,
bias=False,
),
nn.BatchNorm2d(out_channels),
nn.ReLU(),
)
def forward(self, x):
return self.net(x)
def initialize_weights(model):
# Initializes weights according to the DCGAN paper
for m in model.modules():
if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d)):
nn.init.normal_(m.weight.data, 0.0, 0.02)
def test():
N, in_channels, H, W = 8, 3, 64, 64
noise_dim = 100
x = torch.randn((N, in_channels, H, W))
disc = Discriminator(in_channels, 8)
assert disc(x).shape == (N, 1, 1, 1), "Discriminator test failed"
gen = Generator(noise_dim, in_channels, 8)
z = torch.randn((N, noise_dim, 1, 1))
assert gen(z).shape == (N, in_channels, H, W), "Generator test failed"
# test()

View File

@@ -0,0 +1,114 @@
"""
Training of DCGAN network with WGAN loss
"""
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from model import Discriminator, Generator, initialize_weights
# Hyperparameters etc
device = "cuda" if torch.cuda.is_available() else "cpu"
LEARNING_RATE = 5e-5
BATCH_SIZE = 64
IMAGE_SIZE = 64
CHANNELS_IMG = 1
Z_DIM = 128
NUM_EPOCHS = 5
FEATURES_CRITIC = 64
FEATURES_GEN = 64
CRITIC_ITERATIONS = 5
WEIGHT_CLIP = 0.01
transforms = transforms.Compose(
[
transforms.Resize(IMAGE_SIZE),
transforms.ToTensor(),
transforms.Normalize(
[0.5 for _ in range(CHANNELS_IMG)], [0.5 for _ in range(CHANNELS_IMG)]
),
]
)
dataset = datasets.MNIST(root="dataset/", transform=transforms, download=True)
#comment mnist and uncomment below if you want to train on CelebA dataset
#dataset = datasets.ImageFolder(root="celeb_dataset", transform=transforms)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
# initialize gen and disc/critic
gen = Generator(Z_DIM, CHANNELS_IMG, FEATURES_GEN).to(device)
critic = Discriminator(CHANNELS_IMG, FEATURES_CRITIC).to(device)
initialize_weights(gen)
initialize_weights(critic)
# initialize optimizer
opt_gen = optim.RMSprop(gen.parameters(), lr=LEARNING_RATE)
opt_critic = optim.RMSprop(critic.parameters(), lr=LEARNING_RATE)
# for tensorboard plotting
fixed_noise = torch.randn(32, Z_DIM, 1, 1).to(device)
writer_real = SummaryWriter(f"logs/real")
writer_fake = SummaryWriter(f"logs/fake")
step = 0
gen.train()
critic.train()
for epoch in range(NUM_EPOCHS):
# Target labels not needed! <3 unsupervised
for batch_idx, (data, _) in enumerate(loader):
data = data.to(device)
cur_batch_size = data.shape[0]
# Train Critic: max E[critic(real)] - E[critic(fake)]
for _ in range(CRITIC_ITERATIONS):
noise = torch.randn(cur_batch_size, Z_DIM, 1, 1).to(device)
fake = gen(noise)
critic_real = critic(data).reshape(-1)
critic_fake = critic(fake).reshape(-1)
loss_critic = -(torch.mean(critic_real) - torch.mean(critic_fake))
critic.zero_grad()
loss_critic.backward(retain_graph=True)
opt_critic.step()
# clip critic weights between -0.01, 0.01
for p in critic.parameters():
p.data.clamp_(-WEIGHT_CLIP, WEIGHT_CLIP)
# Train Generator: max E[critic(gen_fake)] <-> min -E[critic(gen_fake)]
gen_fake = critic(fake).reshape(-1)
loss_gen = -torch.mean(gen_fake)
gen.zero_grad()
loss_gen.backward()
opt_gen.step()
# Print losses occasionally and print to tensorboard
if batch_idx % 100 == 0 and batch_idx > 0:
gen.eval()
critic.eval()
print(
f"Epoch [{epoch}/{NUM_EPOCHS}] Batch {batch_idx}/{len(loader)} \
Loss D: {loss_critic:.4f}, loss G: {loss_gen:.4f}"
)
with torch.no_grad():
fake = gen(noise)
# take out (up to) 32 examples
img_grid_real = torchvision.utils.make_grid(
data[:32], normalize=True
)
img_grid_fake = torchvision.utils.make_grid(
fake[:32], normalize=True
)
writer_real.add_image("Real", img_grid_real, global_step=step)
writer_fake.add_image("Fake", img_grid_fake, global_step=step)
step += 1
gen.train()
critic.train()

View File

@@ -0,0 +1,84 @@
"""
Discriminator and Generator implementation from DCGAN paper
"""
import torch
import torch.nn as nn
class Discriminator(nn.Module):
def __init__(self, channels_img, features_d):
super(Discriminator, self).__init__()
self.disc = nn.Sequential(
# input: N x channels_img x 64 x 64
nn.Conv2d(channels_img, features_d, kernel_size=4, stride=2, padding=1),
nn.LeakyReLU(0.2),
# _block(in_channels, out_channels, kernel_size, stride, padding)
self._block(features_d, features_d * 2, 4, 2, 1),
self._block(features_d * 2, features_d * 4, 4, 2, 1),
self._block(features_d * 4, features_d * 8, 4, 2, 1),
# After all _block img output is 4x4 (Conv2d below makes into 1x1)
nn.Conv2d(features_d * 8, 1, kernel_size=4, stride=2, padding=0),
)
def _block(self, in_channels, out_channels, kernel_size, stride, padding):
return nn.Sequential(
nn.Conv2d(
in_channels, out_channels, kernel_size, stride, padding, bias=False,
),
nn.InstanceNorm2d(out_channels, affine=True),
nn.LeakyReLU(0.2),
)
def forward(self, x):
return self.disc(x)
class Generator(nn.Module):
def __init__(self, channels_noise, channels_img, features_g):
super(Generator, self).__init__()
self.net = nn.Sequential(
# Input: N x channels_noise x 1 x 1
self._block(channels_noise, features_g * 16, 4, 1, 0), # img: 4x4
self._block(features_g * 16, features_g * 8, 4, 2, 1), # img: 8x8
self._block(features_g * 8, features_g * 4, 4, 2, 1), # img: 16x16
self._block(features_g * 4, features_g * 2, 4, 2, 1), # img: 32x32
nn.ConvTranspose2d(
features_g * 2, channels_img, kernel_size=4, stride=2, padding=1
),
# Output: N x channels_img x 64 x 64
nn.Tanh(),
)
def _block(self, in_channels, out_channels, kernel_size, stride, padding):
return nn.Sequential(
nn.ConvTranspose2d(
in_channels, out_channels, kernel_size, stride, padding, bias=False,
),
nn.BatchNorm2d(out_channels),
nn.ReLU(),
)
def forward(self, x):
return self.net(x)
def initialize_weights(model):
# Initializes weights according to the DCGAN paper
for m in model.modules():
if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d)):
nn.init.normal_(m.weight.data, 0.0, 0.02)
def test():
N, in_channels, H, W = 8, 3, 64, 64
noise_dim = 100
x = torch.randn((N, in_channels, H, W))
disc = Discriminator(in_channels, 8)
assert disc(x).shape == (N, 1, 1, 1), "Discriminator test failed"
gen = Generator(noise_dim, in_channels, 8)
z = torch.randn((N, noise_dim, 1, 1))
assert gen(z).shape == (N, in_channels, H, W), "Generator test failed"
# test()

View File

@@ -0,0 +1,111 @@
"""
Training of WGAN-GP
"""
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from utils import gradient_penalty, save_checkpoint, load_checkpoint
from model import Discriminator, Generator, initialize_weights
# Hyperparameters etc.
device = "cuda" if torch.cuda.is_available() else "cpu"
LEARNING_RATE = 1e-4
BATCH_SIZE = 64
IMAGE_SIZE = 64
CHANNELS_IMG = 1
Z_DIM = 100
NUM_EPOCHS = 100
FEATURES_CRITIC = 16
FEATURES_GEN = 16
CRITIC_ITERATIONS = 5
LAMBDA_GP = 10
transforms = transforms.Compose(
[
transforms.Resize(IMAGE_SIZE),
transforms.ToTensor(),
transforms.Normalize(
[0.5 for _ in range(CHANNELS_IMG)], [0.5 for _ in range(CHANNELS_IMG)]),
]
)
dataset = datasets.MNIST(root="dataset/", transform=transforms, download=True)
# comment mnist above and uncomment below for training on CelebA
#dataset = datasets.ImageFolder(root="celeb_dataset", transform=transforms)
loader = DataLoader(
dataset,
batch_size=BATCH_SIZE,
shuffle=True,
)
# initialize gen and disc, note: discriminator should be called critic,
# according to WGAN paper (since it no longer outputs between [0, 1])
gen = Generator(Z_DIM, CHANNELS_IMG, FEATURES_GEN).to(device)
critic = Discriminator(CHANNELS_IMG, FEATURES_CRITIC).to(device)
initialize_weights(gen)
initialize_weights(critic)
# initialize optimizer
opt_gen = optim.Adam(gen.parameters(), lr=LEARNING_RATE, betas=(0.0, 0.9))
opt_critic = optim.Adam(critic.parameters(), lr=LEARNING_RATE, betas=(0.0, 0.9))
# for tensorboard plotting
fixed_noise = torch.randn(32, Z_DIM, 1, 1).to(device)
writer_real = SummaryWriter(f"logs/GAN_MNIST/real")
writer_fake = SummaryWriter(f"logs/GAN_MNIST/fake")
step = 0
gen.train()
critic.train()
for epoch in range(NUM_EPOCHS):
# Target labels not needed! <3 unsupervised
for batch_idx, (real, _) in enumerate(loader):
real = real.to(device)
cur_batch_size = real.shape[0]
# Train Critic: max E[critic(real)] - E[critic(fake)]
# equivalent to minimizing the negative of that
for _ in range(CRITIC_ITERATIONS):
noise = torch.randn(cur_batch_size, Z_DIM, 1, 1).to(device)
fake = gen(noise)
critic_real = critic(real).reshape(-1)
critic_fake = critic(fake).reshape(-1)
gp = gradient_penalty(critic, real, fake, device=device)
loss_critic = (
-(torch.mean(critic_real) - torch.mean(critic_fake)) + LAMBDA_GP * gp
)
critic.zero_grad()
loss_critic.backward(retain_graph=True)
opt_critic.step()
# Train Generator: max E[critic(gen_fake)] <-> min -E[critic(gen_fake)]
gen_fake = critic(fake).reshape(-1)
loss_gen = -torch.mean(gen_fake)
gen.zero_grad()
loss_gen.backward()
opt_gen.step()
# Print losses occasionally and print to tensorboard
if batch_idx % 100 == 0 and batch_idx > 0:
print(
f"Epoch [{epoch}/{NUM_EPOCHS}] Batch {batch_idx}/{len(loader)} \
Loss D: {loss_critic:.4f}, loss G: {loss_gen:.4f}"
)
with torch.no_grad():
fake = gen(fixed_noise)
# take out (up to) 32 examples
img_grid_real = torchvision.utils.make_grid(real[:32], normalize=True)
img_grid_fake = torchvision.utils.make_grid(fake[:32], normalize=True)
writer_real.add_image("Real", img_grid_real, global_step=step)
writer_fake.add_image("Fake", img_grid_fake, global_step=step)
step += 1

View File

@@ -0,0 +1,35 @@
import torch
import torch.nn as nn
def gradient_penalty(critic, real, fake, device="cpu"):
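    # WGAN-GP penalty: E[(||grad of critic(x_hat) w.r.t. x_hat||_2 - 1)^2], where x_hat
    # is a random interpolation between real and fake images (Gulrajani et al., 2017)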
BATCH_SIZE, C, H, W = real.shape
alpha = torch.rand((BATCH_SIZE, 1, 1, 1)).repeat(1, C, H, W).to(device)
interpolated_images = real * alpha + fake * (1 - alpha)
# Calculate critic scores
mixed_scores = critic(interpolated_images)
# Take the gradient of the scores with respect to the images
gradient = torch.autograd.grad(
inputs=interpolated_images,
outputs=mixed_scores,
grad_outputs=torch.ones_like(mixed_scores),
create_graph=True,
retain_graph=True,
)[0]
gradient = gradient.view(gradient.shape[0], -1)
gradient_norm = gradient.norm(2, dim=1)
gradient_penalty = torch.mean((gradient_norm - 1) ** 2)
return gradient_penalty
def save_checkpoint(state, filename="celeba_wgan_gp.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, gen, disc):
print("=> Loading checkpoint")
gen.load_state_dict(checkpoint['gen'])
disc.load_state_dict(checkpoint['disc'])

View File

@@ -0,0 +1,205 @@
"""
Implementation of ProGAN generator and discriminator with the key
contributions from the paper. We have tried to make the implementation
compact, but a goal is also to keep it readable and understandable.
Specifically the key points implemented are:
1) Progressive growing (of model and layers)
2) Minibatch std on Discriminator
3) Normalization with PixelNorm
4) Equalized Learning Rate (here I cheated and only did it on Conv layers)
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from math import log2
"""
factors is used in the Discriminator and Generator to decide how much
the channels should be multiplied and expanded for each layer.
Specifically, in the first 5 layers the channels stay the same,
whereas as we increase the img_size (towards the later layers)
we decrease the number of channels by 1/2, 1/4, etc.
"""
factors = [1, 1, 1, 1, 1/2, 1/4, 1/4, 1/8, 1/16]
class WSConv2d(nn.Module):
"""
Weight scaled Conv2d (Equalized Learning Rate)
    Note that the input is scaled rather than the weights being changed;
    this gives the same result.
Inspired by:
https://github.com/nvnbny/progressive_growing_of_gans/blob/master/modelUtils.py
"""
def __init__(
self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, gain=2
):
super(WSConv2d, self).__init__()
self.conv = nn.Conv2d(
in_channels, out_channels, kernel_size, stride, padding
)
self.scale = (gain / (self.conv.weight[0].numel())) ** 0.5
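        # scale = sqrt(gain / fan_in) with fan_in = in_channels * kernel_size^2,
        # i.e. He-style scaling applied at runtime instead of baked into the weights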
# initialize conv layer
nn.init.normal_(self.conv.weight)
nn.init.zeros_(self.conv.bias)
def forward(self, x):
return self.conv(x * self.scale)
class PixelNorm(nn.Module):
def __init__(self):
super(PixelNorm, self).__init__()
self.epsilon = 1e-8
def forward(self, x):
return x / torch.sqrt(
torch.mean(x ** 2, dim=1, keepdim=True) + self.epsilon
)
class ConvBlock(nn.Module):
def __init__(self, in_channels, out_channels, use_pixelnorm=True):
super(ConvBlock, self).__init__()
self.use_pn = use_pixelnorm
self.conv1 = WSConv2d(in_channels, out_channels)
self.conv2 = WSConv2d(out_channels, out_channels)
self.leaky = nn.LeakyReLU(0.2)
self.pn = PixelNorm()
def forward(self, x):
x = self.leaky(self.conv1(x))
x = self.pn(x) if self.use_pn else x
x = self.leaky(self.conv2(x))
x = self.pn(x) if self.use_pn else x
return x
class Generator(nn.Module):
def __init__(self, z_dim, in_channels, img_size, img_channels=3):
super(Generator, self).__init__()
self.prog_blocks, self.rgb_layers = nn.ModuleList([]), nn.ModuleList([])
# initial takes 1x1 -> 4x4
self.initial = nn.Sequential(
nn.ConvTranspose2d(z_dim, in_channels, 4, 1, 0),
nn.LeakyReLU(0.2),
PixelNorm(),
)
# Create progression blocks and rgb layers
channels = in_channels
        # we need to double the image size log2(img_size/4) times, and the
        # +1 in the loop is for the initial 4x4 resolution
for idx in range(int(log2(img_size/4)) + 1):
conv_in = channels
conv_out = int(in_channels*factors[idx])
self.prog_blocks.append(ConvBlock(conv_in, conv_out))
self.rgb_layers.append(WSConv2d(conv_out, img_channels, kernel_size=1, stride=1, padding=0))
channels = conv_out
def fade_in(self, alpha, upscaled, generated):
#assert 0 <= alpha <= 1, "Alpha not between 0 and 1"
#assert upscaled.shape == generated.shape
return torch.tanh(alpha * generated + (1 - alpha) * upscaled)
def forward(self, x, alpha, steps):
upscaled = self.initial(x)
out = self.prog_blocks[0](upscaled)
if steps == 0:
return self.rgb_layers[0](out)
for step in range(1, steps+1):
upscaled = F.interpolate(out, scale_factor=2, mode="nearest")
out = self.prog_blocks[step](upscaled)
# The number of channels in upscale will stay the same, while
# out which has moved through prog_blocks might change. To ensure
# we can convert both to rgb we use different rgb_layers
# (steps-1) and steps for upscaled, out respectively
final_upscaled = self.rgb_layers[steps - 1](upscaled)
final_out = self.rgb_layers[steps](out)
return self.fade_in(alpha, final_upscaled, final_out)
class Discriminator(nn.Module):
def __init__(self, img_size, z_dim, in_channels, img_channels=3):
super(Discriminator, self).__init__()
self.prog_blocks, self.rgb_layers = nn.ModuleList([]), nn.ModuleList([])
# Create progression blocks and rgb layers
channels = in_channels
for idx in range(int(log2(img_size/4)) + 1):
conv_in = int(in_channels * factors[idx])
conv_out = channels
self.rgb_layers.append(WSConv2d(img_channels, conv_in, kernel_size=1, stride=1, padding=0))
self.prog_blocks.append(ConvBlock(conv_in, conv_out, use_pixelnorm=False))
channels = conv_in
self.avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)
# +1 to in_channels because we concatenate from minibatch std
self.conv = WSConv2d(in_channels + 1, z_dim, kernel_size=4, stride=1, padding=0)
self.linear = nn.Linear(z_dim, 1)
def fade_in(self, alpha, downscaled, out):
"""Used to fade in downscaled using avgpooling and output from CNN"""
#assert 0 <= alpha <= 1, "Alpha needs to be between [0, 1]"
#assert downscaled.shape == out.shape
return alpha * out + (1 - alpha) * downscaled
def minibatch_std(self, x):
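        # Adds one extra feature map containing the std over the batch (averaged over all
        # channels and pixels), so the shape goes N x C x H x W -> N x (C+1) x H x W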
batch_statistics = (
torch.std(x, dim=0)
.mean()
.repeat(x.shape[0], 1, x.shape[2], x.shape[3])
)
return torch.cat([x, batch_statistics], dim=1)
def forward(self, x, alpha, steps):
out = self.rgb_layers[steps](x) # convert from rgb as initial step
if steps == 0: # i.e, image is 4x4
out = self.minibatch_std(out)
out = self.conv(out)
return self.linear(out.view(-1, out.shape[1]))
# index steps which has the "reverse" fade_in
downscaled = self.rgb_layers[steps - 1](self.avg_pool(x))
out = self.avg_pool(self.prog_blocks[steps](out))
out = self.fade_in(alpha, downscaled, out)
for step in range(steps - 1, 0, -1):
downscaled = self.avg_pool(out)
out = self.prog_blocks[step](downscaled)
out = self.minibatch_std(out)
out = self.conv(out)
return self.linear(out.view(-1, out.shape[1]))
if __name__ == "__main__":
import time
Z_DIM = 100
IN_CHANNELS = 16
img_size = 512
num_steps = int(log2(img_size / 4))
x = torch.randn((5, Z_DIM, 1, 1))
gen = Generator(Z_DIM, IN_CHANNELS, img_size=img_size)
disc = Discriminator(img_size, Z_DIM, IN_CHANNELS)
start = time.time()
with torch.autograd.profiler.profile(use_cuda=True) as prof:
z = gen(x, alpha=0.5, steps=num_steps)
print(prof)
gen_time = time.time()-start
t = time.time()
out = disc(z, 0.01, num_steps)
disc_time = time.time()-t
print(gen_time, disc_time)
#print(disc(z, 0.01, num_steps).shape)

View File

@@ -0,0 +1,5 @@
def func(x=1, y=2, **kwargs):
print(x, y)
print(func(x=3, y=4))

View File

@@ -0,0 +1,165 @@
""" Training of ProGAN using WGAN-GP loss"""
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from utils import gradient_penalty, plot_to_tensorboard, save_checkpoint, load_checkpoint
from model import Discriminator, Generator
from math import log2
from tqdm import tqdm
import time
torch.backends.cudnn.benchmark = True
torch.manual_seed(0)
# Hyperparameters etc.
device = "cuda" if torch.cuda.is_available() else "cpu"
LEARNING_RATE = 1e-4
BATCH_SIZES = [128, 128, 64, 16, 8, 4, 2, 2, 1]
IMAGE_SIZE = 128
CHANNELS_IMG = 3
Z_DIM = 128
IN_CHANNELS = 128
CRITIC_ITERATIONS = 1
LAMBDA_GP = 10
NUM_STEPS = int(log2(IMAGE_SIZE / 4)) + 1
PROGRESSIVE_EPOCHS = [2 ** i for i in range(int(log2(IMAGE_SIZE / 4) + 1))]
PROGRESSIVE_EPOCHS = [8 for i in range(int(log2(IMAGE_SIZE / 4) + 1))]  # overrides the schedule above with a fixed 8 epochs per resolution
fixed_noise = torch.randn(8, Z_DIM, 1, 1).to(device)
NUM_WORKERS = 4
def get_loader(image_size):
transform = transforms.Compose(
[
transforms.Resize((image_size, image_size)),
transforms.ToTensor(),
transforms.Normalize(
[0.5 for _ in range(CHANNELS_IMG)],
[0.5 for _ in range(CHANNELS_IMG)],
),
]
)
batch_size = BATCH_SIZES[int(log2(image_size/4))]
dataset = datasets.ImageFolder(root="celeb_dataset", transform=transform)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
return loader, dataset
def train_fn(
critic,
gen,
loader,
dataset,
step,
alpha,
opt_critic,
opt_gen,
tensorboard_step,
writer,
):
start = time.time()
total_time = 0
training = tqdm(loader, leave=True)
for batch_idx, (real, _) in enumerate(training):
real = real.to(device)
cur_batch_size = real.shape[0]
model_start = time.time()
# Train Critic: max E[critic(real)] - E[critic(fake)]
# which is equivalent to minimizing the negative of the expression
for _ in range(CRITIC_ITERATIONS):
critic.zero_grad()
noise = torch.randn(cur_batch_size, Z_DIM, 1, 1).to(device)
fake = gen(noise, alpha, step)
critic_real = critic(real, alpha, step).reshape(-1)
critic_fake = critic(fake, alpha, step).reshape(-1)
gp = gradient_penalty(critic, real, fake, alpha, step, device=device)
loss_critic = (
-(torch.mean(critic_real) - torch.mean(critic_fake))
+ LAMBDA_GP * gp
)
loss_critic.backward(retain_graph=True)
opt_critic.step()
# Train Generator: max E[critic(gen_fake)] <-> min -E[critic(gen_fake)]
gen.zero_grad()
fake = gen(noise, alpha, step)
gen_fake = critic(fake, alpha, step).reshape(-1)
loss_gen = -torch.mean(gen_fake)
loss_gen.backward()
opt_gen.step()
# Update alpha and ensure less than 1
alpha += cur_batch_size / (
(PROGRESSIVE_EPOCHS[step]*0.5) * len(dataset) # - step
)
alpha = min(alpha, 1)
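        # With this schedule alpha ramps linearly from ~0 to 1 over roughly the first half
        # of this resolution's epochs, then stays at 1 (fully faded in)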
total_time += time.time()-model_start
if batch_idx % 300 == 0:
with torch.no_grad():
fixed_fakes = gen(fixed_noise, alpha, step)
plot_to_tensorboard(
writer, loss_critic, loss_gen, real, fixed_fakes, tensorboard_step
)
tensorboard_step += 1
print(f'Fraction spent on model training: {total_time/(time.time()-start)}')
return tensorboard_step, alpha
def main():
# initialize gen and disc, note: discriminator should be called critic,
# according to WGAN paper (since it no longer outputs between [0, 1])
gen = Generator(Z_DIM, IN_CHANNELS, img_size=IMAGE_SIZE, img_channels=CHANNELS_IMG).to(device)
critic = Discriminator(IMAGE_SIZE, Z_DIM, IN_CHANNELS, img_channels=CHANNELS_IMG).to(device)
    # initialize optimizer
opt_gen = optim.Adam(gen.parameters(), lr=LEARNING_RATE, betas=(0.0, 0.99))
opt_critic = optim.Adam(critic.parameters(), lr=LEARNING_RATE, betas=(0.0, 0.99))
# for tensorboard plotting
writer = SummaryWriter(f"logs/gan")
load_checkpoint(torch.load("celeba_wgan_gp.pth.tar"), gen, critic)
gen.train()
critic.train()
tensorboard_step = 0
for step, num_epochs in enumerate(PROGRESSIVE_EPOCHS):
alpha = 0.01
if step < 3:
continue
if step == 4:
print(f"Img size is: {4*2**step}")
loader, dataset = get_loader(4 * 2 ** step)
for epoch in range(num_epochs):
print(f"Epoch [{epoch+1}/{num_epochs}]")
tensorboard_step, alpha = train_fn(
critic,
gen,
loader,
dataset,
step,
alpha,
opt_critic,
opt_gen,
tensorboard_step,
writer,
)
checkpoint = {'gen': gen.state_dict(),
'critic': critic.state_dict(),
'opt_gen': opt_gen.state_dict(),
'opt_critic': opt_critic.state_dict()}
save_checkpoint(checkpoint)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,54 @@
import torch
import torchvision
import torch.nn as nn
# Log the critic loss and real/fake image grids to tensorboard
def plot_to_tensorboard(
writer, loss_critic, loss_gen, real, fake, tensorboard_step
):
writer.add_scalar("Loss Critic", loss_critic, global_step=tensorboard_step)
with torch.no_grad():
        # take out (up to) 8 examples
img_grid_real = torchvision.utils.make_grid(real[:8], normalize=True)
img_grid_fake = torchvision.utils.make_grid(fake[:8], normalize=True)
writer.add_image("Real", img_grid_real, global_step=tensorboard_step)
writer.add_image("Fake", img_grid_fake, global_step=tensorboard_step)
def gradient_penalty(critic, real, fake, alpha, train_step, device="cpu"):
BATCH_SIZE, C, H, W = real.shape
beta = torch.rand((BATCH_SIZE, 1, 1, 1)).repeat(1, C, H, W).to(device)
interpolated_images = real * beta + fake * (1 - beta)
# Calculate critic scores
mixed_scores = critic(interpolated_images, alpha, train_step)
# Take the gradient of the scores with respect to the images
gradient = torch.autograd.grad(
inputs=interpolated_images,
outputs=mixed_scores,
grad_outputs=torch.ones_like(mixed_scores),
create_graph=True,
retain_graph=True,
)[0]
gradient = gradient.view(gradient.shape[0], -1)
gradient_norm = gradient.norm(2, dim=1)
gradient_penalty = torch.mean((gradient_norm - 1) ** 2)
return gradient_penalty
def save_checkpoint(state, filename="celeba_wgan_gp.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, gen, disc, opt_gen=None, opt_disc=None):
print("=> Loading checkpoint")
gen.load_state_dict(checkpoint['gen'])
disc.load_state_dict(checkpoint['critic'])
    if opt_gen is not None and opt_disc is not None:
opt_gen.load_state_dict(checkpoint['opt_gen'])
opt_disc.load_state_dict(checkpoint['opt_critic'])

Binary file not shown.

After

Width:  |  Height:  |  Size: 101 KiB

View File

@@ -0,0 +1,29 @@
import os
from PIL import Image
from torch.utils.data import Dataset
import numpy as np
class CarvanaDataset(Dataset):
def __init__(self, image_dir, mask_dir, transform=None):
self.image_dir = image_dir
self.mask_dir = mask_dir
self.transform = transform
self.images = os.listdir(image_dir)
def __len__(self):
return len(self.images)
def __getitem__(self, index):
img_path = os.path.join(self.image_dir, self.images[index])
mask_path = os.path.join(self.mask_dir, self.images[index].replace(".jpg", "_mask.gif"))
image = np.array(Image.open(img_path).convert("RGB"))
mask = np.array(Image.open(mask_path).convert("L"), dtype=np.float32)
mask[mask == 255.0] = 1.0
if self.transform is not None:
augmentations = self.transform(image=image, mask=mask)
image = augmentations["image"]
mask = augmentations["mask"]
return image, mask

View File

@@ -0,0 +1,76 @@
import torch
import torch.nn as nn
import torchvision.transforms.functional as TF
class DoubleConv(nn.Module):
def __init__(self, in_channels, out_channels):
super(DoubleConv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
)
def forward(self, x):
return self.conv(x)
class UNET(nn.Module):
def __init__(
self, in_channels=3, out_channels=1, features=[64, 128, 256, 512],
):
super(UNET, self).__init__()
self.ups = nn.ModuleList()
self.downs = nn.ModuleList()
self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
# Down part of UNET
for feature in features:
self.downs.append(DoubleConv(in_channels, feature))
in_channels = feature
# Up part of UNET
for feature in reversed(features):
self.ups.append(
nn.ConvTranspose2d(
feature*2, feature, kernel_size=2, stride=2,
)
)
self.ups.append(DoubleConv(feature*2, feature))
self.bottleneck = DoubleConv(features[-1], features[-1]*2)
self.final_conv = nn.Conv2d(features[0], out_channels, kernel_size=1)
def forward(self, x):
skip_connections = []
for down in self.downs:
x = down(x)
skip_connections.append(x)
x = self.pool(x)
x = self.bottleneck(x)
skip_connections = skip_connections[::-1]
for idx in range(0, len(self.ups), 2):
x = self.ups[idx](x)
skip_connection = skip_connections[idx//2]
if x.shape != skip_connection.shape:
x = TF.resize(x, size=skip_connection.shape[2:])
concat_skip = torch.cat((skip_connection, x), dim=1)
x = self.ups[idx+1](concat_skip)
return self.final_conv(x)
def test():
x = torch.randn((3, 1, 161, 161))
model = UNET(in_channels=1, out_channels=1)
preds = model(x)
assert preds.shape == x.shape
if __name__ == "__main__":
test()

View File

@@ -0,0 +1,124 @@
import torch
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from model import UNET
from utils import (
load_checkpoint,
save_checkpoint,
get_loaders,
check_accuracy,
save_predictions_as_imgs,
)
# Hyperparameters etc.
LEARNING_RATE = 1e-4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16
NUM_EPOCHS = 3
NUM_WORKERS = 2
IMAGE_HEIGHT = 160 # 1280 originally
IMAGE_WIDTH = 240 # 1918 originally
PIN_MEMORY = True
LOAD_MODEL = True
TRAIN_IMG_DIR = "data/train_images/"
TRAIN_MASK_DIR = "data/train_masks/"
VAL_IMG_DIR = "data/val_images/"
VAL_MASK_DIR = "data/val_masks/"
def train_fn(loader, model, optimizer, loss_fn, scaler):
loop = tqdm(loader)
for batch_idx, (data, targets) in enumerate(loop):
data = data.to(device=DEVICE)
targets = targets.float().unsqueeze(1).to(device=DEVICE)
# forward
with torch.cuda.amp.autocast():
predictions = model(data)
loss = loss_fn(predictions, targets)
# backward
optimizer.zero_grad()
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
# update tqdm loop
loop.set_postfix(loss=loss.item())
def main():
train_transform = A.Compose(
[
A.Resize(height=IMAGE_HEIGHT, width=IMAGE_WIDTH),
A.Rotate(limit=35, p=1.0),
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.1),
A.Normalize(
mean=[0.0, 0.0, 0.0],
std=[1.0, 1.0, 1.0],
max_pixel_value=255.0,
),
ToTensorV2(),
],
)
val_transforms = A.Compose(
[
A.Resize(height=IMAGE_HEIGHT, width=IMAGE_WIDTH),
A.Normalize(
mean=[0.0, 0.0, 0.0],
std=[1.0, 1.0, 1.0],
max_pixel_value=255.0,
),
ToTensorV2(),
],
)
model = UNET(in_channels=3, out_channels=1).to(DEVICE)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
train_loader, val_loader = get_loaders(
TRAIN_IMG_DIR,
TRAIN_MASK_DIR,
VAL_IMG_DIR,
VAL_MASK_DIR,
BATCH_SIZE,
train_transform,
val_transforms,
NUM_WORKERS,
PIN_MEMORY,
)
if LOAD_MODEL:
load_checkpoint(torch.load("my_checkpoint.pth.tar"), model)
check_accuracy(val_loader, model, device=DEVICE)
scaler = torch.cuda.amp.GradScaler()
for epoch in range(NUM_EPOCHS):
train_fn(train_loader, model, optimizer, loss_fn, scaler)
# save model
checkpoint = {
"state_dict": model.state_dict(),
"optimizer":optimizer.state_dict(),
}
save_checkpoint(checkpoint)
# check accuracy
check_accuracy(val_loader, model, device=DEVICE)
# print some examples to a folder
save_predictions_as_imgs(
val_loader, model, folder="saved_images/", device=DEVICE
)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,93 @@
import torch
import torchvision
from dataset import CarvanaDataset
from torch.utils.data import DataLoader
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, model):
print("=> Loading checkpoint")
model.load_state_dict(checkpoint["state_dict"])
def get_loaders(
train_dir,
train_maskdir,
val_dir,
val_maskdir,
batch_size,
train_transform,
val_transform,
num_workers=4,
pin_memory=True,
):
train_ds = CarvanaDataset(
image_dir=train_dir,
mask_dir=train_maskdir,
transform=train_transform,
)
train_loader = DataLoader(
train_ds,
batch_size=batch_size,
num_workers=num_workers,
pin_memory=pin_memory,
shuffle=True,
)
val_ds = CarvanaDataset(
image_dir=val_dir,
mask_dir=val_maskdir,
transform=val_transform,
)
val_loader = DataLoader(
val_ds,
batch_size=batch_size,
num_workers=num_workers,
pin_memory=pin_memory,
shuffle=False,
)
return train_loader, val_loader
def check_accuracy(loader, model, device="cuda"):
num_correct = 0
num_pixels = 0
dice_score = 0
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device)
y = y.to(device).unsqueeze(1)
preds = torch.sigmoid(model(x))
preds = (preds > 0.5).float()
num_correct += (preds == y).sum()
num_pixels += torch.numel(preds)
dice_score += (2 * (preds * y).sum()) / (
(preds + y).sum() + 1e-8
)
print(
f"Got {num_correct}/{num_pixels} with acc {num_correct/num_pixels*100:.2f}"
)
print(f"Dice score: {dice_score/len(loader)}")
model.train()
def save_predictions_as_imgs(
loader, model, folder="saved_images/", device="cuda"
):
model.eval()
for idx, (x, y) in enumerate(loader):
x = x.to(device=device)
with torch.no_grad():
preds = torch.sigmoid(model(x))
preds = (preds > 0.5).float()
torchvision.utils.save_image(
preds, f"{folder}/pred_{idx}.png"
)
torchvision.utils.save_image(y.unsqueeze(1), f"{folder}{idx}.png")
model.train()
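# Hedged toy check (not part of the original file): the dice score used above is
# 2*|P∩Y| / (|P|+|Y|) on binarized masks, e.g.:
if __name__ == "__main__":
    preds = torch.tensor([[1.0, 1.0, 0.0, 0.0]])
    y = torch.tensor([[1.0, 0.0, 0.0, 0.0]])
    dice = (2 * (preds * y).sum()) / ((preds + y).sum() + 1e-8)
    print(dice)  # 2*1 / (2+1) = 0.666...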

View File

@@ -0,0 +1,131 @@
"""
Example code of how to code GANs and more specifically DCGAN,
for more information about DCGANs read: https://arxiv.org/abs/1511.06434
We then train the DCGAN on the MNIST dataset (toy dataset of handwritten digits)
and then generate our own. You can apply this more generally on really any dataset
but MNIST is simple enough to get the overall idea.
Video explanation: https://youtu.be/5RYETbFFQ7s
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-20 Initial coding
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
from torch.utils.data import (
DataLoader,
)  # Gives easier dataset management and creates mini batches
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
from model_utils import (
Discriminator,
Generator,
) # Import our models we've defined (from DCGAN paper)
# Hyperparameters
lr = 0.0005
batch_size = 64
image_size = 64
channels_img = 1
channels_noise = 256
num_epochs = 10
# How many feature channels the Generator and Discriminator should use
features_d = 16
features_g = 16
my_transforms = transforms.Compose(
[
transforms.Resize(image_size),
transforms.ToTensor(),
transforms.Normalize((0.5,), (0.5,)),
]
)
dataset = datasets.MNIST(
root="dataset/", train=True, transform=my_transforms, download=True
)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Create discriminator and generator
netD = Discriminator(channels_img, features_d).to(device)
netG = Generator(channels_noise, channels_img, features_g).to(device)
# Setup Optimizer for G and D
optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(0.5, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(0.5, 0.999))
netG.train()
netD.train()
criterion = nn.BCELoss()
real_label = 1
fake_label = 0
fixed_noise = torch.randn(64, channels_noise, 1, 1).to(device)
writer_real = SummaryWriter(f"runs/GAN_MNIST/test_real")
writer_fake = SummaryWriter(f"runs/GAN_MNIST/test_fake")
step = 0
print("Starting Training...")
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(dataloader):
data = data.to(device)
batch_size = data.shape[0]
### Train Discriminator: max log(D(x)) + log(1 - D(G(z)))
netD.zero_grad()
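        # Label smoothing: targets of 0.9 for real and 0.1 for fake (below) are used
        # instead of hard 1/0 labels to stabilize discriminator training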
label = (torch.ones(batch_size) * 0.9).to(device)
output = netD(data).reshape(-1)
lossD_real = criterion(output, label)
D_x = output.mean().item()
noise = torch.randn(batch_size, channels_noise, 1, 1).to(device)
fake = netG(noise)
label = (torch.ones(batch_size) * 0.1).to(device)
output = netD(fake.detach()).reshape(-1)
lossD_fake = criterion(output, label)
lossD = lossD_real + lossD_fake
lossD.backward()
optimizerD.step()
### Train Generator: max log(D(G(z)))
netG.zero_grad()
label = torch.ones(batch_size).to(device)
output = netD(fake).reshape(-1)
lossG = criterion(output, label)
lossG.backward()
optimizerG.step()
        # Print losses occasionally and print to tensorboard
if batch_idx % 100 == 0:
step += 1
print(
f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(dataloader)} \
Loss D: {lossD:.4f}, loss G: {lossG:.4f} D(x): {D_x:.4f}"
)
with torch.no_grad():
fake = netG(fixed_noise)
img_grid_real = torchvision.utils.make_grid(data[:32], normalize=True)
img_grid_fake = torchvision.utils.make_grid(fake[:32], normalize=True)
writer_real.add_image(
"Mnist Real Images", img_grid_real, global_step=step
)
writer_fake.add_image(
"Mnist Fake Images", img_grid_fake, global_step=step
)

View File

@@ -0,0 +1,4 @@
### Generative Adversarial Network
DCGAN_mnist.py: main file for training the network
model_utils.py: Generator and discriminator implementation

View File

@@ -0,0 +1,76 @@
"""
Discriminator and Generator implementation from DCGAN paper
that we import in the main (DCGAN_mnist.py) file.
"""
import torch
import torch.nn as nn
class Discriminator(nn.Module):
def __init__(self, channels_img, features_d):
super(Discriminator, self).__init__()
self.net = nn.Sequential(
# N x channels_img x 64 x 64
nn.Conv2d(channels_img, features_d, kernel_size=4, stride=2, padding=1),
nn.LeakyReLU(0.2),
# N x features_d x 32 x 32
nn.Conv2d(features_d, features_d * 2, kernel_size=4, stride=2, padding=1),
nn.BatchNorm2d(features_d * 2),
nn.LeakyReLU(0.2),
nn.Conv2d(
features_d * 2, features_d * 4, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_d * 4),
nn.LeakyReLU(0.2),
nn.Conv2d(
features_d * 4, features_d * 8, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_d * 8),
nn.LeakyReLU(0.2),
# N x features_d*8 x 4 x 4
nn.Conv2d(features_d * 8, 1, kernel_size=4, stride=2, padding=0),
# N x 1 x 1 x 1
nn.Sigmoid(),
)
def forward(self, x):
return self.net(x)
class Generator(nn.Module):
def __init__(self, channels_noise, channels_img, features_g):
super(Generator, self).__init__()
self.net = nn.Sequential(
# N x channels_noise x 1 x 1
nn.ConvTranspose2d(
channels_noise, features_g * 16, kernel_size=4, stride=1, padding=0
),
nn.BatchNorm2d(features_g * 16),
nn.ReLU(),
# N x features_g*16 x 4 x 4
nn.ConvTranspose2d(
features_g * 16, features_g * 8, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_g * 8),
nn.ReLU(),
nn.ConvTranspose2d(
features_g * 8, features_g * 4, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_g * 4),
nn.ReLU(),
nn.ConvTranspose2d(
features_g * 4, features_g * 2, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_g * 2),
nn.ReLU(),
nn.ConvTranspose2d(
features_g * 2, channels_img, kernel_size=4, stride=2, padding=1
),
# N x channels_img x 64 x 64
nn.Tanh(),
)
def forward(self, x):
return self.net(x)
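# Hedged sanity check (not part of the original file): verify the shapes promised
# by the comments above with the hyperparameters used in DCGAN_mnist.py.
if __name__ == "__main__":
    N, channels_img, channels_noise = 8, 1, 256
    disc = Discriminator(channels_img, features_d=16)
    gen = Generator(channels_noise, channels_img, features_g=16)
    x = torch.randn(N, channels_img, 64, 64)
    z = torch.randn(N, channels_noise, 1, 1)
    assert disc(x).shape == (N, 1, 1, 1)
    assert gen(z).shape == (N, channels_img, 64, 64)
    print("Success, shapes match")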

View File

@@ -0,0 +1,242 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")
def tokenize_ger(text):
return [tok.text for tok in spacy_ger.tokenizer(text)]
def tokenize_eng(text):
return [tok.text for tok in spacy_eng.tokenizer(text)]
german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")
english = Field(
tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>"
)
train_data, valid_data, test_data = Multi30k.splits(
exts=(".de", ".en"), fields=(german, english)
)
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)
class Encoder(nn.Module):
def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
super(Encoder, self).__init__()
self.dropout = nn.Dropout(p)
self.hidden_size = hidden_size
self.num_layers = num_layers
self.embedding = nn.Embedding(input_size, embedding_size)
self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
def forward(self, x):
# x shape: (seq_length, N) where N is batch size
embedding = self.dropout(self.embedding(x))
# embedding shape: (seq_length, N, embedding_size)
outputs, (hidden, cell) = self.rnn(embedding)
# outputs shape: (seq_length, N, hidden_size)
return hidden, cell
class Decoder(nn.Module):
def __init__(
self, input_size, embedding_size, hidden_size, output_size, num_layers, p
):
super(Decoder, self).__init__()
self.dropout = nn.Dropout(p)
self.hidden_size = hidden_size
self.num_layers = num_layers
self.embedding = nn.Embedding(input_size, embedding_size)
self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
self.fc = nn.Linear(hidden_size, output_size)
def forward(self, x, hidden, cell):
        # x shape: (N) where N is the batch size; we want it to be (1, N) since
        # seq_length is 1 here because we send in a single word and not a sentence
x = x.unsqueeze(0)
embedding = self.dropout(self.embedding(x))
# embedding shape: (1, N, embedding_size)
outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
# outputs shape: (1, N, hidden_size)
predictions = self.fc(outputs)
# predictions shape: (1, N, length_target_vocabulary) to send it to
# loss function we want it to be (N, length_target_vocabulary) so we're
# just gonna remove the first dim
predictions = predictions.squeeze(0)
return predictions, hidden, cell
class Seq2Seq(nn.Module):
def __init__(self, encoder, decoder):
super(Seq2Seq, self).__init__()
self.encoder = encoder
self.decoder = decoder
def forward(self, source, target, teacher_force_ratio=0.5):
batch_size = source.shape[1]
target_len = target.shape[0]
target_vocab_size = len(english.vocab)
outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
hidden, cell = self.encoder(source)
# Grab the first input to the Decoder which will be <SOS> token
x = target[0]
for t in range(1, target_len):
# Use previous hidden, cell as context from encoder at start
output, hidden, cell = self.decoder(x, hidden, cell)
# Store next output prediction
outputs[t] = output
# Get the best word the Decoder predicted (index in the vocabulary)
best_guess = output.argmax(1)
# With probability of teacher_force_ratio we take the actual next word
# otherwise we take the word that the Decoder predicted it to be.
# Teacher Forcing is used so that the model gets used to seeing
# similar inputs at training and testing time, if teacher forcing is 1
# then inputs at test time might be completely different than what the
# network is used to. This was a long comment.
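            # e.g. with teacher_force_ratio=0.5, roughly half of the decoder inputs
            # come from the ground truth and half from the model's own predictions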
x = target[t] if random.random() < teacher_force_ratio else best_guess
return outputs
### We're ready to define everything we need for training our Seq2Seq model ###
# Training hyperparameters
num_epochs = 100
learning_rate = 0.001
batch_size = 64
# Model hyperparameters
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024 # Needs to be the same for both RNN's
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5
# Tensorboard to get nice loss plot
writer = SummaryWriter(f"runs/loss_plot")
step = 0
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
(train_data, valid_data, test_data),
batch_size=batch_size,
sort_within_batch=True,
sort_key=lambda x: len(x.src),
device=device,
)
encoder_net = Encoder(
input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)
decoder_net = Decoder(
input_size_decoder,
decoder_embedding_size,
hidden_size,
output_size,
num_layers,
dec_dropout,
).to(device)
model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
if load_model:
load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."
for epoch in range(num_epochs):
print(f"[Epoch {epoch} / {num_epochs}]")
checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
save_checkpoint(checkpoint)
model.eval()
translated_sentence = translate_sentence(
model, sentence, german, english, device, max_length=50
)
print(f"Translated example sentence: \n {translated_sentence}")
model.train()
for batch_idx, batch in enumerate(train_iterator):
# Get input and targets and get to cuda
inp_data = batch.src.to(device)
target = batch.trg.to(device)
# Forward prop
output = model(inp_data, target)
# Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
# doesn't take input in that form. For example if we have MNIST we want to have
# output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way: we have output_words * batch_size predictions that we want to send into
        # our cost function, so we need to do some reshaping. While we're at it,
        # let's also remove the start token.
output = output[1:].reshape(-1, output.shape[2])
target = target[1:].reshape(-1)
optimizer.zero_grad()
loss = criterion(output, target)
# Back prop
loss.backward()
# Clip to avoid exploding gradient issues, makes sure grads are
# within a healthy range
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
# Gradient descent step
optimizer.step()
# Plot to tensorboard
writer.add_scalar("Training loss", loss, global_step=step)
step += 1
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score*100:.2f}")

View File

@@ -0,0 +1,84 @@
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys
def translate_sentence(model, sentence, german, english, device, max_length=50):
# print(sentence)
# sys.exit()
# Load german tokenizer
spacy_ger = spacy.load("de")
    # Tokenize with spacy and lowercase everything (to match our vocab)
if type(sentence) == str:
tokens = [token.text.lower() for token in spacy_ger(sentence)]
else:
tokens = [token.lower() for token in sentence]
# print(tokens)
# sys.exit()
# Add <SOS> and <EOS> in beginning and end respectively
tokens.insert(0, german.init_token)
tokens.append(german.eos_token)
# Go through each german token and convert to an index
text_to_indices = [german.vocab.stoi[token] for token in tokens]
# Convert to Tensor
sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)
# Build encoder hidden, cell state
with torch.no_grad():
hidden, cell = model.encoder(sentence_tensor)
outputs = [english.vocab.stoi["<sos>"]]
for _ in range(max_length):
previous_word = torch.LongTensor([outputs[-1]]).to(device)
with torch.no_grad():
output, hidden, cell = model.decoder(previous_word, hidden, cell)
best_guess = output.argmax(1).item()
outputs.append(best_guess)
# Model predicts it's the end of the sentence
if output.argmax(1).item() == english.vocab.stoi["<eos>"]:
break
translated_sentence = [english.vocab.itos[idx] for idx in outputs]
# remove start token
return translated_sentence[1:]
def bleu(data, model, german, english, device):
targets = []
outputs = []
for example in data:
src = vars(example)["src"]
trg = vars(example)["trg"]
prediction = translate_sentence(model, src, german, english, device)
prediction = prediction[:-1] # remove <eos> token
targets.append([trg])
outputs.append(prediction)
return bleu_score(outputs, targets)
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, model, optimizer):
print("=> Loading checkpoint")
model.load_state_dict(checkpoint["state_dict"])
optimizer.load_state_dict(checkpoint["optimizer"])

View File

@@ -0,0 +1,279 @@
import random
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import spacy
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
"""
To install spacy languages do:
python -m spacy download en
python -m spacy download de
"""
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")
def tokenize_ger(text):
return [tok.text for tok in spacy_ger.tokenizer(text)]
def tokenize_eng(text):
return [tok.text for tok in spacy_eng.tokenizer(text)]
german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")
english = Field(
tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>"
)
train_data, valid_data, test_data = Multi30k.splits(
exts=(".de", ".en"), fields=(german, english)
)
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)
class Encoder(nn.Module):
def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
super(Encoder, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.embedding = nn.Embedding(input_size, embedding_size)
self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, bidirectional=True)
self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)
self.dropout = nn.Dropout(p)
def forward(self, x):
# x: (seq_length, N) where N is batch size
embedding = self.dropout(self.embedding(x))
# embedding shape: (seq_length, N, embedding_size)
encoder_states, (hidden, cell) = self.rnn(embedding)
# outputs shape: (seq_length, N, hidden_size)
# Use forward, backward cells and hidden through a linear layer
# so that it can be input to the decoder which is not bidirectional
# Also using index slicing ([idx:idx+1]) to keep the dimension
hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
cell = self.fc_cell(torch.cat((cell[0:1], cell[1:2]), dim=2))
return encoder_states, hidden, cell
class Decoder(nn.Module):
def __init__(
self, input_size, embedding_size, hidden_size, output_size, num_layers, p
):
super(Decoder, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.embedding = nn.Embedding(input_size, embedding_size)
self.rnn = nn.LSTM(hidden_size * 2 + embedding_size, hidden_size, num_layers)
self.energy = nn.Linear(hidden_size * 3, 1)
self.fc = nn.Linear(hidden_size, output_size)
self.dropout = nn.Dropout(p)
self.softmax = nn.Softmax(dim=0)
self.relu = nn.ReLU()
def forward(self, x, encoder_states, hidden, cell):
x = x.unsqueeze(0)
# x: (1, N) where N is the batch size
embedding = self.dropout(self.embedding(x))
# embedding shape: (1, N, embedding_size)
sequence_length = encoder_states.shape[0]
h_reshaped = hidden.repeat(sequence_length, 1, 1)
# h_reshaped: (seq_length, N, hidden_size*2)
energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))
# energy: (seq_length, N, 1)
attention = self.softmax(energy)
        # attention: (seq_length, N, 1), i.e. "snk" in the einsum below
        # encoder_states: (seq_length, N, hidden_size*2), i.e. "snl"
        # we want context_vector: (1, N, hidden_size*2), i.e. "knl"
context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)
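        # Hedged note (not in the original): the einsum above is the attention-weighted
        # sum over the sequence dimension; an equivalent explicit formulation would be
        #   torch.bmm(attention.permute(1, 2, 0), encoder_states.permute(1, 0, 2)).permute(1, 0, 2)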
rnn_input = torch.cat((context_vector, embedding), dim=2)
# rnn_input: (1, N, hidden_size*2 + embedding_size)
outputs, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
# outputs shape: (1, N, hidden_size)
predictions = self.fc(outputs).squeeze(0)
# predictions: (N, hidden_size)
return predictions, hidden, cell
class Seq2Seq(nn.Module):
def __init__(self, encoder, decoder):
super(Seq2Seq, self).__init__()
self.encoder = encoder
self.decoder = decoder
def forward(self, source, target, teacher_force_ratio=0.5):
batch_size = source.shape[1]
target_len = target.shape[0]
target_vocab_size = len(english.vocab)
outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
encoder_states, hidden, cell = self.encoder(source)
# First input will be <SOS> token
x = target[0]
for t in range(1, target_len):
# At every time step use encoder_states and update hidden, cell
output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
# Store prediction for current time step
outputs[t] = output
# Get the best word the Decoder predicted (index in the vocabulary)
best_guess = output.argmax(1)
# With probability of teacher_force_ratio we take the actual next word
# otherwise we take the word that the Decoder predicted it to be.
# Teacher Forcing is used so that the model gets used to seeing
# similar inputs at training and testing time, if teacher forcing is 1
# then inputs at test time might be completely different than what the
# network is used to. This was a long comment.
x = target[t] if random.random() < teacher_force_ratio else best_guess
return outputs
### We're ready to define everything we need for training our Seq2Seq model ###
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = False
save_model = True
# Training hyperparameters
num_epochs = 100
learning_rate = 3e-4
batch_size = 32
# Model hyperparameters
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 1
enc_dropout = 0.0
dec_dropout = 0.0
# Tensorboard to get nice loss plot
writer = SummaryWriter(f"runs/loss_plot")
step = 0
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
(train_data, valid_data, test_data),
batch_size=batch_size,
sort_within_batch=True,
sort_key=lambda x: len(x.src),
device=device,
)
encoder_net = Encoder(
input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)
decoder_net = Decoder(
input_size_decoder,
decoder_embedding_size,
hidden_size,
output_size,
num_layers,
dec_dropout,
).to(device)
model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
if load_model:
load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
sentence = (
    "ein boot mit mehreren männern darauf wird von einem großen "
    "pferdegespann ans ufer gezogen."
)
for epoch in range(num_epochs):
print(f"[Epoch {epoch} / {num_epochs}]")
if save_model:
checkpoint = {
"state_dict": model.state_dict(),
"optimizer": optimizer.state_dict(),
}
save_checkpoint(checkpoint)
model.eval()
translated_sentence = translate_sentence(
model, sentence, german, english, device, max_length=50
)
print(f"Translated example sentence: \n {translated_sentence}")
model.train()
for batch_idx, batch in enumerate(train_iterator):
# Get input and targets and get to cuda
inp_data = batch.src.to(device)
target = batch.trg.to(device)
# Forward prop
output = model(inp_data, target)
# Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
# doesn't take input in that form. For example if we have MNIST we want to have
# output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way: we have output_words * batch_size predictions that we want to send into
        # our cost function, so we need to do some reshaping. While we're at it,
        # let's also remove the start token.
output = output[1:].reshape(-1, output.shape[2])
target = target[1:].reshape(-1)
optimizer.zero_grad()
loss = criterion(output, target)
# Back prop
loss.backward()
# Clip to avoid exploding gradient issues, makes sure grads are
# within a healthy range
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
# Gradient descent step
optimizer.step()
# Plot to tensorboard
writer.add_scalar("Training loss", loss, global_step=step)
step += 1
# running on entire test data takes a while
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score * 100:.2f}")

View File

@@ -0,0 +1,79 @@
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys
def translate_sentence(model, sentence, german, english, device, max_length=50):
# Load german tokenizer
spacy_ger = spacy.load("de")
    # Tokenize with spacy and lowercase everything (to match our vocab)
if type(sentence) == str:
tokens = [token.text.lower() for token in spacy_ger(sentence)]
else:
tokens = [token.lower() for token in sentence]
# Add <SOS> and <EOS> in beginning and end respectively
tokens.insert(0, german.init_token)
tokens.append(german.eos_token)
# Go through each german token and convert to an index
text_to_indices = [german.vocab.stoi[token] for token in tokens]
# Convert to Tensor
sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)
# Build encoder hidden, cell state
with torch.no_grad():
outputs_encoder, hiddens, cells = model.encoder(sentence_tensor)
outputs = [english.vocab.stoi["<sos>"]]
for _ in range(max_length):
previous_word = torch.LongTensor([outputs[-1]]).to(device)
with torch.no_grad():
output, hiddens, cells = model.decoder(
previous_word, outputs_encoder, hiddens, cells
)
best_guess = output.argmax(1).item()
outputs.append(best_guess)
# Model predicts it's the end of the sentence
if output.argmax(1).item() == english.vocab.stoi["<eos>"]:
break
translated_sentence = [english.vocab.itos[idx] for idx in outputs]
# remove start token
return translated_sentence[1:]
def bleu(data, model, german, english, device):
targets = []
outputs = []
for example in data:
src = vars(example)["src"]
trg = vars(example)["trg"]
prediction = translate_sentence(model, src, german, english, device)
prediction = prediction[:-1] # remove <eos> token
targets.append([trg])
outputs.append(prediction)
return bleu_score(outputs, targets)
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, model, optimizer):
print("=> Loading checkpoint")
model.load_state_dict(checkpoint["state_dict"])
optimizer.load_state_dict(checkpoint["optimizer"])

View File

@@ -0,0 +1,12 @@
### Image Captioning
Download the dataset used: https://www.kaggle.com/dataset/e1cd22253a9b23b073794872bf565648ddbe4f17e7fa9e74766ad3707141adeb
Then set images folder, captions.txt inside a folder Flickr8k.
train.py: For training the network
model.py: creating the encoderCNN, decoderRNN and hooking them together
get_loader.py: Loading the data, creating vocabulary
utils.py: Load model, save model, print a few test cases downloaded online
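Expected folder layout (based on the paths used in train.py and get_loader.py): flickr8k/images/ for the images and flickr8k/captions.txt for the annotation file.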

View File

@@ -0,0 +1,142 @@
import os # when loading file paths
import pandas as pd # for lookup in annotation file
import spacy # for tokenizer
import torch
from torch.nn.utils.rnn import pad_sequence # pad batch
from torch.utils.data import DataLoader, Dataset
from PIL import Image # Load img
import torchvision.transforms as transforms
# We want to convert text -> numerical values
# 1. We need a Vocabulary mapping each word to an index
# 2. We need to set up a Pytorch dataset to load the data
# 3. Set up padding for every batch (all examples in a batch should have
#    the same seq_len) and set up the dataloader
# Note that loading the image is very easy compared to the text!
# Download with: python -m spacy download en
spacy_eng = spacy.load("en")
class Vocabulary:
def __init__(self, freq_threshold):
self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
self.freq_threshold = freq_threshold
def __len__(self):
return len(self.itos)
@staticmethod
def tokenizer_eng(text):
return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
def build_vocabulary(self, sentence_list):
frequencies = {}
idx = 4
for sentence in sentence_list:
for word in self.tokenizer_eng(sentence):
if word not in frequencies:
frequencies[word] = 1
else:
frequencies[word] += 1
if frequencies[word] == self.freq_threshold:
self.stoi[word] = idx
self.itos[idx] = word
idx += 1
def numericalize(self, text):
tokenized_text = self.tokenizer_eng(text)
return [
self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
for token in tokenized_text
]
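# Hedged toy example (not part of the original file) of how Vocabulary behaves:
#   v = Vocabulary(freq_threshold=1)
#   v.build_vocabulary(["a dog runs", "a dog sleeps"])
#   v.numericalize("a dog flies")  -> [4, 5, 3]; the unseen word "flies" maps to <UNK> (index 3)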
class FlickrDataset(Dataset):
def __init__(self, root_dir, captions_file, transform=None, freq_threshold=5):
self.root_dir = root_dir
self.df = pd.read_csv(captions_file)
self.transform = transform
# Get img, caption columns
self.imgs = self.df["image"]
self.captions = self.df["caption"]
# Initialize vocabulary and build vocab
self.vocab = Vocabulary(freq_threshold)
self.vocab.build_vocabulary(self.captions.tolist())
def __len__(self):
return len(self.df)
def __getitem__(self, index):
caption = self.captions[index]
img_id = self.imgs[index]
img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")
if self.transform is not None:
img = self.transform(img)
numericalized_caption = [self.vocab.stoi["<SOS>"]]
numericalized_caption += self.vocab.numericalize(caption)
numericalized_caption.append(self.vocab.stoi["<EOS>"])
return img, torch.tensor(numericalized_caption)
class MyCollate:
def __init__(self, pad_idx):
self.pad_idx = pad_idx
def __call__(self, batch):
imgs = [item[0].unsqueeze(0) for item in batch]
imgs = torch.cat(imgs, dim=0)
targets = [item[1] for item in batch]
targets = pad_sequence(targets, batch_first=False, padding_value=self.pad_idx)
return imgs, targets
def get_loader(
root_folder,
annotation_file,
transform,
batch_size=32,
num_workers=8,
shuffle=True,
pin_memory=True,
):
dataset = FlickrDataset(root_folder, annotation_file, transform=transform)
pad_idx = dataset.vocab.stoi["<PAD>"]
loader = DataLoader(
dataset=dataset,
batch_size=batch_size,
num_workers=num_workers,
shuffle=shuffle,
pin_memory=pin_memory,
collate_fn=MyCollate(pad_idx=pad_idx),
)
return loader, dataset
if __name__ == "__main__":
transform = transforms.Compose(
[transforms.Resize((224, 224)), transforms.ToTensor(),]
)
loader, dataset = get_loader(
"flickr8k/images/", "flickr8k/captions.txt", transform=transform
)
for idx, (imgs, captions) in enumerate(loader):
print(imgs.shape)
print(captions.shape)

View File

@@ -0,0 +1,66 @@
import torch
import torch.nn as nn
import statistics
import torchvision.models as models
class EncoderCNN(nn.Module):
def __init__(self, embed_size, train_CNN=False):
super(EncoderCNN, self).__init__()
self.train_CNN = train_CNN
self.inception = models.inception_v3(pretrained=True, aux_logits=False)
self.inception.fc = nn.Linear(self.inception.fc.in_features, embed_size)
self.relu = nn.ReLU()
self.times = []
self.dropout = nn.Dropout(0.5)
def forward(self, images):
features = self.inception(images)
return self.dropout(self.relu(features))
class DecoderRNN(nn.Module):
def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
super(DecoderRNN, self).__init__()
self.embed = nn.Embedding(vocab_size, embed_size)
self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
self.linear = nn.Linear(hidden_size, vocab_size)
self.dropout = nn.Dropout(0.5)
def forward(self, features, captions):
embeddings = self.dropout(self.embed(captions))
embeddings = torch.cat((features.unsqueeze(0), embeddings), dim=0)
hiddens, _ = self.lstm(embeddings)
outputs = self.linear(hiddens)
return outputs
class CNNtoRNN(nn.Module):
def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
super(CNNtoRNN, self).__init__()
self.encoderCNN = EncoderCNN(embed_size)
self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)
def forward(self, images, captions):
features = self.encoderCNN(images)
outputs = self.decoderRNN(features, captions)
return outputs
def caption_image(self, image, vocabulary, max_length=50):
result_caption = []
with torch.no_grad():
x = self.encoderCNN(image).unsqueeze(0)
states = None
for _ in range(max_length):
hiddens, states = self.decoderRNN.lstm(x, states)
output = self.decoderRNN.linear(hiddens.squeeze(0))
predicted = output.argmax(1)
result_caption.append(predicted.item())
x = self.decoderRNN.embed(predicted).unsqueeze(0)
if vocabulary.itos[predicted.item()] == "<EOS>":
break
return [vocabulary.itos[idx] for idx in result_caption]


View File

@@ -0,0 +1,96 @@
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
from utils import save_checkpoint, load_checkpoint, print_examples
from get_loader import get_loader
from model import CNNtoRNN
def train():
transform = transforms.Compose(
[
transforms.Resize((356, 356)),
transforms.RandomCrop((299, 299)),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
)
train_loader, dataset = get_loader(
root_folder="flickr8k/images",
annotation_file="flickr8k/captions.txt",
transform=transform,
num_workers=2,
)
torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = False
save_model = False
train_CNN = False
# Hyperparameters
embed_size = 256
hidden_size = 256
vocab_size = len(dataset.vocab)
num_layers = 1
learning_rate = 3e-4
num_epochs = 100
# for tensorboard
writer = SummaryWriter("runs/flickr")
step = 0
# initialize model, loss etc
model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi["<PAD>"])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # Only finetune the final fc layer of the CNN; freeze the rest unless train_CNN=True
for name, param in model.encoderCNN.inception.named_parameters():
if "fc.weight" in name or "fc.bias" in name:
param.requires_grad = True
else:
param.requires_grad = train_CNN
if load_model:
step = load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
model.train()
for epoch in range(num_epochs):
# Uncomment the line below to see a couple of test cases
# print_examples(model, device, dataset)
if save_model:
checkpoint = {
"state_dict": model.state_dict(),
"optimizer": optimizer.state_dict(),
"step": step,
}
save_checkpoint(checkpoint)
for idx, (imgs, captions) in tqdm(
enumerate(train_loader), total=len(train_loader), leave=False
):
imgs = imgs.to(device)
captions = captions.to(device)
outputs = model(imgs, captions[:-1])
loss = criterion(
outputs.reshape(-1, outputs.shape[2]), captions.reshape(-1)
)
writer.add_scalar("Training loss", loss.item(), global_step=step)
step += 1
optimizer.zero_grad()
            loss.backward()
optimizer.step()
if __name__ == "__main__":
train()

View File

@@ -0,0 +1,69 @@
import torch
import torchvision.transforms as transforms
from PIL import Image
def print_examples(model, device, dataset):
transform = transforms.Compose(
[
transforms.Resize((299, 299)),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
)
model.eval()
test_img1 = transform(Image.open("test_examples/dog.jpg").convert("RGB")).unsqueeze(
0
)
print("Example 1 CORRECT: Dog on a beach by the ocean")
print(
"Example 1 OUTPUT: "
+ " ".join(model.caption_image(test_img1.to(device), dataset.vocab))
)
test_img2 = transform(
Image.open("test_examples/child.jpg").convert("RGB")
).unsqueeze(0)
print("Example 2 CORRECT: Child holding red frisbee outdoors")
print(
"Example 2 OUTPUT: "
+ " ".join(model.caption_image(test_img2.to(device), dataset.vocab))
)
test_img3 = transform(Image.open("test_examples/bus.png").convert("RGB")).unsqueeze(
0
)
print("Example 3 CORRECT: Bus driving by parked cars")
print(
"Example 3 OUTPUT: "
+ " ".join(model.caption_image(test_img3.to(device), dataset.vocab))
)
test_img4 = transform(
Image.open("test_examples/boat.png").convert("RGB")
).unsqueeze(0)
print("Example 4 CORRECT: A small boat in the ocean")
print(
"Example 4 OUTPUT: "
+ " ".join(model.caption_image(test_img4.to(device), dataset.vocab))
)
test_img5 = transform(
Image.open("test_examples/horse.png").convert("RGB")
).unsqueeze(0)
print("Example 5 CORRECT: A cowboy riding a horse in the desert")
print(
"Example 5 OUTPUT: "
+ " ".join(model.caption_image(test_img5.to(device), dataset.vocab))
)
model.train()
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, model, optimizer):
print("=> Loading checkpoint")
model.load_state_dict(checkpoint["state_dict"])
optimizer.load_state_dict(checkpoint["optimizer"])
step = checkpoint["step"]
return step


View File

@@ -0,0 +1,112 @@
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.utils import save_image
class VGG(nn.Module):
def __init__(self):
super(VGG, self).__init__()
        # The first number x in convx_y increases by 1 after each maxpool, and the
        # second number y counts the conv layers between maxpools. The module indices
        # (0, 5, 10, 19, 28) then correspond to conv1_1, conv2_1, conv3_1, conv4_1,
        # conv5_1 mentioned in the NST paper
        self.chosen_features = ["0", "5", "10", "19", "28"]
        # We don't need to run anything further than conv5_1 (the 28th module in vgg)
        # since, remember, we don't actually care about the output of VGG: the only
        # thing that is modified is the generated image (i.e. the input).
self.model = models.vgg19(pretrained=True).features[:29]
def forward(self, x):
# Store relevant features
features = []
# Go through each layer in model, if the layer is in the chosen_features,
# store it in features. At the end we'll just return all the activations
# for the specific layers we have in chosen_features
for layer_num, layer in enumerate(self.model):
x = layer(x)
if str(layer_num) in self.chosen_features:
features.append(x)
return features
def load_image(image_name):
image = Image.open(image_name)
image = loader(image).unsqueeze(0)
return image.to(device)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
imsize = 356
# Here we may want to use the normalization constants of the original VGG network
# (to get values similar to what the net was originally trained on), but I found it
# didn't matter too much so I didn't end up using it. If you use it, make sure to
# de-normalize at the end so the saved images don't look weird.
loader = transforms.Compose(
[
transforms.Resize((imsize, imsize)),
transforms.ToTensor(),
# transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
)
original_img = load_image("annahathaway.png")
style_img = load_image("style.jpg")
# Initialize generated as white noise or as a clone of the original image.
# Clone seemed to work better for me.
# generated = torch.randn(original_img.data.shape, device=device, requires_grad=True)
generated = original_img.clone().requires_grad_(True)
model = VGG().to(device).eval()
# Hyperparameters
total_steps = 6000
learning_rate = 0.001
alpha = 1
beta = 0.01
optimizer = optim.Adam([generated], lr=learning_rate)
for step in range(total_steps):
# Obtain the convolution features in specifically chosen layers
generated_features = model(generated)
original_img_features = model(original_img)
style_features = model(style_img)
# Loss is 0 initially
style_loss = original_loss = 0
# iterate through all the features for the chosen layers
for gen_feature, orig_feature, style_feature in zip(
generated_features, original_img_features, style_features
):
# batch_size will just be 1
batch_size, channel, height, width = gen_feature.shape
original_loss += torch.mean((gen_feature - orig_feature) ** 2)
# Compute Gram Matrix of generated
G = gen_feature.view(channel, height * width).mm(
gen_feature.view(channel, height * width).t()
)
# Compute Gram Matrix of Style
A = style_feature.view(channel, height * width).mm(
style_feature.view(channel, height * width).t()
)
style_loss += torch.mean((G - A) ** 2)
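        # Note (hedged, not in the original): G and A are (channel x channel) Gram
        # matrices of feature correlations; matching them matches the style statistics
        # of the generated image to those of the style image.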
total_loss = alpha * original_loss + beta * style_loss
optimizer.zero_grad()
total_loss.backward()
optimizer.step()
if step % 200 == 0:
print(total_loss)
save_image(generated, "generated.png")

