Initial commit

This commit is contained in:
Aladdin Persson
2021-01-30 21:49:15 +01:00
commit 65b8c80495
432 changed files with 1290844 additions and 0 deletions

View File

@@ -0,0 +1,131 @@
"""
Example code of how to code GANs and more specifically DCGAN,
for more information about DCGANs read: https://arxiv.org/abs/1511.06434
We then train the DCGAN on the MNIST dataset (toy dataset of handwritten digits)
and then generate our own. You can apply this more generally on really any dataset
but MNIST is simple enough to get the overall idea.
Video explanation: https://youtu.be/5RYETbFFQ7s
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-20 Initial coding
"""
# Imports
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
from torch.utils.data import (
DataLoader,
) # Gives easier dataset managment and creates mini batches
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
from model_utils import (
Discriminator,
Generator,
) # Import our models we've defined (from DCGAN paper)
# Hyperparameters
lr = 0.0005
batch_size = 64
image_size = 64
channels_img = 1
channels_noise = 256
num_epochs = 10

# For how many channels Generator and Discriminator should use
features_d = 16
features_g = 16

# Resize MNIST digits to image_size and scale pixel values to [-1, 1],
# which matches the Tanh output range of the Generator.
my_transforms = transforms.Compose(
    [
        transforms.Resize(image_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

dataset = datasets.MNIST(
    root="dataset/", train=True, transform=my_transforms, download=True
)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create discriminator and generator
netD = Discriminator(channels_img, features_d).to(device)
netG = Generator(channels_noise, channels_img, features_g).to(device)

# Setup Optimizer for G and D
optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(0.5, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(0.5, 0.999))

netG.train()
netD.train()

criterion = nn.BCELoss()

# Hard targets for the BCE loss. The discriminator is trained against
# smoothed versions of them (0.9 for real, 0.1 for fake); the generator is
# trained against the hard "real" target.
real_label = 1
fake_label = 0
label_smoothing = 0.1

# Noise that stays fixed for the whole run so the logged samples are comparable
fixed_noise = torch.randn(64, channels_noise, 1, 1, device=device)
writer_real = SummaryWriter("runs/GAN_MNIST/test_real")
writer_fake = SummaryWriter("runs/GAN_MNIST/test_fake")
step = 0

print("Starting Training...")

for epoch in range(num_epochs):
    for batch_idx, (data, _) in enumerate(dataloader):
        data = data.to(device)
        # The last batch can be smaller than batch_size, so size the labels
        # and noise from the data itself (without overwriting the
        # batch_size hyperparameter).
        cur_batch_size = data.shape[0]

        ### Train Discriminator: max log(D(x)) + log(1 - D(G(z)))
        netD.zero_grad()
        label = torch.full(
            (cur_batch_size,), real_label - label_smoothing, device=device
        )
        output = netD(data).reshape(-1)
        lossD_real = criterion(output, label)
        D_x = output.mean().item()

        noise = torch.randn(cur_batch_size, channels_noise, 1, 1, device=device)
        fake = netG(noise)
        label = torch.full(
            (cur_batch_size,), fake_label + label_smoothing, device=device
        )
        # detach() keeps the discriminator update from back-propagating
        # into the generator.
        output = netD(fake.detach()).reshape(-1)
        lossD_fake = criterion(output, label)

        lossD = lossD_real + lossD_fake
        lossD.backward()
        optimizerD.step()

        ### Train Generator: max log(D(G(z)))
        netG.zero_grad()
        label = torch.full((cur_batch_size,), float(real_label), device=device)
        output = netD(fake).reshape(-1)
        lossG = criterion(output, label)
        lossG.backward()
        optimizerG.step()

        # Print losses occasionally and print to tensorboard
        if batch_idx % 100 == 0:
            step += 1
            print(
                f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(dataloader)} "
                f"Loss D: {lossD.item():.4f}, loss G: {lossG.item():.4f} "
                f"D(x): {D_x:.4f}"
            )

            with torch.no_grad():
                fixed_fake = netG(fixed_noise)
                img_grid_real = torchvision.utils.make_grid(data[:32], normalize=True)
                img_grid_fake = torchvision.utils.make_grid(
                    fixed_fake[:32], normalize=True
                )
                writer_real.add_image(
                    "Mnist Real Images", img_grid_real, global_step=step
                )
                writer_fake.add_image(
                    "Mnist Fake Images", img_grid_fake, global_step=step
                )

# Flush pending events to disk and release the event files.
writer_real.close()
writer_fake.close()

View File

@@ -0,0 +1,4 @@
### Generative Adversarial Network
DCGAN_mnist.py: main file and training network
model_utils.py: Generator and discriminator implementation

View File

@@ -0,0 +1,76 @@
"""
Discriminator and Generator implementation from DCGAN paper
that we import in the main (DCGAN_mnist.py) file.
"""
import torch
import torch.nn as nn
class Discriminator(nn.Module):
def __init__(self, channels_img, features_d):
super(Discriminator, self).__init__()
self.net = nn.Sequential(
# N x channels_img x 64 x 64
nn.Conv2d(channels_img, features_d, kernel_size=4, stride=2, padding=1),
nn.LeakyReLU(0.2),
# N x features_d x 32 x 32
nn.Conv2d(features_d, features_d * 2, kernel_size=4, stride=2, padding=1),
nn.BatchNorm2d(features_d * 2),
nn.LeakyReLU(0.2),
nn.Conv2d(
features_d * 2, features_d * 4, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_d * 4),
nn.LeakyReLU(0.2),
nn.Conv2d(
features_d * 4, features_d * 8, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_d * 8),
nn.LeakyReLU(0.2),
# N x features_d*8 x 4 x 4
nn.Conv2d(features_d * 8, 1, kernel_size=4, stride=2, padding=0),
# N x 1 x 1 x 1
nn.Sigmoid(),
)
def forward(self, x):
return self.net(x)
class Generator(nn.Module):
def __init__(self, channels_noise, channels_img, features_g):
super(Generator, self).__init__()
self.net = nn.Sequential(
# N x channels_noise x 1 x 1
nn.ConvTranspose2d(
channels_noise, features_g * 16, kernel_size=4, stride=1, padding=0
),
nn.BatchNorm2d(features_g * 16),
nn.ReLU(),
# N x features_g*16 x 4 x 4
nn.ConvTranspose2d(
features_g * 16, features_g * 8, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_g * 8),
nn.ReLU(),
nn.ConvTranspose2d(
features_g * 8, features_g * 4, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_g * 4),
nn.ReLU(),
nn.ConvTranspose2d(
features_g * 4, features_g * 2, kernel_size=4, stride=2, padding=1
),
nn.BatchNorm2d(features_g * 2),
nn.ReLU(),
nn.ConvTranspose2d(
features_g * 2, channels_img, kernel_size=4, stride=2, padding=1
),
# N x channels_img x 64 x 64
nn.Tanh(),
)
def forward(self, x):
return self.net(x)

View File

@@ -0,0 +1,242 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
# Load both spaCy pipelines once at import time; the tokenizers below reuse them.
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")


def tokenize_ger(text):
    """Return the list of token strings spaCy produces for German ``text``."""
    return [token.text for token in spacy_ger.tokenizer(text)]


def tokenize_eng(text):
    """Return the list of token strings spaCy produces for English ``text``."""
    return [token.text for token in spacy_eng.tokenizer(text)]


# Both fields lower-case their text and wrap every sentence in <sos>/<eos>.
_field_options = {"lower": True, "init_token": "<sos>", "eos_token": "<eos>"}
german = Field(tokenize=tokenize_ger, **_field_options)
english = Field(tokenize=tokenize_eng, **_field_options)

# German (.de) is the source language, English (.en) the target.
train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"), fields=(german, english)
)

# Vocabularies are built from the training split only.
for _field in (german, english):
    _field.build_vocab(train_data, max_size=10000, min_freq=2)
class Encoder(nn.Module):
    """Embed a source sequence and summarize it with a (multi-layer) LSTM.

    Args:
        input_size: size of the source vocabulary.
        embedding_size: dimension of the token embeddings.
        hidden_size: LSTM hidden dimension.
        num_layers: number of stacked LSTM layers.
        p: dropout probability (on the embeddings and between LSTM layers).
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

    def forward(self, x):
        """Encode ``x`` of shape (seq_length, N) and return ``(hidden, cell)``."""
        # (seq_length, N) -> (seq_length, N, embedding_size)
        embedded = self.dropout(self.embedding(x))

        # The per-step outputs are not needed; only the final LSTM state is
        # handed to the decoder.
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell
class Decoder(nn.Module):
    """Single-step LSTM decoder that predicts the next target token.

    Args:
        input_size: size of the target vocabulary (embedding rows).
        embedding_size: dimension of the token embeddings.
        hidden_size: LSTM hidden dimension.
        output_size: number of scores produced per step.
        num_layers: number of stacked LSTM layers.
        p: dropout probability (on the embeddings and between LSTM layers).
    """

    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        super().__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Decode one step.

        ``x`` has shape (N): one token index per batch element. Returns
        ``(predictions, hidden, cell)`` with predictions of shape
        (N, output_size) and the updated LSTM state.
        """
        # Add a leading sequence dimension of length 1: (N) -> (1, N)
        step_input = x.unsqueeze(0)

        # (1, N) -> (1, N, embedding_size)
        embedded = self.dropout(self.embedding(step_input))

        # (1, N, hidden_size) plus the updated state
        step_output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # (1, N, output_size) -> (N, output_size), the shape the loss expects
        predictions = self.fc(step_output).squeeze(0)
        return predictions, hidden, cell
class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: module whose forward returns ``(hidden, cell)`` for a source
            batch.
        decoder: module called as ``decoder(x, hidden, cell)`` returning
            ``(predictions, hidden, cell)``. It must expose its output layer
            as ``decoder.fc`` (an ``nn.Linear``), which is where the target
            vocabulary size is read from.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Return decoder scores for every target position.

        Args:
            source: source token indices, indexed as (seq_length, N).
            target: target token indices, indexed as (target_len, N); row 0
                is fed to the decoder as the first input.
            teacher_force_ratio: probability, per time step, of feeding the
                ground-truth token instead of the decoder's own prediction.

        Returns:
            Tensor of shape (target_len, N, target_vocab_size). Row 0 is
            never written and stays zero.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Read the vocabulary size and the device from the model and its
        # inputs rather than from module-level globals, so the class works
        # outside this script as well.
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Store next output prediction
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability teacher_force_ratio feed the actual next word,
            # otherwise feed the decoder's own prediction. Mixing the two
            # exposes the model during training to inputs similar to the
            # ones it sees at test time, where no ground truth is available.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs
### We're ready to define everything we need for training our Seq2Seq model ###

# Training hyperparameters
num_epochs = 100
learning_rate = 0.001
batch_size = 64

# Model hyperparameters
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024  # Needs to be the same for both RNN's
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0

# sort_key/sort_within_batch order the examples of a batch by source length.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Padding positions must not contribute to the loss.
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only
    # machine. NOTE: torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    load_checkpoint(
        torch.load("my_checkpoint.pth.tar", map_location=device), model, optimizer
    )

sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    save_checkpoint(checkpoint)

    # Translate the example sentence in eval mode (dropout off), then switch
    # back to train mode for the epoch.
    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()

    for batch in train_iterator:
        # Get input and targets and get to cuda
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output has shape (trg_len, batch_size, output_dim), but
        # CrossEntropyLoss expects (N, classes) scores with (N) targets, so
        # flatten the time and batch dimensions together. Position 0 (the
        # start token) is dropped because the model never predicts it.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss.item(), global_step=step)
        step += 1

writer.close()

# The loop above leaves the model in train mode; switch to eval mode so
# dropout does not randomly perturb the translations being scored.
model.eval()
# NOTE(review): the slice starts at index 1, so the first test example is
# skipped — confirm whether [:100] was intended.
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score*100:.2f}")

View File

@@ -0,0 +1,84 @@
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys
# Cache for the German spaCy pipeline so it is loaded at most once per process.
_GERMAN_TOKENIZER_CACHE = {}


def _german_tokenizer():
    """Return the German spaCy pipeline, loading it on first use only."""
    if "de" not in _GERMAN_TOKENIZER_CACHE:
        _GERMAN_TOKENIZER_CACHE["de"] = spacy.load("de")
    return _GERMAN_TOKENIZER_CACHE["de"]


def translate_sentence(model, sentence, german, english, device, max_length=50):
    """Greedily translate one German sentence into a list of English tokens.

    Args:
        model: object exposing ``encoder(sentence_tensor) -> (hidden, cell)``
            and ``decoder(previous_word, hidden, cell) ->
            (output, hidden, cell)``.
        sentence: either a raw string (tokenized with spaCy) or an
            already-tokenized sequence of strings.
        german: source field providing ``init_token``, ``eos_token`` and
            ``vocab.stoi``.
        english: target field providing ``vocab.stoi`` and ``vocab.itos``.
        device: device the input tensors are moved to.
        max_length: maximum number of decoding steps.

    Returns:
        The predicted tokens without the leading ``<sos>``. The last token is
        ``<eos>`` unless decoding stopped because ``max_length`` was reached.
    """
    # Lower-case everything (which is what the vocab contains). The spaCy
    # pipeline is only needed for raw strings, so pre-tokenized input (as
    # passed by bleu) never pays the cost of loading it.
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in _german_tokenizer()(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <SOS> and <EOS> in beginning and end respectively
    tokens = [german.init_token, *tokens, german.eos_token]

    # Go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to a (seq_length, 1) tensor, i.e. a batch of one sentence
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_idx = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]

    with torch.no_grad():
        # Build encoder hidden, cell state
        hidden, cell = model.encoder(sentence_tensor)

        for _ in range(max_length):
            # Feed the most recent prediction back in as the next input
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Model predicts it's the end of the sentence
            if best_guess == eos_idx:
                break

    # Map indices back to words, dropping the start token
    return [english.vocab.itos[idx] for idx in outputs[1:]]
def bleu(data, model, german, english, device):
    """Translate every example in ``data`` and return the corpus BLEU score.

    Args:
        data: iterable of examples whose attributes include ``src`` and
            ``trg``.
        model: translation model passed through to ``translate_sentence``.
        german: source-language field.
        english: target-language field.
        device: device used for inference.

    Returns:
        The value returned by ``bleu_score`` for the predictions, with each
        example's ``trg`` as its single reference.
    """
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)

        # Remove the trailing <eos> token. When decoding stopped at the length
        # limit there is no <eos>, and the last token is a real word that
        # must be kept.
        if prediction and prediction[-1] == "<eos>":
            prediction = prediction[:-1]

        # bleu_score expects a list of references per candidate
        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Print a notice, then write ``state`` to ``filename`` with ``torch.save``.

    Args:
        state: object to serialize (the training scripts pass a dict holding
            the model and optimizer state dicts).
        filename: destination path of the checkpoint file.
    """
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)
def load_checkpoint(checkpoint, model, optimizer):
    """Print a notice, then restore model and optimizer state in place.

    Args:
        checkpoint: mapping with the keys ``"state_dict"`` (model weights)
            and ``"optimizer"`` (optimizer state).
        model: object whose ``load_state_dict`` receives the weights.
        optimizer: object whose ``load_state_dict`` receives the optimizer
            state.
    """
    print("=> Loading checkpoint")
    # Model first, optimizer second.
    for receiver, key in ((model, "state_dict"), (optimizer, "optimizer")):
        receiver.load_state_dict(checkpoint[key])

View File

@@ -0,0 +1,279 @@
import random
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import spacy
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
"""
To install spacy languages do:
python -m spacy download en
python -m spacy download de
"""
# spaCy pipelines are loaded once here and shared by the tokenizers below.
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")


def tokenize_ger(text):
    """Tokenize German ``text`` with spaCy and return the token strings."""
    return [token.text for token in spacy_ger.tokenizer(text)]


def tokenize_eng(text):
    """Tokenize English ``text`` with spaCy and return the token strings."""
    return [token.text for token in spacy_eng.tokenizer(text)]


# Shared field settings: lower-case the text and add <sos>/<eos> markers.
_field_options = {"lower": True, "init_token": "<sos>", "eos_token": "<eos>"}
german = Field(tokenize=tokenize_ger, **_field_options)
english = Field(tokenize=tokenize_eng, **_field_options)

# German (.de) is the source language, English (.en) the target.
train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"), fields=(german, english)
)

# Only the training split contributes to the vocabularies.
for _field in (german, english):
    _field.build_vocab(train_data, max_size=10000, min_freq=2)
class Encoder(nn.Module):
    """Bidirectional LSTM encoder for the attention-based Seq2Seq model.

    Args:
        input_size: size of the source vocabulary.
        embedding_size: dimension of the token embeddings.
        hidden_size: LSTM hidden dimension per direction.
        num_layers: number of LSTM layers. ``forward`` combines only state
            entries 0 and 1, i.e. the two directions of a single layer.
        p: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, bidirectional=True)

        # Project the concatenated forward/backward state down to
        # hidden_size so it can initialize the unidirectional decoder.
        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
        self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(p)

    def forward(self, x):
        """Encode ``x`` of shape (seq_length, N).

        Returns ``(encoder_states, hidden, cell)``: encoder_states has shape
        (seq_length, N, hidden_size * 2); hidden and cell have shape
        (1, N, hidden_size).
        """
        # (seq_length, N) -> (seq_length, N, embedding_size)
        embedded = self.dropout(self.embedding(x))

        encoder_states, (hidden, cell) = self.rnn(embedded)

        # Slicing with [i:i+1] instead of [i] keeps the leading dimension, so
        # the concatenation below has shape (1, N, hidden_size * 2).
        hidden_fwd, hidden_bwd = hidden[0:1], hidden[1:2]
        cell_fwd, cell_bwd = cell[0:1], cell[1:2]

        hidden = self.fc_hidden(torch.cat((hidden_fwd, hidden_bwd), dim=2))
        cell = self.fc_cell(torch.cat((cell_fwd, cell_bwd), dim=2))

        return encoder_states, hidden, cell
class Decoder(nn.Module):
    """Single-step LSTM decoder with additive attention over encoder states.

    Args:
        input_size: size of the target vocabulary (embedding rows).
        embedding_size: dimension of the token embeddings.
        hidden_size: LSTM hidden dimension; encoder states are expected to
            have ``hidden_size * 2`` features.
        output_size: number of scores produced per step.
        num_layers: number of LSTM layers.
        p: dropout probability applied to the embeddings.
    """

    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        # The LSTM input is the context vector concatenated with the embedding
        self.rnn = nn.LSTM(hidden_size * 2 + embedding_size, hidden_size, num_layers)

        # Scores one (decoder hidden, encoder state) pair with a single scalar
        self.energy = nn.Linear(hidden_size * 3, 1)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p)
        # dim=0 normalizes the scores over the source sequence positions
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()

    def forward(self, x, encoder_states, hidden, cell):
        """Decode one step.

        ``x`` has shape (N); ``encoder_states`` has shape
        (seq_length, N, hidden_size * 2). Returns
        ``(predictions, hidden, cell)`` with predictions of shape
        (N, output_size).
        """
        # (N) -> (1, N) -> (1, N, embedding_size)
        embedded = self.dropout(self.embedding(x.unsqueeze(0)))

        # Repeat the decoder hidden state along the sequence dimension so it
        # can be paired with every encoder position:
        # (seq_length, N, hidden_size)
        sequence_length = encoder_states.shape[0]
        hidden_per_position = hidden.repeat(sequence_length, 1, 1)

        # (seq_length, N, hidden_size * 3) -> (seq_length, N, 1)
        pairs = torch.cat((hidden_per_position, encoder_states), dim=2)
        scores = self.relu(self.energy(pairs))

        # Attention weights over the source positions: (seq_length, N, 1)
        attention = self.softmax(scores)

        # Weighted sum of encoder states over the sequence dimension:
        #   attention (s, n, k=1) and encoder_states (s, n, l)
        #   -> context_vector (k=1, n, l) = (1, N, hidden_size * 2)
        context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)

        # (1, N, hidden_size * 2 + embedding_size)
        rnn_input = torch.cat((context_vector, embedded), dim=2)

        # (1, N, hidden_size) plus the updated state
        step_output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))

        # (1, N, output_size) -> (N, output_size)
        predictions = self.fc(step_output).squeeze(0)
        return predictions, hidden, cell
class Seq2Seq(nn.Module):
    """Attention encoder/decoder wrapper that decodes a target step by step.

    Args:
        encoder: module whose forward returns
            ``(encoder_states, hidden, cell)`` for a source batch.
        decoder: module called as
            ``decoder(x, encoder_states, hidden, cell)`` returning
            ``(predictions, hidden, cell)``. It must expose its output layer
            as ``decoder.fc`` (an ``nn.Linear``), which is where the target
            vocabulary size is read from.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Return decoder scores for every target position.

        Args:
            source: source token indices, indexed as (seq_length, N).
            target: target token indices, indexed as (target_len, N); row 0
                is fed to the decoder as the first input.
            teacher_force_ratio: probability, per time step, of feeding the
                ground-truth token instead of the decoder's own prediction.

        Returns:
            Tensor of shape (target_len, N, target_vocab_size). Row 0 is
            never written and stays zero.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Read the vocabulary size and the device from the model and its
        # inputs rather than from module-level globals, so the class works
        # outside this script as well.
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )
        encoder_states, hidden, cell = self.encoder(source)

        # First input will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # At every time step use encoder_states and update hidden, cell
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)

            # Store prediction for current time step
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability teacher_force_ratio feed the actual next word,
            # otherwise feed the decoder's own prediction. Mixing the two
            # exposes the model during training to inputs similar to the
            # ones it sees at test time, where no ground truth is available.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs
### We're ready to define everything we need for training our Seq2Seq model ###
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = False
save_model = True

# Training hyperparameters
num_epochs = 100
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 1
enc_dropout = 0.0
dec_dropout = 0.0

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0

# sort_key/sort_within_batch order the examples of a batch by source length.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Padding positions must not contribute to the loss.
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only
    # machine. NOTE: torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    load_checkpoint(
        torch.load("my_checkpoint.pth.tar", map_location=device), model, optimizer
    )

# Adjacent string literals are joined with no separator, so the first piece
# must end with a space; otherwise the words fuse into "großenpferdegespann".
sentence = (
    "ein boot mit mehreren männern darauf wird von einem großen "
    "pferdegespann ans ufer gezogen."
)

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint)

    # Translate the example sentence in eval mode, then switch back to train
    # mode for the epoch.
    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()

    for batch in train_iterator:
        # Get input and targets and get to cuda
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output has shape (trg_len, batch_size, output_dim), but
        # CrossEntropyLoss expects (N, classes) scores with (N) targets, so
        # flatten the time and batch dimensions together. Position 0 (the
        # start token) is dropped because the model never predicts it.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss.item(), global_step=step)
        step += 1

writer.close()

# The loop above leaves the model in train mode; evaluate in eval mode so the
# score does not depend on training-only behavior such as dropout.
model.eval()
# running on entire test data takes a while
# NOTE(review): the slice starts at index 1, so the first test example is
# skipped — confirm whether [:100] was intended.
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score * 100:.2f}")

View File

@@ -0,0 +1,79 @@
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys
# Cache for the German spaCy pipeline so it is loaded at most once per process.
_GERMAN_TOKENIZER_CACHE = {}


def _german_tokenizer():
    """Return the German spaCy pipeline, loading it on first use only."""
    if "de" not in _GERMAN_TOKENIZER_CACHE:
        _GERMAN_TOKENIZER_CACHE["de"] = spacy.load("de")
    return _GERMAN_TOKENIZER_CACHE["de"]


def translate_sentence(model, sentence, german, english, device, max_length=50):
    """Greedily translate one German sentence with the attention model.

    Args:
        model: object exposing
            ``encoder(sentence_tensor) -> (encoder_states, hidden, cell)`` and
            ``decoder(previous_word, encoder_states, hidden, cell) ->
            (output, hidden, cell)``.
        sentence: either a raw string (tokenized with spaCy) or an
            already-tokenized sequence of strings.
        german: source field providing ``init_token``, ``eos_token`` and
            ``vocab.stoi``.
        english: target field providing ``vocab.stoi`` and ``vocab.itos``.
        device: device the input tensors are moved to.
        max_length: maximum number of decoding steps.

    Returns:
        The predicted tokens without the leading ``<sos>``. The last token is
        ``<eos>`` unless decoding stopped because ``max_length`` was reached.
    """
    # Lower-case everything (which is what the vocab contains). The spaCy
    # pipeline is only needed for raw strings, so pre-tokenized input (as
    # passed by bleu) never pays the cost of loading it.
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in _german_tokenizer()(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <SOS> and <EOS> in beginning and end respectively
    tokens = [german.init_token, *tokens, german.eos_token]

    # Go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to a (seq_length, 1) tensor, i.e. a batch of one sentence
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_idx = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]

    with torch.no_grad():
        # Encode once; the encoder states are reused by attention at every
        # decoding step.
        outputs_encoder, hiddens, cells = model.encoder(sentence_tensor)

        for _ in range(max_length):
            # Feed the most recent prediction back in as the next input
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hiddens, cells = model.decoder(
                previous_word, outputs_encoder, hiddens, cells
            )
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Model predicts it's the end of the sentence
            if best_guess == eos_idx:
                break

    # Map indices back to words, dropping the start token
    return [english.vocab.itos[idx] for idx in outputs[1:]]
def bleu(data, model, german, english, device):
    """Translate every example in ``data`` and return the corpus BLEU score.

    Args:
        data: iterable of examples whose attributes include ``src`` and
            ``trg``.
        model: translation model passed through to ``translate_sentence``.
        german: source-language field.
        english: target-language field.
        device: device used for inference.

    Returns:
        The value returned by ``bleu_score`` for the predictions, with each
        example's ``trg`` as its single reference.
    """
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)

        # Remove the trailing <eos> token. When decoding stopped at the length
        # limit there is no <eos>, and the last token is a real word that
        # must be kept.
        if prediction and prediction[-1] == "<eos>":
            prediction = prediction[:-1]

        # bleu_score expects a list of references per candidate
        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Print a notice, then write ``state`` to ``filename`` with ``torch.save``.

    Args:
        state: object to serialize (the training script passes a dict holding
            the model and optimizer state dicts).
        filename: destination path of the checkpoint file.
    """
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)
def load_checkpoint(checkpoint, model, optimizer):
    """Print a notice, then restore model and optimizer state in place.

    Args:
        checkpoint: mapping with the keys ``"state_dict"`` (model weights)
            and ``"optimizer"`` (optimizer state).
        model: object whose ``load_state_dict`` receives the weights.
        optimizer: object whose ``load_state_dict`` receives the optimizer
            state.
    """
    print("=> Loading checkpoint")
    # Model first, optimizer second.
    for receiver, key in ((model, "state_dict"), (optimizer, "optimizer")):
        receiver.load_state_dict(checkpoint[key])

View File

@@ -0,0 +1,12 @@
### Image Captioning
Download the dataset used: https://www.kaggle.com/dataset/e1cd22253a9b23b073794872bf565648ddbe4f17e7fa9e74766ad3707141adeb
Then place the images folder and captions.txt inside a folder named flickr8k (the scripts read from flickr8k/images and flickr8k/captions.txt).
train.py: For training the network
model.py: creating the encoderCNN, decoderRNN and hooking them together
get_loader.py: Loading the data, creating vocabulary
utils.py: Load model, save model, printing few test cases downloaded online

View File

@@ -0,0 +1,142 @@
import os # when loading file paths
import pandas as pd # for lookup in annotation file
import spacy # for tokenizer
import torch
from torch.nn.utils.rnn import pad_sequence # pad batch
from torch.utils.data import DataLoader, Dataset
from PIL import Image # Load img
import torchvision.transforms as transforms
# We want to convert text -> numerical values
# 1. We need a Vocabulary mapping each word to a index
# 2. We need to setup a Pytorch dataset to load the data
# 3. Setup padding of every batch (all examples should be
# of same seq_len and setup dataloader)
# Note that loading the image is very easy compared to the text!
# Download with: python -m spacy download en
# The pipeline is loaded once at import time and reused by
# Vocabulary.tokenizer_eng.
# NOTE(review): the "en" shortcut name presumably only resolves on older
# spaCy releases; newer ones expect a full package name such as
# "en_core_web_sm" — confirm against the installed version.
spacy_eng = spacy.load("en")
class Vocabulary:
    """Bidirectional word <-> index mapping built from caption text.

    Indices 0-3 are reserved for the special tokens <PAD>, <SOS>, <EOS> and
    <UNK>. A regular word is added once it has been seen ``freq_threshold``
    times.
    """

    def __init__(self, freq_threshold):
        special_tokens = ("<PAD>", "<SOS>", "<EOS>", "<UNK>")
        self.itos = dict(enumerate(special_tokens))
        self.stoi = {token: index for index, token in self.itos.items()}
        self.freq_threshold = freq_threshold

    def __len__(self):
        """Return the number of entries (special tokens included)."""
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """Return the lower-cased spaCy tokens of ``text``."""
        return [token.text.lower() for token in spacy_eng.tokenizer(text)]

    def build_vocabulary(self, sentence_list):
        """Count the words of ``sentence_list`` and register the frequent ones."""
        counts = {}
        next_index = 4  # 0-3 are taken by the special tokens

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                counts[word] = counts.get(word, 0) + 1

                # Register the word exactly once: at the moment its count
                # reaches the threshold.
                if counts[word] != self.freq_threshold:
                    continue
                self.stoi[word] = next_index
                self.itos[next_index] = word
                next_index += 1

    def numericalize(self, text):
        """Convert ``text`` to indices, mapping unknown tokens to <UNK>."""
        indices = []
        for token in self.tokenizer_eng(text):
            key = token if token in self.stoi else "<UNK>"
            indices.append(self.stoi[key])
        return indices
class FlickrDataset(Dataset):
    """Image/caption dataset backed by a CSV with "image" and "caption" columns.

    Args:
        root_dir: directory that the image file names are joined onto.
        captions_file: path of the CSV annotation file.
        transform: optional callable applied to each loaded PIL image.
        freq_threshold: minimum word count for a word to enter the vocabulary.
    """

    def __init__(self, root_dir, captions_file, transform=None, freq_threshold=5):
        self.root_dir = root_dir
        self.df = pd.read_csv(captions_file)
        self.transform = transform

        # Get img, caption columns
        self.imgs = self.df["image"]
        self.captions = self.df["caption"]

        # The vocabulary is built from every caption in the file
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.captions.tolist())

    def __len__(self):
        """Return the number of rows in the annotation file."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(image, caption_tensor)`` for row ``index``.

        The caption is numericalized and wrapped in <SOS> ... <EOS>.
        """
        caption = self.captions[index]
        image_path = os.path.join(self.root_dir, self.imgs[index])
        img = Image.open(image_path).convert("RGB")

        if self.transform is not None:
            img = self.transform(img)

        stoi = self.vocab.stoi
        token_ids = [stoi["<SOS>"]]
        token_ids.extend(self.vocab.numericalize(caption))
        token_ids.append(stoi["<EOS>"])

        return img, torch.tensor(token_ids)
class MyCollate:
    """Collate function that batches images and pads captions to equal length.

    Args:
        pad_idx: value used to fill the shorter captions.
    """

    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        """Turn a list of ``(image, caption)`` pairs into two batched tensors.

        Returns ``(imgs, targets)``: the images stacked along a new leading
        batch dimension, and the captions padded to the longest one with the
        sequence dimension first (``batch_first=False``).
        """
        images, captions = [], []
        for image, caption in batch:
            images.append(image)
            captions.append(caption)

        imgs = torch.stack(images, dim=0)
        targets = pad_sequence(
            captions, batch_first=False, padding_value=self.pad_idx
        )
        return imgs, targets
def get_loader(
    root_folder,
    annotation_file,
    transform,
    batch_size=32,
    num_workers=8,
    shuffle=True,
    pin_memory=True,
):
    """Build the Flickr dataset and a DataLoader that pads each batch.

    Args:
        root_folder: directory containing the images.
        annotation_file: CSV file with the image names and captions.
        transform: callable applied to every loaded image.
        batch_size: number of examples per batch.
        num_workers: worker count handed to the DataLoader.
        shuffle: whether the DataLoader shuffles the examples.
        pin_memory: forwarded to the DataLoader.

    Returns:
        ``(loader, dataset)`` so callers can also reach ``dataset.vocab``.
    """
    dataset = FlickrDataset(root_folder, annotation_file, transform=transform)

    # Captions have different lengths, so each batch is padded with <PAD>.
    collate = MyCollate(pad_idx=dataset.vocab.stoi["<PAD>"])

    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=collate,
    )
    return loader, dataset
if __name__ == "__main__":
    # Smoke test: iterate the loader once and print the shape of every batch.
    transform = transforms.Compose(
        [
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ]
    )

    loader, dataset = get_loader(
        "flickr8k/images/", "flickr8k/captions.txt", transform=transform
    )

    for imgs, captions in loader:
        print(imgs.shape)
        print(captions.shape)

View File

@@ -0,0 +1,66 @@
import torch
import torch.nn as nn
import statistics
import torchvision.models as models
class EncoderCNN(nn.Module):
    """Image encoder: pretrained Inception v3 with a new final linear layer.

    Args:
        embed_size: output dimension of the replaced ``fc`` layer.
        train_CNN: stored on the instance as ``self.train_CNN``; it is not
            used anywhere else in this class.
    """

    def __init__(self, embed_size, train_CNN=False):
        super().__init__()
        self.train_CNN = train_CNN

        # Swap the classification head for a projection to embed_size
        backbone = models.inception_v3(pretrained=True, aux_logits=False)
        backbone.fc = nn.Linear(backbone.fc.in_features, embed_size)
        self.inception = backbone

        self.relu = nn.ReLU()
        self.times = []
        self.dropout = nn.Dropout(0.5)

    def forward(self, images):
        """Return dropout(relu(inception(images)))."""
        features = self.inception(images)
        activated = self.relu(features)
        return self.dropout(activated)
class DecoderRNN(nn.Module):
    """LSTM caption decoder that takes the image features as its first input.

    Args:
        embed_size: dimension of word embeddings (and of the image features).
        hidden_size: LSTM hidden dimension.
        vocab_size: number of words in the vocabulary.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, features, captions):
        """Score the vocabulary at every position of the caption sequence.

        ``features`` gets a leading dimension added and is concatenated in
        front of the embedded ``captions`` along dim 0, so the output has one
        more step than ``captions``.
        """
        word_vectors = self.dropout(self.embed(captions))

        # The image features act as the very first "word" of the sequence
        lstm_input = torch.cat((features.unsqueeze(0), word_vectors), dim=0)

        hiddens, _ = self.lstm(lstm_input)
        return self.linear(hiddens)
class CNNtoRNN(nn.Module):
    """Image captioning model: EncoderCNN features fed into a DecoderRNN.

    Args:
        embed_size: dimension shared by image features and word embeddings.
        hidden_size: LSTM hidden dimension of the decoder.
        vocab_size: number of words in the vocabulary.
        num_layers: number of LSTM layers in the decoder.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super().__init__()
        self.encoderCNN = EncoderCNN(embed_size)
        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

    def forward(self, images, captions):
        """Return the decoder outputs for ``images`` and their ``captions``."""
        return self.decoderRNN(self.encoderCNN(images), captions)

    def caption_image(self, image, vocabulary, max_length=50):
        """Greedily generate a caption for a single image.

        Args:
            image: image batch containing exactly one image (``.item()`` is
                called on the prediction).
            vocabulary: object whose ``itos`` maps indices to words.
            max_length: maximum number of generated tokens.

        Returns:
            List of predicted words; generation stops after "<EOS>", which is
            included in the result.
        """
        token_ids = []

        with torch.no_grad():
            # The image features are the first input to the LSTM
            step_input = self.encoderCNN(image).unsqueeze(0)
            lstm_state = None

            for _ in range(max_length):
                hiddens, lstm_state = self.decoderRNN.lstm(step_input, lstm_state)
                scores = self.decoderRNN.linear(hiddens.squeeze(0))
                predicted = scores.argmax(1)

                token_id = predicted.item()
                token_ids.append(token_id)

                # The predicted word becomes the next input
                step_input = self.decoderRNN.embed(predicted).unsqueeze(0)

                if vocabulary.itos[token_id] == "<EOS>":
                    break

        return [vocabulary.itos[token_id] for token_id in token_ids]

Binary file not shown.

After

Width:  |  Height:  |  Size: 369 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 866 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 92 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 133 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 641 KiB

View File

@@ -0,0 +1,96 @@
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
from utils import save_checkpoint, load_checkpoint, print_examples
from get_loader import get_loader
from model import CNNtoRNN
def train():
    """Train the CNNtoRNN captioning model on the Flickr8k data loader.

    Builds the image transform and loader, creates the model, loss and Adam
    optimizer, optionally restores a checkpoint, then runs ``num_epochs``
    epochs while logging the per-batch loss to TensorBoard. All settings are
    local constants; the function takes no arguments and returns None.
    """
    transform = transforms.Compose(
        [
            transforms.Resize((356, 356)),
            transforms.RandomCrop((299, 299)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    )

    train_loader, dataset = get_loader(
        root_folder="flickr8k/images",
        annotation_file="flickr8k/captions.txt",
        transform=transform,
        num_workers=2,
    )

    torch.backends.cudnn.benchmark = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    load_model = False
    save_model = False
    train_CNN = False

    # Hyperparameters
    embed_size = 256
    hidden_size = 256
    vocab_size = len(dataset.vocab)
    num_layers = 1
    learning_rate = 3e-4
    num_epochs = 100

    # for tensorboard
    writer = SummaryWriter("runs/flickr")
    step = 0

    # initialize model, loss etc
    model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi["<PAD>"])
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # The Inception `fc` head is always trained; every other CNN parameter is
    # trained only when train_CNN is True.
    for name, param in model.encoderCNN.inception.named_parameters():
        if "fc.weight" in name or "fc.bias" in name:
            param.requires_grad = True
        else:
            param.requires_grad = train_CNN

    if load_model:
        step = load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

    model.train()

    for epoch in range(num_epochs):
        # Uncomment the line below to see a couple of test cases
        # print_examples(model, device, dataset)

        if save_model:
            checkpoint = {
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "step": step,
            }
            save_checkpoint(checkpoint)

        for idx, (imgs, captions) in tqdm(
            enumerate(train_loader), total=len(train_loader), leave=False
        ):
            imgs = imgs.to(device)
            captions = captions.to(device)

            # The last caption token is never fed in; the decoder prepends the
            # image feature, so the output length matches `captions` again.
            # NOTE(review): assumes captions are (seq_len, batch) - confirm
            # against get_loader's collate function.
            outputs = model(imgs, captions[:-1])
            loss = criterion(
                outputs.reshape(-1, outputs.shape[2]), captions.reshape(-1)
            )

            writer.add_scalar("Training loss", loss.item(), global_step=step)
            step += 1

            optimizer.zero_grad()
            # A scalar loss needs no argument here. Passing the loss itself as
            # `gradient` multiplied every gradient by the loss value.
            loss.backward()
            optimizer.step()


if __name__ == "__main__":
    train()

View File

@@ -0,0 +1,69 @@
import torch
import torchvision.transforms as transforms
from PIL import Image
def print_examples(model, device, dataset):
    """Print the expected and generated captions for the bundled test images.

    The model is put in eval mode while captioning and is always switched
    back to train mode afterwards, even if an image fails to load.

    Args:
        model: object exposing ``eval()``, ``train()`` and
            ``caption_image(image, vocabulary)``.
        device: device the image tensors are moved to before captioning.
        dataset: object whose ``vocab`` attribute is passed to ``caption_image``.
    """
    transform = transforms.Compose(
        [
            transforms.Resize((299, 299)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    )

    # (image path, reference caption), in the order they are printed
    examples = (
        ("test_examples/dog.jpg", "Dog on a beach by the ocean"),
        ("test_examples/child.jpg", "Child holding red frisbee outdoors"),
        ("test_examples/bus.png", "Bus driving by parked cars"),
        ("test_examples/boat.png", "A small boat in the ocean"),
        ("test_examples/horse.png", "A cowboy riding a horse in the desert"),
    )

    model.eval()
    try:
        for number, (path, reference) in enumerate(examples, start=1):
            with Image.open(path) as raw_image:
                image = transform(raw_image.convert("RGB")).unsqueeze(0)
            print(f"Example {number} CORRECT: {reference}")
            print(
                f"Example {number} OUTPUT: "
                + " ".join(model.caption_image(image.to(device), dataset.vocab))
            )
    finally:
        model.train()
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Announce the save on stdout, then write *state* to *filename* with ``torch.save``."""
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)
def load_checkpoint(checkpoint, model, optimizer):
    """Restore *model* and *optimizer* from *checkpoint* and return its saved step.

    *checkpoint* is a mapping with the keys "state_dict", "optimizer" and "step".
    """
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    return checkpoint["step"]

Binary file not shown.

After

Width:  |  Height:  |  Size: 121 KiB

View File

@@ -0,0 +1,112 @@
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.utils import save_image
class VGG(nn.Module):
    """Truncated pretrained VGG19 that returns the activations of chosen layers."""

    def __init__(self):
        super().__init__()

        # Module indices inside vgg19.features whose outputs are collected.
        # They correspond to conv1_1, conv2_1, conv3_1, conv4_1 and conv5_1
        # from the NST paper: the first number increases after each maxpool,
        # the second counts conv layers between two maxpools.
        self.chosen_features = ["0", "5", "10", "19", "28"]

        # Nothing after conv5_1 (module 28) is needed: the network's own
        # output is irrelevant, only the generated image (the input) is
        # optimized.
        self.model = models.vgg19(pretrained=True).features[:29]

    def forward(self, x):
        """Run *x* through the network and return the chosen activations as a list."""
        collected = []

        for index, module in enumerate(self.model):
            x = module(x)
            if str(index) in self.chosen_features:
                collected.append(x)

        return collected
def load_image(image_name):
    """Load an image file as a batched tensor on the module-level ``device``.

    The image is converted to RGB first, so files with an alpha channel or in
    grayscale still produce the three channels VGG expects. It is then passed
    through the module-level ``loader`` transform and given a leading batch
    dimension of 1.
    """
    image = Image.open(image_name).convert("RGB")
    image = loader(image).unsqueeze(0)
    return image.to(device)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
imsize = 356

# Here we may want to use the Normalization constants used in the original
# VGG network (to get similar values net was originally trained on), but
# I found it didn't matter too much so I didn't end up using it. If you
# use it make sure to normalize back so the images don't look weird.
loader = transforms.Compose(
    [
        transforms.Resize((imsize, imsize)),
        transforms.ToTensor(),
        # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

original_img = load_image("annahathaway.png")
style_img = load_image("style.jpg")

# Initialize generated as white noise or as a clone of the original image.
# Clone seemed to work better for me.
# generated = torch.randn(original_img.data.shape, device=device, requires_grad=True)
generated = original_img.clone().requires_grad_(True)
model = VGG().to(device).eval()

# Hyperparameters
total_steps = 6000
learning_rate = 0.001
alpha = 1
beta = 0.01
optimizer = optim.Adam([generated], lr=learning_rate)

# The content and style images never change and the model is in eval mode,
# so their features are computed once, outside the loop and without autograd,
# instead of being recomputed on every optimization step.
with torch.no_grad():
    original_img_features = model(original_img)
    style_features = model(style_img)

for step in range(total_steps):
    # Convolution features of the image being optimized, at the chosen layers
    generated_features = model(generated)

    # Loss is 0 initially
    style_loss = original_loss = 0

    # iterate through all the features for the chosen layers
    for gen_feature, orig_feature, style_feature in zip(
        generated_features, original_img_features, style_features
    ):
        # batch_size will just be 1
        batch_size, channel, height, width = gen_feature.shape
        original_loss += torch.mean((gen_feature - orig_feature) ** 2)

        # Compute Gram Matrix of generated
        G = gen_feature.view(channel, height * width).mm(
            gen_feature.view(channel, height * width).t()
        )

        # Compute Gram Matrix of Style
        A = style_feature.view(channel, height * width).mm(
            style_feature.view(channel, height * width).t()
        )
        style_loss += torch.mean((G - A) ** 2)

    total_loss = alpha * original_loss + beta * style_loss
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    if step % 200 == 0:
        print(total_loss)
        save_image(generated, "generated.png")

Binary file not shown.

After

Width:  |  Height:  |  Size: 310 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 282 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 294 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 284 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 293 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 270 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 294 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 308 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 215 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 215 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 112 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 280 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 807 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 86 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 120 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 549 KiB

View File

@@ -0,0 +1,255 @@
"""
Seq2Seq using Transformers on the Multi30k
dataset. In this video I utilize Pytorch
inbuilt Transformer modules, and have a
separate implementation for Transformers
from scratch. Training this model for a
while (not too long) gives a BLEU score
of ~35, and I think training for longer
would give even better results.
"""
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
from torch.utils.tensorboard import SummaryWriter
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
"""
To install spacy languages do:
python -m spacy download en
python -m spacy download de
"""
# spaCy pipelines; only their tokenizers are used below.
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")


def tokenize_ger(text):
    """Return the spaCy token strings of German *text*."""
    return [token.text for token in spacy_ger.tokenizer(text)]


def tokenize_eng(text):
    """Return the spaCy token strings of English *text*."""
    return [token.text for token in spacy_eng.tokenizer(text)]


# Both languages are lower-cased and wrapped in <sos> ... <eos>.
german = Field(
    tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>"
)
english = Field(
    tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>"
)

# German is the source (.de), English the target (.en).
train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"), fields=(german, english)
)

# Vocabularies are built from the training split only.
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)
class Transformer(nn.Module):
    """Seq2seq model: learned word + position embeddings around ``nn.Transformer``.

    Inputs are sequence-first: ``src`` is (src_len, N) and ``trg`` is
    (trg_len, N), as unpacked in ``forward``.
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        super().__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        # NOTE(review): `forward_expansion` is used directly as
        # nn.Transformer's `dim_feedforward` (the 5th positional argument),
        # not as a multiplier of `embedding_size`. Kept as-is because changing
        # it would alter parameter shapes and break existing checkpoints.
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=forward_expansion,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        """Return a boolean (N, src_len) mask that is True at padding positions."""
        padding_mask = src.transpose(0, 1) == self.src_pad_idx
        return padding_mask.to(self.device)

    def _position_ids(self, seq_length, batch_size):
        """Return a (seq_length, batch_size) tensor whose row i is filled with i."""
        return (
            torch.arange(0, seq_length)
            .unsqueeze(1)
            .expand(seq_length, batch_size)
            .to(self.device)
        )

    def forward(self, src, trg):
        """Return target-vocabulary logits of shape (trg_len, N, trg_vocab_size)."""
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        src_positions = self._position_ids(src_seq_length, N)
        trg_positions = self._position_ids(trg_seq_length, N)

        embed_src = self.dropout(
            self.src_word_embedding(src) + self.src_position_embedding(src_positions)
        )
        embed_trg = self.dropout(
            self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions)
        )

        src_padding_mask = self.make_src_mask(src)
        # Causal mask: each target position only attends to earlier ones.
        trg_mask = self.transformer.generate_square_subsequent_mask(
            trg_seq_length
        ).to(self.device)

        decoded = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        return self.fc_out(decoded)
# We're ready to define everything we need for training our Seq2Seq model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: with load_model = True, "my_checkpoint.pth.tar" is loaded below before
# training starts.
load_model = True
save_model = True

# Training hyperparameters
num_epochs = 10000
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
src_vocab_size = len(german.vocab)
trg_vocab_size = len(english.vocab)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 100
forward_expansion = 4
# The source language is German, so the source padding index must come from
# the German vocabulary (it was previously read from the English one).
src_pad_idx = german.vocab.stoi["<pad>"]

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

# Padding positions in the (English) target are ignored by the loss.
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

# Example sentence translated at the start of every epoch to watch progress
sentence = "ein pferd geht unter einer brücke neben einem boot."

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint)

    model.eval()
    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")
    model.train()
    losses = []

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward prop: the decoder never sees the last target token
        output = model(inp_data, target[:-1, :])

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send into
        # our cost function, so we need to do some reshaping.
        # Let's also remove the start token while we're at it
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)
        losses.append(loss.item())

        # Back prop
        loss.backward()
        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # plot to tensorboard
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1

    # The plateau scheduler is driven by the mean training loss of the epoch
    mean_loss = sum(losses) / len(losses)
    scheduler.step(mean_loss)

# running on entire test data takes a while
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score * 100:.2f}")

View File

@@ -0,0 +1,70 @@
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys
def translate_sentence(model, sentence, german, english, device, max_length=50):
    """Greedily translate one German sentence with *model*.

    Args:
        model: callable ``model(src, trg)`` returning logits indexed as
            (trg_len, batch, vocab); called with a batch of one.
        sentence: a raw string (tokenized with spaCy) or an iterable of
            token strings. The input is not modified.
        german: source field providing ``init_token``, ``eos_token`` and
            ``vocab.stoi``.
        english: target field providing ``vocab.stoi`` and ``vocab.itos``.
        device: device the index tensors are moved to.
        max_length: maximum number of tokens to generate.

    Returns:
        The generated target tokens as strings, without the start token.
        The last element is "<eos>" unless ``max_length`` was reached first.
    """
    if isinstance(sentence, str):
        # The German spaCy model is only needed for raw strings; loading it
        # for pre-tokenized input (as `bleu` passes) was wasted work per call.
        spacy_ger = spacy.load("de")
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <SOS> and <EOS> in beginning and end respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to Tensor of shape (src_len, 1)
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_index = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        # Most likely token at the last generated position
        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == eos_index:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # remove start token
    return translated_sentence[1:]
def bleu(data, model, german, english, device):
    """Return the corpus BLEU score of *model*'s translations of *data*.

    Each example's ``src`` tokens are translated with ``translate_sentence``
    and compared against its single reference ``trg``.
    """
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)
        # Strip the trailing <eos> only when it is actually there. When
        # decoding stops at max_length the last token is a real word and must
        # be kept.
        if prediction and prediction[-1] == "<eos>":
            prediction = prediction[:-1]

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Announce the save on stdout, then write *state* to *filename* with ``torch.save``."""
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)
def load_checkpoint(checkpoint, model, optimizer):
    """Restore *model* and *optimizer* in place from *checkpoint*.

    *checkpoint* is a mapping with the keys "state_dict" and "optimizer".
    Nothing is returned.
    """
    print("=> Loading checkpoint")
    model_state = checkpoint["state_dict"]
    model.load_state_dict(model_state)
    optimizer_state = checkpoint["optimizer"]
    optimizer.load_state_dict(optimizer_state)

View File

@@ -0,0 +1,4 @@
name,quote,score
Jocko,You must own everything in your world. There is no one else to blame.,1
Bruce Lee,"Do not pray for an easy life, pray for the strength to endure a difficult one.",1
Potato guy,"Stand tall, and rice like a potato!",0
1 name quote score
2 Jocko You must own everything in your world. There is no one else to blame. 1
3 Bruce Lee Do not pray for an easy life, pray for the strength to endure a difficult one. 1
4 Potato guy Stand tall, and rice like a potato! 0

View File

@@ -0,0 +1,3 @@
{"name": "Jocko", "quote": "You must own everything in your world. There is no one else to blame.", "score":1}
{"name": "Bruce", "quote": "Do not pray for an easy life, pray for the strength to endure a difficult one.", "score":1}
{"name": "Random Potato", "quote": "Stand tall, and rice like a potato!", "score":0}

View File

@@ -0,0 +1,4 @@
name quote score
Jocko You must own everything in your world. There is no one else to blame. 1
Bruce Lee Do not pray for an easy life, pray for the strength to endure a difficult one. 1
Potato guy Stand tall, and rice like a potato! 0
1 name quote score
2 Jocko You must own everything in your world. There is no one else to blame. 1
3 Bruce Lee Do not pray for an easy life, pray for the strength to endure a difficult one. 1
4 Potato guy Stand tall, and rice like a potato! 0

View File

@@ -0,0 +1,4 @@
name,quote,score
Jocko,You must own everything in your world. There is no one else to blame.,1
Bruce Lee,"Do not pray for an easy life, pray for the strength to endure a difficult one.",1
Potato guy,"Stand tall, and rice like a potato!",0
1 name quote score
2 Jocko You must own everything in your world. There is no one else to blame. 1
3 Bruce Lee Do not pray for an easy life, pray for the strength to endure a difficult one. 1
4 Potato guy Stand tall, and rice like a potato! 0

View File

@@ -0,0 +1,3 @@
{"name": "Jocko", "quote": "You must own everything in your world. There is no one else to blame.", "score":1}
{"name": "Bruce", "quote": "Do not pray for an easy life, pray for the strength to endure a difficult one.", "score":1}
{"name": "Random Potato", "quote": "Stand tall, and rice like a potato!", "score":0}

View File

@@ -0,0 +1,4 @@
name quote score
Jocko You must own everything in your world. There is no one else to blame. 1
Bruce Lee Do not pray for an easy life, pray for the strength to endure a difficult one. 1
Potato guy Stand tall, and rice like a potato! 0
1 name quote score
2 Jocko You must own everything in your world. There is no one else to blame. 1
3 Bruce Lee Do not pray for an easy life, pray for the strength to endure a difficult one. 1
4 Potato guy Stand tall, and rice like a potato! 0

View File

@@ -0,0 +1,111 @@
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
from torchtext.data import Field, TabularDataset, BucketIterator
######### Loading from JSON/CSV/TSV files #########
# STEPS:
# 1. Specify how preprocessing should be done -> Fields
# 2. Use Dataset to load the data -> TabularDataset (JSON/CSV/TSV Files)
# 3. Construct an iterator to do batching & padding -> BucketIterator
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# python -m spacy download en
spacy_en = spacy.load("en")


def tokenize(text):
    """Return the spaCy token strings of *text*."""
    return [token.text for token in spacy_en.tokenizer(text)]


# Step 1: Fields describe how each column is preprocessed.
quote = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True)
score = Field(sequential=False, use_vocab=False)

# Maps each column in the file to (attribute name on the batch, Field).
fields = {"quote": ("q", quote), "score": ("s", score)}

# Step 2: load the data. The JSON variant is active; the CSV and TSV variants
# are kept below for reference.
train_data, test_data = TabularDataset.splits(
    path="mydata", train="train.json", test="test.json", format="json", fields=fields
)

# train_data, test_data = TabularDataset.splits(
#     path='mydata',
#     train='train.csv',
#     test='test.csv',
#     format='csv',
#     fields=fields)

# train_data, test_data = TabularDataset.splits(
#     path='mydata',
#     train='train.tsv',
#     test='test.tsv',
#     format='tsv',
#     fields=fields)

quote.build_vocab(train_data, max_size=10000, min_freq=1, vectors="glove.6B.100d")

# Step 3: iterators that batch and pad the examples.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=2, device=device
)
######### Training a simple LSTM on this toy data of ours #########
class RNN_LSTM(nn.Module):
    """Embedding -> multi-layer LSTM -> linear head producing one logit per sequence.

    ``forward`` expects sequence-first token indices: the batch size is read
    from ``x.size(1)`` and the prediction uses the last time step.
    """

    def __init__(self, input_size, embed_size, hidden_size, num_layers):
        super(RNN_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers)
        self.fc_out = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return a (batch, 1) tensor of logits for the index tensor *x*."""
        # Zero initial hidden and cell states, created on the input's own
        # device so the module does not depend on a module-level `device`.
        h0 = torch.zeros(
            self.num_layers, x.size(1), self.hidden_size, device=x.device
        )
        c0 = torch.zeros_like(h0)

        embedded = self.embedding(x)
        outputs, _ = self.rnn(embedded, (h0, c0))
        # Only the last time step feeds the classifier
        prediction = self.fc_out(outputs[-1, :, :])

        return prediction
# Hyperparameters
input_size = len(quote.vocab)
hidden_size = 512
num_layers = 2
embedding_size = 100
learning_rate = 0.005
num_epochs = 10

# Initialize network
model = RNN_LSTM(input_size, embedding_size, hidden_size, num_layers).to(device)

# (NOT COVERED IN YOUTUBE VIDEO): Load the pretrained embeddings onto our model
pretrained_embeddings = quote.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Loss and optimizer. The model outputs raw logits, hence BCEWithLogitsLoss.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train Network
for epoch in range(num_epochs):
    for batch_idx, batch in enumerate(train_iterator):
        # Get data to cuda if possible
        data = batch.q.to(device=device)
        targets = batch.s.to(device=device)

        # forward: drop the trailing singleton dim and match the target dtype
        scores = model(data)
        logits = scores.squeeze(1)
        loss = criterion(logits, targets.type_as(scores))

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient descent
        optimizer.step()

View File

@@ -0,0 +1,45 @@
import spacy
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
"""
To install spacy languages use:
python -m spacy download en
python -m spacy download de
"""
import torch  # only used to pick the device for the iterators below

spacy_eng = spacy.load("en")
spacy_ger = spacy.load("de")


def tokenize_eng(text):
    """Return the spaCy token strings of English *text*."""
    return [tok.text for tok in spacy_eng.tokenizer(text)]


def tokenize_ger(text):
    """Return the spaCy token strings of German *text*."""
    return [tok.text for tok in spacy_ger.tokenizer(text)]


english = Field(sequential=True, use_vocab=True, tokenize=tokenize_eng, lower=True)
german = Field(sequential=True, use_vocab=True, tokenize=tokenize_ger, lower=True)

train_data, validation_data, test_data = Multi30k.splits(
    exts=(".de", ".en"), fields=(german, english)
)

english.build_vocab(train_data, max_size=10000, min_freq=2)
german.build_vocab(train_data, max_size=10000, min_freq=2)

# Fall back to the CPU when no GPU is available instead of hard-coding "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data), batch_size=64, device=device
)

for batch in train_iterator:
    print(batch)

# string to integer (stoi)
print(f'Index of the word (the) is: {english.vocab.stoi["the"]}')

# print integer to string (itos)
print(f"Word of the index (1612) is: {english.vocab.itos[1612]}")
print(f"Word of the index (0) is: {english.vocab.itos[0]}")

View File

@@ -0,0 +1,64 @@
import spacy
import pandas as pd
from torchtext.data import Field, BucketIterator, TabularDataset
from sklearn.model_selection import train_test_split
### Load data from two text files where each row is a sentence ###
import torch  # only used to pick the device for the iterators below

# Read both files with context managers so the handles are closed again.
with open("train_WMT_english.txt", encoding="utf8") as english_file:
    english_txt = english_file.read().split("\n")
with open("train_WMT_german.txt", encoding="utf8") as german_file:
    german_txt = german_file.read().split("\n")

# Only lines 1..99 of each file are used to keep the example small.
raw_data = {
    "English": english_txt[1:100],
    "German": german_txt[1:100],
}

df = pd.DataFrame(raw_data, columns=["English", "German"])

# create train and test set
train, test = train_test_split(df, test_size=0.1)

# Get train, test data to json and csv format which can be read by torchtext
train.to_json("train.json", orient="records", lines=True)
test.to_json("test.json", orient="records", lines=True)

train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

### Now we're back to where we were in previous Tutorials ###

"""
To install spacy languages use:
python -m spacy download en
python -m spacy download de
"""

spacy_eng = spacy.load("en")
spacy_ger = spacy.load("de")


def tokenize_eng(text):
    """Return the spaCy token strings of English *text*."""
    return [tok.text for tok in spacy_eng.tokenizer(text)]


def tokenize_ger(text):
    """Return the spaCy token strings of German *text*."""
    return [tok.text for tok in spacy_ger.tokenizer(text)]


english = Field(sequential=True, use_vocab=True, tokenize=tokenize_eng, lower=True)
german = Field(sequential=True, use_vocab=True, tokenize=tokenize_ger, lower=True)

# Maps each JSON key to (attribute name on the batch, Field).
fields = {"English": ("eng", english), "German": ("ger", german)}

train_data, test_data = TabularDataset.splits(
    path="", train="train.json", test="test.json", format="json", fields=fields
)

english.build_vocab(train_data, max_size=10000, min_freq=2)
german.build_vocab(train_data, max_size=10000, min_freq=2)

# Fall back to the CPU when no GPU is available instead of hard-coding "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=32, device=device
)

for batch in train_iterator:
    print(batch)

View File

@@ -0,0 +1,291 @@
"""
A from scratch implementation of Transformer network,
following the paper Attention is all you need with a
few minor differences. I tried to make it as clear as
possible to understand and also went through the code
on my youtube channel!
"""
import torch
import torch.nn as nn
class SelfAttention(nn.Module):
    """Multi-head attention with per-head projections shared across heads.

    NOTE(review): scores are scaled by sqrt(embed_size), not sqrt(head_dim)
    as in the paper. This is kept as written; the module docstring mentions
    "a few minor differences".
    """

    def __init__(self, embed_size, heads):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        # One head_dim -> head_dim projection each, applied to every head.
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def _split_heads(self, tensor, batch_size):
        """Reshape (N, len, embed_size) into (N, len, heads, head_dim)."""
        return tensor.reshape(batch_size, tensor.shape[1], self.heads, self.head_dim)

    def forward(self, values, keys, query, mask):
        """Return the attended output of shape (N, query_len, embed_size).

        Positions where ``mask == 0`` receive a very large negative score
        before the softmax, so their attention weight becomes ~0. ``mask``
        may be None.
        """
        batch_size = query.shape[0]
        query_len = query.shape[1]

        # Project each head: (N, len, heads, head_dim)
        projected_values = self.values(self._split_heads(values, batch_size))
        projected_keys = self.keys(self._split_heads(keys, batch_size))
        projected_queries = self.queries(self._split_heads(query, batch_size))

        # Dot product of every query with every key, per example and head:
        # (N, heads, query_len, key_len)
        energy = torch.einsum("nqhd,nkhd->nhqk", projected_queries, projected_keys)

        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e20)

        # Scale for stability, then normalize over the key dimension.
        scale = self.embed_size ** 0.5
        attention = torch.softmax(energy / scale, dim=3)

        # Weighted sum of the values: (N, query_len, heads, head_dim), then
        # the last two dimensions are flattened back together.
        weighted = torch.einsum("nhql,nlhd->nqhd", attention, projected_values)
        merged = weighted.reshape(batch_size, query_len, self.heads * self.head_dim)

        # The final linear layer keeps the shape (N, query_len, embed_size).
        return self.fc_out(merged)
class TransformerBlock(nn.Module):
    """Attention followed by a feed-forward network, each with residual, LayerNorm and dropout."""

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        hidden_size = forward_expansion * embed_size
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """Return the block output; the skip connections are taken from ``query``."""
        attended = self.attention(value, key, query, mask)

        # Skip connection, then normalization, then dropout.
        hidden = self.dropout(self.norm1(attended + query))
        return self.dropout(self.norm2(self.feed_forward(hidden) + hidden))
class Encoder(nn.Module):
    """Stack of TransformerBlocks over learned word + position embeddings."""

    def __init__(
        self,
        src_vocab_size,
        embed_size,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length,
    ):
        super(Encoder, self).__init__()
        self.embed_size = embed_size
        # Kept for backward compatibility; forward() follows the input's device.
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Encode token indices *x* of shape (N, seq_length) into (N, seq_length, embed_size)."""
        N, seq_length = x.shape
        # Position ids are created on the input's own device. Using
        # self.device broke the forward pass when the module was moved with
        # .to(...) but built with a different `device` argument.
        positions = torch.arange(0, seq_length, device=x.device).expand(N, seq_length)
        out = self.dropout(
            self.word_embedding(x) + self.position_embedding(positions)
        )

        # In the Encoder the query, key, value are all the same, it's in the
        # decoder this will change. This might look a bit odd in this case.
        for layer in self.layers:
            out = layer(out, out, out, mask)

        return out
class DecoderBlock(nn.Module):
    """Masked self-attention on the target followed by a TransformerBlock over the encoder output.

    The ``device`` argument is accepted but not used by this block.
    """

    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super().__init__()
        self.norm = nn.LayerNorm(embed_size)
        self.attention = SelfAttention(embed_size, heads=heads)
        self.transformer_block = TransformerBlock(
            embed_size, heads, dropout, forward_expansion
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        """Return the block output for target *x* given encoder ``value`` / ``key``."""
        # Self-attention over the target, restricted by trg_mask.
        self_attended = self.attention(x, x, x, trg_mask)
        query = self.dropout(self.norm(self_attended + x))

        # The result queries the encoder output.
        return self.transformer_block(value, key, query, src_mask)
class Decoder(nn.Module):
    """Stack of DecoderBlocks over learned embeddings, ending in a vocabulary projection."""

    def __init__(
        self,
        trg_vocab_size,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length,
    ):
        super(Decoder, self).__init__()
        # Kept for backward compatibility; forward() follows the input's device.
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        """Return logits of shape (N, seq_length, trg_vocab_size) for target indices *x*."""
        N, seq_length = x.shape
        # Position ids are created on the input's own device. Using
        # self.device broke the forward pass when the module was moved with
        # .to(...) but built with a different `device` argument.
        positions = torch.arange(0, seq_length, device=x.device).expand(N, seq_length)
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        # The encoder output serves as both value and key in every block.
        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        out = self.fc_out(x)

        return out
class Transformer(nn.Module):
    """Encoder-decoder Transformer built from the Encoder and Decoder in this file.

    Inputs are batch-first index tensors: ``src`` is (N, src_len) and ``trg``
    is (N, trg_len). ``trg_pad_idx`` is stored but not used when building the
    target mask.
    """

    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size=512,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0,
        device="cpu",
        max_length=100,
    ):
        super(Transformer, self).__init__()

        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length,
        )

        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length,
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        # Kept for backward compatibility; the masks follow their input's device.
        self.device = device

    def make_src_mask(self, src):
        """Return a boolean (N, 1, 1, src_len) mask that is False at padding positions."""
        # Derived from `src`, so the mask is already on the input's device.
        # Moving it to self.device broke models moved with .to(...) but built
        # with the default device="cpu".
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        """Return a lower-triangular (N, 1, trg_len, trg_len) mask of ones and zeros."""
        N, trg_len = trg.shape
        trg_mask = torch.tril(
            torch.ones((trg_len, trg_len), device=trg.device)
        ).expand(N, 1, trg_len, trg_len)

        return trg_mask

    def forward(self, src, trg):
        """Return the decoder's logits for ``trg`` given ``src``."""
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out
if __name__ == "__main__":
    # Smoke test: run one forward pass on two toy sequences and print the
    # output shape.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    source_rows = [[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]
    target_rows = [[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]
    x = torch.tensor(source_rows).to(device)
    trg = torch.tensor(target_rows).to(device)

    src_pad_idx = 0
    trg_pad_idx = 0
    src_vocab_size = 10
    trg_vocab_size = 10

    model = Transformer(
        src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, device=device
    ).to(device)

    # The decoder is fed the target without its last token.
    out = model(x, trg[:, :-1])
    print(out.shape)