Add PyTorch Lightning code, Whisper finetuning, and a neural collaborative filtering recommender system

Aladdin Persson
2023-02-21 16:25:42 +01:00
parent c646ef65e2
commit 94f6c024fe
51 changed files with 17977 additions and 25 deletions

View File

@@ -0,0 +1,60 @@
# Imports
import torch
import torchvision.datasets as datasets # Standard datasets
import torchvision.transforms as transforms # Transformations we can perform on our dataset for augmentation
from torch.utils.data import DataLoader
import pytorch_lightning as pl
class MNISTDataModule(pl.LightningDataModule):
def __init__(self, batch_size, num_workers):
super().__init__()
self.batch_size = batch_size
self.num_workers = num_workers
def setup(self, stage):
        mnist_full = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
self.mnist_test = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
self.mnist_train, self.mnist_val = torch.utils.data.random_split(
mnist_full, [55000, 5000]
)
def train_dataloader(self):
return DataLoader(
self.mnist_train,
batch_size=self.batch_size,
num_workers=self.num_workers,
persistent_workers=True,
shuffle=True,
)
def val_dataloader(self):
return DataLoader(
self.mnist_val,
batch_size=self.batch_size,
num_workers=self.num_workers,
persistent_workers=True,
shuffle=False,
)
def test_dataloader(self):
return DataLoader(
self.mnist_test,
batch_size=self.batch_size,
num_workers=self.num_workers,
persistent_workers=True,
shuffle=False,
)
# Quick check that the datamodule works
if __name__ == "__main__":
    dm = MNISTDataModule(batch_size=64, num_workers=2)  # any small values work for this check
    dm.setup("fit")
    print(len(dm.mnist_train))
    print(len(dm.mnist_val))
    print(len(dm.mnist_test))

View File

@@ -0,0 +1,92 @@
import torch
import torchvision
from torch import nn
import pytorch_lightning as pl
class VAEpl(pl.LightningModule):
def __init__(self, lr, input_dim=784, h_dim=200, z_dim=20):
super().__init__()
self.lr = lr
self.loss_fn = nn.BCELoss(reduction="sum")
self.input_dim = input_dim
# encoder
self.img_2hid = nn.Linear(input_dim, h_dim)
self.hid_2mu = nn.Linear(h_dim, z_dim)
self.hid_2sigma = nn.Linear(h_dim, z_dim)
# decoder
self.z_2hid = nn.Linear(z_dim, h_dim)
self.hid_2img = nn.Linear(h_dim, input_dim)
self.relu = nn.ReLU()
self.sigmoid = nn.Sigmoid()
def encode(self, x):
h = self.relu(self.img_2hid(x))
mu, sigma = self.hid_2mu(h), self.hid_2sigma(h)
return mu, sigma
def decode(self, z):
h = self.relu(self.z_2hid(z))
return torch.sigmoid(self.hid_2img(h))
def forward(self, x):
mu, sigma = self.encode(x)
epsilon = torch.randn_like(sigma)
z_new = mu + sigma * epsilon
x_reconstructed = self.decode(z_new)
return x_reconstructed, mu, sigma
def training_step(self, batch, batch_idx):
x, _ = batch
x = x.view(-1, self.input_dim)
x_reconstructed, mu, sigma = self.forward(x)
reconstruction_loss = self.loss_fn(x_reconstructed, x)
kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
loss = reconstruction_loss + kl_div
self.log("train_loss", loss, sync_dist=True)
        # Log x and x_reconstructed to TensorBoard every 100 steps so we can
        # follow reconstruction quality as training progresses
if batch_idx % 100 == 0:
# take out the first 8
x = x[:8]
x_reconstructed = x_reconstructed[:8]
grid = torchvision.utils.make_grid(x_reconstructed.view(-1, 1, 28, 28))
self.logger.experiment.add_image("reconstructed", grid, self.global_step)
grid = torchvision.utils.make_grid(x.view(-1, 1, 28, 28))
self.logger.experiment.add_image("original", grid, self.global_step)
return loss
def validation_step(self, batch, batch_idx):
x, _ = batch
x = x.view(-1, self.input_dim)
x_reconstructed, mu, sigma = self.forward(x)
reconstruction_loss = self.loss_fn(x_reconstructed, x)
kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
loss = reconstruction_loss + kl_div
self.log("val_loss", loss, sync_dist=True)
return loss
def test_step(self, batch, batch_idx):
x, _ = batch
x = x.view(-1, self.input_dim)
x_reconstructed, mu, sigma = self.forward(x)
reconstruction_loss = self.loss_fn(x_reconstructed, x)
kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
loss = reconstruction_loss + kl_div
self.log("test_loss", loss, sync_dist=True)
return loss
def configure_optimizers(self):
optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
return optimizer
if __name__ == "__main__":
    batch_size = 8
    x = torch.randn(batch_size, 28 * 28 * 1)
    vae_pl = VAEpl(lr=3e-4)  # lr is required; the value doesn't matter for this shape check
    x_reconstructed, mu, sigma = vae_pl(x)
    print(x_reconstructed.shape)

View File

@@ -0,0 +1,49 @@
import torch
import torchvision.datasets as datasets # Standard datasets
from tqdm import tqdm
from torch import nn, optim
from torchvision import transforms
from torchvision.utils import save_image
from torch.utils.data import DataLoader
from dataset import MNISTDataModule
import pytorch_lightning as pl
from model import VAEpl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.strategies import DeepSpeedStrategy
torch.set_float32_matmul_precision("medium")
"""
GOALS:
* Understand the strategy (deepspeed, ddp, etc) and how to use it
* Setup a config, for scheduler etc instead of configuring it in each sub-module
* Metrics
"""
# Hyperparameters
lr = 3e-4
batch_size = 128
num_workers = 2
model = VAEpl(lr)
dm = MNISTDataModule(batch_size, num_workers)
logger = TensorBoardLogger("my_checkpoint", name="scheduler_autolr_vae_pl_model")
# Callbacks: learning-rate monitor and checkpointing on best val_loss
# (the scheduler on plateau belongs in configure_optimizers; see the sketch after this file)
callbacks = [
    pl.callbacks.LearningRateMonitor(logging_interval="step"),
    pl.callbacks.ModelCheckpoint(monitor="val_loss", save_top_k=1, mode="min", save_last=True),
]
if __name__ == "__main__":
    trainer = pl.Trainer(
        max_epochs=100,
        accelerator="gpu",
        devices=2,
        logger=logger,
        callbacks=callbacks,
        # precision=16,
        strategy=DeepSpeedStrategy(stage=0),
    )
#trainer.tune(model, dm)
trainer.fit(model, dm)
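The GOALS note and the callback comment above mention a plateau scheduler and metrics that this commit does not yet wire up. Below is a minimal sketch, assuming the VAEpl module from this commit; the subclass name and the factor/patience values are illustrative assumptions, not part of the repository.

# Sketch only: attach a ReduceLROnPlateau scheduler keyed to the logged val_loss.
import torch
from model import VAEpl  # the LightningModule defined in this commit

class VAEplWithPlateau(VAEpl):
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="min", factor=0.5, patience=3  # illustrative values
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val_loss",  # must match a metric logged with self.log
                "interval": "epoch",
            },
        }

With the LearningRateMonitor callback above, any learning-rate changes from the scheduler then show up in TensorBoard. For the metrics goal, torchmetrics modules can be registered in __init__ and logged the same way.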

View File

@@ -0,0 +1,41 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
# import save_image from torchvision.utils
from torchvision.utils import save_image
def inference(model, dataset, digit, num_examples=1):
"""
Generates (num_examples) of a particular digit.
Specifically we extract an example of each digit,
then after we have the mu, sigma representation for
each digit we can sample from that.
After we sample we can run the decoder part of the VAE
and generate examples.
"""
images = []
idx = 0
for x, y in dataset:
if y == idx:
images.append(x)
idx += 1
if idx == 10:
break
encodings_digit = []
for d in range(10):
with torch.no_grad():
mu, sigma = model.encode(images[d].view(1, 784))
encodings_digit.append((mu, sigma))
mu, sigma = encodings_digit[digit]
for example in range(num_examples):
epsilon = torch.randn_like(sigma)
z = mu + sigma * epsilon
out = model.decode(z)
out = out.view(-1, 1, 28, 28)
save_image(out, f"generated_{digit}_ex{example}.png")
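The inference helper above is defined but never called in this file. A usage sketch, assuming it sits next to the model.py and the MNIST dataset/ directory used elsewhere in this commit; the checkpoint path and lr value are placeholders, not files produced by this commit.

# Usage sketch (illustrative only).
if __name__ == "__main__":
    import torchvision.datasets as datasets
    import torchvision.transforms as transforms
    from model import VAEpl  # assumed importable from the same directory

    # lr must be passed explicitly since VAEpl does not call save_hyperparameters();
    # the checkpoint path below is a placeholder.
    model = VAEpl.load_from_checkpoint("my_checkpoint/last.ckpt", lr=3e-4, map_location="cpu")
    model.eval()
    mnist = datasets.MNIST(
        root="dataset/", train=True, transform=transforms.ToTensor(), download=True
    )
    for digit in range(10):
        inference(model, mnist, digit, num_examples=5)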

View File

@@ -23,27 +23,7 @@ model = VariationalAutoEncoder(INPUT_DIM, H_DIM, Z_DIM).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR_RATE)
loss_fn = nn.BCELoss(reduction="sum")
# Start Training
for epoch in range(NUM_EPOCHS):
loop = tqdm(enumerate(train_loader))
for i, (x, _) in loop:
# Forward pass
x = x.to(DEVICE).view(x.shape[0], INPUT_DIM)
x_reconstructed, mu, sigma = model(x)
# Compute loss
reconstruction_loss = loss_fn(x_reconstructed, x)
kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
# Backprop
loss = reconstruction_loss + kl_div
optimizer.zero_grad()
loss.backward()
optimizer.step()
loop.set_postfix(loss=loss.item())
model = model.to("cpu")
def inference(digit, num_examples=1):
"""
Generates (num_examples) of a particular digit.
@@ -79,8 +59,3 @@ def inference(digit, num_examples=1):
for idx in range(10):
inference(idx, num_examples=5)

View File

@@ -0,0 +1,120 @@
"""
Create a PyTorch Custom dataset that loads file in data/other.tsv that contains
the path to image audio and text transcription.
"""
import pytorch_lightning as pl
from tqdm import tqdm
import ffmpeg
import os
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
import sys
class CommonVoice(Dataset):
def __init__(self, data_dir, whisper_model="tiny"):
self.sampling_rate = 16_000
self.data_dir = data_dir
self.data = pd.read_csv(
os.path.join(data_dir, "other.tsv"),
sep="\t",
)
self.feature_extractor = WhisperFeatureExtractor.from_pretrained(
f"openai/whisper-{whisper_model}"
)
self.tokenizer = WhisperTokenizer.from_pretrained(
f"openai/whisper-{whisper_model}", language="sv", task="transcribe"
)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
        audio_file_path = os.path.join(
            self.data_dir, "clips", self.data.iloc[idx]["path"]
        )
sentence = self.data.iloc[idx]["sentence"]
text = self.tokenizer(sentence).input_ids
out, _ = (
ffmpeg.input(audio_file_path, threads=0)
.output(
"-", format="s16le", acodec="pcm_s16le", ac=1, ar=self.sampling_rate
)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
# run feature extractor
audio_features = self.feature_extractor(
out, sampling_rate=self.sampling_rate, return_tensors="pt"
)
return audio_features, text
# Create a collator that will pad the audio features and text labels
class DataCollatorSpeechSeq2SeqWithPadding:
def __init__(self, feature_extractor, tokenizer):
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
def __call__(self, batch):
text_features = [{"input_ids": x[1]} for x in batch]
batch_text = self.tokenizer.pad(
text_features, return_tensors="pt",
)
audio_features = [{"input_features": x[0]["input_features"]} for x in batch]
batch_audio = self.feature_extractor.pad(
audio_features, return_tensors="pt",
)
batch_text["input_ids"] = batch_text["input_ids"].masked_fill(
batch_text["attention_mask"].ne(1), -100
)
batch_audio["input_features"] = batch_audio["input_features"].squeeze(1)
        labels = batch_text["input_ids"].clone()
        # If every sequence starts with the <|startoftranscript|> token
        # (tokenizer.encode("")[0], which is not the same as tokenizer.bos_token_id),
        # strip it here since the model prepends it again during training.
        if (labels[:, 0] == self.tokenizer.encode("")[0]).all().cpu().item():
            labels = labels[:, 1:]
batch_text["labels"] = labels
return batch_audio, batch_text
# Put into a lightning datamodule
class WhisperDataset(pl.LightningDataModule):
def __init__(self, data_dir, batch_size=32, num_workers=0, whisper_model="tiny"):
super().__init__()
self.data_dir = data_dir
self.batch_size = batch_size
self.num_workers = num_workers
self.whisper_model = whisper_model
self.sampling_rate = 16_000
def setup(self, stage=None):
self.dataset = CommonVoice(self.data_dir, self.whisper_model)
self.data_collator = DataCollatorSpeechSeq2SeqWithPadding(
self.dataset.feature_extractor, self.dataset.tokenizer
)
def train_dataloader(self):
return DataLoader(
self.dataset,
batch_size=self.batch_size,
shuffle=True,
num_workers=self.num_workers,
collate_fn=self.data_collator,
)
# Quick check that the Lightning datamodule works as intended
if __name__ == "__main__":
    dm = WhisperDataset(data_dir="data/")
    dm.setup()
    for batch in tqdm(dm.train_dataloader()):
        pass

View File

@@ -0,0 +1,34 @@
import torch
import torchvision
from torch import nn
import pytorch_lightning as pl
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
from transformers import WhisperForConditionalGeneration
class WhisperFinetuning(pl.LightningModule):
def __init__(self, lr, whisper_model="tiny"):
super().__init__()
self.lr = lr
self.model = WhisperForConditionalGeneration.from_pretrained(f"openai/whisper-{whisper_model}")
self.model.config.forced_decoder_ids = None
self.model.config.suppress_tokens = []
def training_step(self, batch, batch_idx):
encoder_input = batch[0]["input_features"]
decoder_labels = batch[1]["labels"]
out = self.model(
input_features=encoder_input,
labels=decoder_labels,
)
        loss = out["loss"]
        # Log so the finetuning loss is visible in the progress bar and logger
        self.log("train_loss", loss, prog_bar=True)
        return loss
def configure_optimizers(self):
optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
return optimizer
if __name__ == "__main__":
pass

View File

@@ -0,0 +1,9 @@
Goal: rewrite the Hugging Face Whisper finetuning code to use PyTorch Lightning
1. Load the dataset using a Lightning DataModule
   * either integrate the Hugging Face data loading, or write it ourselves inside the DataModule
2. Load the model using a LightningModule
3. Train the model using the Lightning Trainer
(4. See if we can use sharded training with the Lightning Trainer to finetune a larger Whisper model
    than would fit on a single GPU; see the sketch below)
End goal: finetune the model on our own dataset for some cool application
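A minimal sketch of step 4, assuming the WhisperFinetuning and WhisperDataset classes from this commit and a machine with two GPUs; the whisper_model size, batch size, and DeepSpeed stage are illustrative assumptions rather than settings used in this commit.

# Sketch for step 4: sharded finetuning of a larger Whisper model (illustrative values).
import pytorch_lightning as pl
from pytorch_lightning.strategies import DeepSpeedStrategy
from model import WhisperFinetuning
from dataset import WhisperDataset

if __name__ == "__main__":
    model = WhisperFinetuning(lr=1e-5, whisper_model="medium")
    dm = WhisperDataset(data_dir="data/", batch_size=8, num_workers=4, whisper_model="medium")
    trainer = pl.Trainer(
        max_epochs=1,
        accelerator="gpu",
        devices=2,
        precision=16,
        # ZeRO stage 2 shards optimizer state and gradients across the GPUs
        strategy=DeepSpeedStrategy(stage=2),
    )
    trainer.fit(model, dm)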

View File

@@ -0,0 +1,7 @@
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-tiny", task="transcribe"
)

# encode("") is prefixed with the special tokens, so its first id is
# <|startoftranscript|>, which is NOT the same as tokenizer.bos_token_id
# (<|endoftext|>). The data collator in dataset.py relies on this distinction.
first_token_id = tokenizer.encode("")[0]
print(first_token_id)  # should print 50258
print(tokenizer.bos_token_id)  # should print 50257

View File

@@ -0,0 +1,31 @@
import torch
import torchvision.datasets as datasets # Standard datasets
from tqdm import tqdm
from torch import nn, optim
from torchvision import transforms
from torchvision.utils import save_image
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from model import WhisperFinetuning
from dataset import WhisperDataset
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.strategies import DeepSpeedStrategy
torch.set_float32_matmul_precision("medium")
# Hyperparameters
lr = 1e-5
batch_size = 32
num_workers = 4
model = WhisperFinetuning(lr)
dm = WhisperDataset(data_dir="data/", batch_size=batch_size, num_workers=num_workers)
if __name__ == "__main__":
trainer = pl.Trainer(
max_epochs=1000,
accelerator="gpu",
devices=[0],
precision=16,
)
trainer.fit(model, dm)

View File

@@ -0,0 +1,181 @@
import evaluate
from transformers import Seq2SeqTrainer
from transformers import WhisperForConditionalGeneration
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
from datasets import load_dataset, DatasetDict, Audio
# Restrict training to the first CUDA device
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
common_voice = DatasetDict()
common_voice["train"] = load_dataset(
"mozilla-foundation/common_voice_11_0",
"sv-SE",
split="train+validation",
use_auth_token=False,
)
common_voice["test"] = load_dataset(
"mozilla-foundation/common_voice_11_0",
"sv-SE",
split="test",
use_auth_token=False,
)
# common_voice = common_voice.remove_columns(
# [
# "accent",
# "age",
# "client_id",
# "down_votes",
# "gender",
# "locale",
# "path",
# "segment",
# "up_votes",
# ]
# )
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
tokenizer = WhisperTokenizer.from_pretrained(
"openai/whisper-tiny", language="sv", task="transcribe"
)
input_str = common_voice["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)
print(f"Input: {input_str}")
print(f"Decoded w/ special: {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal: {input_str == decoded_str}")
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-tiny", language="sv", task="transcribe"
)
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
def prepare_dataset(example):
    # load and resample the audio data from 48 kHz to 16 kHz
audio = example["audio"]
# compute log-Mel input features from input audio array
example["input_features"] = feature_extractor(
audio["array"], sampling_rate=audio["sampling_rate"]
).input_features[0]
# encode target text to label ids
example["labels"] = tokenizer(example["sentence"]).input_ids
return example
common_voice = common_voice.map(prepare_dataset, num_proc=8)
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
processor: Any
def __call__(
self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
) -> Dict[str, torch.Tensor]:
# split inputs and labels since they have to be of different lengths
# and need different padding methods first treat the audio inputs by
# simply returning torch tensors
input_features = [
{"input_features": feature["input_features"]} for feature in features
]
batch = self.processor.feature_extractor.pad(
input_features, return_tensors="pt"
)
# get the tokenized label sequences
label_features = [{"input_ids": feature["labels"]} for feature in features]
# pad the labels to max length
labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
# replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(
labels_batch.attention_mask.ne(1), -100
)
        # if the bos token was added in the previous tokenization step,
        # cut it here since it is appended again later anyway
if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
labels = labels[:, 1:]
batch["labels"] = labels
return batch
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
metric = evaluate.load("wer")
def compute_metrics(pred):
pred_ids = pred.predictions
label_ids = pred.label_ids
# replace -100 with the pad_token_id
label_ids[label_ids == -100] = tokenizer.pad_token_id
# we do not want to group tokens when computing the metrics
pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
wer = 100 * metric.compute(predictions=pred_str, references=label_str)
return {"wer": wer}
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
from transformers import Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
output_dir="./whisper-tiny-swedish", # change to a repo name of your choice
per_device_train_batch_size=32,
gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size
learning_rate=1e-5,
warmup_steps=500,
max_steps=4000,
gradient_checkpointing=False,
fp16=True,
evaluation_strategy="steps",
per_device_eval_batch_size=8,
predict_with_generate=True,
generation_max_length=225,
save_steps=1000,
eval_steps=1000,
logging_steps=25,
report_to=["tensorboard"],
load_best_model_at_end=True,
metric_for_best_model="wer",
greater_is_better=False,
push_to_hub=False,
dataloader_num_workers=0,
)
trainer = Seq2SeqTrainer(
args=training_args,
model=model,
train_dataset=common_voice["train"],
eval_dataset=common_voice["test"],
data_collator=data_collator,
compute_metrics=compute_metrics,
tokenizer=processor.feature_extractor,
)
trainer.train()