Mirror of https://github.com/aladdinpersson/Machine-Learning-Collection.git, synced 2026-02-21 11:18:01 +00:00
Add Lightning code, Whisper finetuning, and a neural collaborative filtering recommender system
Binary file not shown.
60
ML/Pytorch/more_advanced/VAE/lightning_vae/dataset.py
Normal file
@@ -0,0 +1,60 @@
# Imports
import torch
import torchvision.datasets as datasets  # Standard datasets
import torchvision.transforms as transforms  # Transformations we can perform on our dataset for augmentation
from torch.utils.data import DataLoader
import pytorch_lightning as pl


class MNISTDataModule(pl.LightningDataModule):
    def __init__(self, batch_size, num_workers):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage):
        mnist_full = datasets.MNIST(
            root="dataset/", train=True, transform=transforms.ToTensor(), download=True
        )
        self.mnist_test = datasets.MNIST(
            root="dataset/", train=False, transform=transforms.ToTensor(), download=True
        )
        self.mnist_train, self.mnist_val = torch.utils.data.random_split(
            mnist_full, [55000, 5000]
        )

    def train_dataloader(self):
        return DataLoader(
            self.mnist_train,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            persistent_workers=True,
            shuffle=True,
        )

    def val_dataloader(self):
        return DataLoader(
            self.mnist_val,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            persistent_workers=True,
            shuffle=False,
        )

    def test_dataloader(self):
        return DataLoader(
            self.mnist_test,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            persistent_workers=True,
            shuffle=False,
        )


# check that it works
if __name__ == "__main__":
    dm = MNISTDataModule(batch_size=128, num_workers=2)
    dm.setup("fit")
    print(len(dm.mnist_train))
    print(len(dm.mnist_val))
    print(len(dm.mnist_test))
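Note: `random_split` above is unseeded, so the 55000/5000 split changes between runs (and can differ across DDP workers). A minimal sketch of a reproducible split, as a drop-in replacement for the call inside `setup`, assuming a fixed seed of 42 is acceptable:

# seeded generator: the train/val split is identical across runs and workers
self.mnist_train, self.mnist_val = torch.utils.data.random_split(
    mnist_full, [55000, 5000], generator=torch.Generator().manual_seed(42)
)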
92
ML/Pytorch/more_advanced/VAE/lightning_vae/model.py
Normal file
@@ -0,0 +1,92 @@
import torch
import torchvision
from torch import nn
import pytorch_lightning as pl


class VAEpl(pl.LightningModule):
    def __init__(self, lr, input_dim=784, h_dim=200, z_dim=20):
        super().__init__()
        self.lr = lr
        self.loss_fn = nn.BCELoss(reduction="sum")
        self.input_dim = input_dim

        # encoder
        self.img_2hid = nn.Linear(input_dim, h_dim)
        self.hid_2mu = nn.Linear(h_dim, z_dim)
        self.hid_2sigma = nn.Linear(h_dim, z_dim)

        # decoder
        self.z_2hid = nn.Linear(z_dim, h_dim)
        self.hid_2img = nn.Linear(h_dim, input_dim)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def encode(self, x):
        h = self.relu(self.img_2hid(x))
        mu, sigma = self.hid_2mu(h), self.hid_2sigma(h)
        return mu, sigma

    def decode(self, z):
        h = self.relu(self.z_2hid(z))
        return torch.sigmoid(self.hid_2img(h))

    def forward(self, x):
        mu, sigma = self.encode(x)
        epsilon = torch.randn_like(sigma)
        z_new = mu + sigma * epsilon
        x_reconstructed = self.decode(z_new)
        return x_reconstructed, mu, sigma

    def training_step(self, batch, batch_idx):
        x, _ = batch
        x = x.view(-1, self.input_dim)
        x_reconstructed, mu, sigma = self.forward(x)
        reconstruction_loss = self.loss_fn(x_reconstructed, x)
        kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
        loss = reconstruction_loss + kl_div
        self.log("train_loss", loss, sync_dist=True)

        # log x and x_reconstructed to TensorBoard every 100 steps so that we
        # can see the reconstruction progress in TensorBoard during training
        if batch_idx % 100 == 0:
            # take out the first 8 examples
            x = x[:8]
            x_reconstructed = x_reconstructed[:8]
            grid = torchvision.utils.make_grid(x_reconstructed.view(-1, 1, 28, 28))
            self.logger.experiment.add_image("reconstructed", grid, self.global_step)
            grid = torchvision.utils.make_grid(x.view(-1, 1, 28, 28))
            self.logger.experiment.add_image("original", grid, self.global_step)
        return loss

    def validation_step(self, batch, batch_idx):
        x, _ = batch
        x = x.view(-1, self.input_dim)
        x_reconstructed, mu, sigma = self.forward(x)
        reconstruction_loss = self.loss_fn(x_reconstructed, x)
        kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
        loss = reconstruction_loss + kl_div
        self.log("val_loss", loss, sync_dist=True)
        return loss

    def test_step(self, batch, batch_idx):
        x, _ = batch
        x = x.view(-1, self.input_dim)
        x_reconstructed, mu, sigma = self.forward(x)
        reconstruction_loss = self.loss_fn(x_reconstructed, x)
        kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
        loss = reconstruction_loss + kl_div
        self.log("test_loss", loss, sync_dist=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer


if __name__ == "__main__":
    batch_size = 8
    x = torch.randn(batch_size, 28 * 28 * 1)
    vae_pl = VAEpl(lr=3e-4)
    x_reconstructed, mu, sigma = vae_pl(x)
    print(x_reconstructed.shape)
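Note on the loss: the `kl_div` term in the steps above is the closed-form KL divergence between the approximate posterior and the standard normal prior,

D_KL( N(mu, sigma^2) || N(0, 1) ) = -1/2 * sum_j ( 1 + log sigma_j^2 - mu_j^2 - sigma_j^2 )

except that the code omits the 1/2 factor, which simply weights the KL term by 2 relative to the textbook form. Also note that `hid_2sigma` outputs sigma directly rather than a log-variance, which is why `training_step` computes `torch.log(sigma.pow(2))`; the linear output can be negative, which is tolerated here since sigma is squared before the log.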
49
ML/Pytorch/more_advanced/VAE/lightning_vae/train.py
Normal file
@@ -0,0 +1,49 @@
import torch
import torchvision.datasets as datasets  # Standard datasets
from tqdm import tqdm
from torch import nn, optim
from torchvision import transforms
from torchvision.utils import save_image
from torch.utils.data import DataLoader
from dataset import MNISTDataModule
import pytorch_lightning as pl
from model import VAEpl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.strategies import DeepSpeedStrategy

torch.set_float32_matmul_precision("medium")

"""
GOALS:
* Understand the strategy (deepspeed, ddp, etc.) and how to use it
* Set up a config for the scheduler etc. instead of configuring it in each sub-module
* Metrics
"""

# hyperparameters
lr = 3e-4
batch_size = 128
num_workers = 2
model = VAEpl(lr)
dm = MNISTDataModule(batch_size, num_workers)
logger = TensorBoardLogger("my_checkpoint", name="scheduler_autolr_vae_pl_model")

# callbacks for learning rate monitoring and model checkpointing;
# a reduce-on-plateau scheduler is sketched after this file
callbacks = [
    pl.callbacks.LearningRateMonitor(logging_interval="step"),
    pl.callbacks.ModelCheckpoint(monitor="val_loss", save_top_k=1, mode="min", save_last=True),
]

if __name__ == "__main__":
    trainer = pl.Trainer(
        max_epochs=100,
        accelerator="gpu",
        devices=2,
        logger=logger,
        callbacks=callbacks,
        # precision=16,
        strategy=DeepSpeedStrategy(
            stage=0,
        ),
    )

    # trainer.tune(model, dm)
    trainer.fit(model, dm)
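The callbacks above handle LR monitoring and checkpointing; the "scheduler on plateau" part belongs in the LightningModule. A minimal sketch, assuming we monitor `val_loss` (this would replace `configure_optimizers` in model.py):

def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
    # Lightning steps ReduceLROnPlateau against the monitored metric
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.1, patience=5
    )
    return {
        "optimizer": optimizer,
        "lr_scheduler": {"scheduler": scheduler, "monitor": "val_loss"},
    }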
41
ML/Pytorch/more_advanced/VAE/lightning_vae/utils.py
Normal file
@@ -0,0 +1,41 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.utils import save_image


def inference(model, dataset, digit, num_examples=1):
    """
    Generates num_examples of a particular digit.
    Specifically, we extract an example of each digit,
    then once we have the mu, sigma representation for
    each digit we can sample from that.

    After we sample we can run the decoder part of the VAE
    and generate examples.
    """
    images = []
    idx = 0
    for x, y in dataset:
        if y == idx:
            images.append(x)
            idx += 1
        if idx == 10:
            break

    encodings_digit = []
    for d in range(10):
        with torch.no_grad():
            mu, sigma = model.encode(images[d].view(1, 784))
        encodings_digit.append((mu, sigma))

    mu, sigma = encodings_digit[digit]
    for example in range(num_examples):
        epsilon = torch.randn_like(sigma)
        z = mu + sigma * epsilon
        out = model.decode(z)
        out = out.view(-1, 1, 28, 28)
        save_image(out, f"generated_{digit}_ex{example}.png")
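A usage sketch for `inference`, assuming a checkpoint written by the trainer above (the path is hypothetical) and the datamodule from dataset.py:

from dataset import MNISTDataModule
from model import VAEpl
from utils import inference

# hypothetical checkpoint path; lr must be passed since it is a constructor arg
model = VAEpl.load_from_checkpoint("my_checkpoint/last.ckpt", lr=3e-4)
model.eval()
dm = MNISTDataModule(batch_size=128, num_workers=2)
dm.setup("test")
for digit in range(10):
    inference(model, dm.mnist_test, digit, num_examples=5)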
@@ -23,27 +23,7 @@ model = VariationalAutoEncoder(INPUT_DIM, H_DIM, Z_DIM).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR_RATE)
loss_fn = nn.BCELoss(reduction="sum")

# Start Training
for epoch in range(NUM_EPOCHS):
    loop = tqdm(enumerate(train_loader))
    for i, (x, _) in loop:
        # Forward pass
        x = x.to(DEVICE).view(x.shape[0], INPUT_DIM)
        x_reconstructed, mu, sigma = model(x)

        # Compute loss
        reconstruction_loss = loss_fn(x_reconstructed, x)
        kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))

        # Backprop
        loss = reconstruction_loss + kl_div
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loop.set_postfix(loss=loss.item())


model = model.to("cpu")
def inference(digit, num_examples=1):
    """
    Generates (num_examples) of a particular digit.
@@ -79,8 +59,3 @@ def inference(digit, num_examples=1):

for idx in range(10):
    inference(idx, num_examples=5)
120
ML/Pytorch/more_advanced/finetuning_whisper/dataset.py
Normal file
@@ -0,0 +1,120 @@
"""
Create a PyTorch custom dataset that loads the file data/other.tsv, which
contains the path to each audio clip and its text transcription.
"""
import pytorch_lightning as pl
from tqdm import tqdm
import ffmpeg
import os
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
import sys


class CommonVoice(Dataset):
    def __init__(self, data_dir, whisper_model="tiny"):
        self.sampling_rate = 16_000
        self.data_dir = data_dir
        self.data = pd.read_csv(
            os.path.join(data_dir, "other.tsv"),
            sep="\t",
        )
        self.feature_extractor = WhisperFeatureExtractor.from_pretrained(
            f"openai/whisper-{whisper_model}"
        )
        self.tokenizer = WhisperTokenizer.from_pretrained(
            f"openai/whisper-{whisper_model}", language="sv", task="transcribe"
        )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        audio_file_path = os.path.join(
            self.data_dir + "clips/", self.data.iloc[idx]["path"]
        )
        sentence = self.data.iloc[idx]["sentence"]
        text = self.tokenizer(sentence).input_ids

        # decode to 16 kHz mono 16-bit PCM with ffmpeg, then scale to [-1, 1]
        out, _ = (
            ffmpeg.input(audio_file_path, threads=0)
            .output(
                "-", format="s16le", acodec="pcm_s16le", ac=1, ar=self.sampling_rate
            )
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
        out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0

        # run feature extractor (log-Mel spectrogram)
        audio_features = self.feature_extractor(
            out, sampling_rate=self.sampling_rate, return_tensors="pt"
        )

        return audio_features, text


# Create a collator that will pad the audio features and text labels
class DataCollatorSpeechSeq2SeqWithPadding:
    def __init__(self, feature_extractor, tokenizer):
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer

    def __call__(self, batch):
        text_features = [{"input_ids": x[1]} for x in batch]
        batch_text = self.tokenizer.pad(
            text_features, return_tensors="pt",
        )
        audio_features = [{"input_features": x[0]["input_features"]} for x in batch]

        batch_audio = self.feature_extractor.pad(
            audio_features, return_tensors="pt",
        )
        # replace padding token ids with -100 so they are ignored by the loss
        batch_text["input_ids"] = batch_text["input_ids"].masked_fill(
            batch_text["attention_mask"].ne(1), -100
        )

        batch_audio["input_features"] = batch_audio["input_features"].squeeze(1)

        labels = batch_text["input_ids"].clone()
        # the tokenizer prepends <|startoftranscript|> (tokenizer.encode("")[0],
        # which test.py shows differs from bos_token_id); drop it here since the
        # model prepends it again during training
        if (labels[:, 0] == self.tokenizer.encode("")[0]).all().cpu().item():
            labels = labels[:, 1:]

        batch_text["labels"] = labels
        return batch_audio, batch_text


# Put into a lightning datamodule
class WhisperDataset(pl.LightningDataModule):
    def __init__(self, data_dir, batch_size=32, num_workers=0, whisper_model="tiny"):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.whisper_model = whisper_model
        self.sampling_rate = 16_000

    def setup(self, stage=None):
        self.dataset = CommonVoice(self.data_dir, self.whisper_model)
        self.data_collator = DataCollatorSpeechSeq2SeqWithPadding(
            self.dataset.feature_extractor, self.dataset.tokenizer
        )

    def train_dataloader(self):
        return DataLoader(
            self.dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            collate_fn=self.data_collator,
        )


# Test that the lightning datamodule works as intended
if __name__ == "__main__":
    dm = WhisperDataset(data_dir="data/")
    dm.setup()
    for batch in tqdm(dm.train_dataloader()):
        pass
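The -100 masking in the collator is what hides padded label positions from the loss: cross-entropy in PyTorch (and hence in the HuggingFace models) ignores targets equal to -100 by default. A tiny self-contained illustration with a toy padded batch:

import torch

input_ids = torch.tensor([[50258, 440, 293, 50257],
                          [50258, 440, 50257, 50257]])
attention_mask = torch.tensor([[1, 1, 1, 1],
                               [1, 1, 1, 0]])

labels = input_ids.masked_fill(attention_mask.ne(1), -100)
print(labels)
# tensor([[50258,   440,   293, 50257],
#         [50258,   440, 50257,  -100]])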
34
ML/Pytorch/more_advanced/finetuning_whisper/model.py
Normal file
@@ -0,0 +1,34 @@
import torch
import torchvision
from torch import nn
import pytorch_lightning as pl
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
from transformers import WhisperForConditionalGeneration


class WhisperFinetuning(pl.LightningModule):
    def __init__(self, lr, whisper_model="tiny"):
        super().__init__()
        self.lr = lr
        self.model = WhisperForConditionalGeneration.from_pretrained(
            f"openai/whisper-{whisper_model}"
        )
        self.model.config.forced_decoder_ids = None
        self.model.config.suppress_tokens = []

    def training_step(self, batch, batch_idx):
        encoder_input = batch[0]["input_features"]
        decoder_labels = batch[1]["labels"]

        # the HF model computes the cross-entropy loss itself when labels are given
        out = self.model(
            input_features=encoder_input,
            labels=decoder_labels,
        )
        loss = out["loss"]
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer


if __name__ == "__main__":
    pass
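Unlike the VAE module, `training_step` here returns the loss without logging it, so nothing shows up in a logger and checkpoint callbacks have nothing to monitor. A minimal sketch of a logging variant (the `prog_bar` choice is ours, not the source's):

def training_step(self, batch, batch_idx):
    encoder_input = batch[0]["input_features"]
    decoder_labels = batch[1]["labels"]
    out = self.model(input_features=encoder_input, labels=decoder_labels)
    loss = out["loss"]
    # log so loggers and ModelCheckpoint can monitor the training loss
    self.log("train_loss", loss, prog_bar=True, sync_dist=True)
    return loss

If only the decoder should be finetuned, the encoder can also be frozen in __init__ with `for p in self.model.get_encoder().parameters(): p.requires_grad = False`.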
9
ML/Pytorch/more_advanced/finetuning_whisper/steps.txt
Normal file
@@ -0,0 +1,9 @@
Goal: rewrite the HuggingFace Whisper finetuning code to use PyTorch Lightning

1. load the dataset using a lightning datamodule
   * integrate huggingface data loading, or write it ourselves and use a lightning datamodule
2. load the model using a lightning module
3. train the model using the lightning trainer
(4. see if we can use sharded training with the lightning trainer to maybe finetune a larger
    whisper model that we couldn't fit on a single GPU; see the sketch after this list)

End goal: finetune the model on our own dataset for some cool application
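For step 4, a minimal sketch of what sharded training could look like with the Lightning trainer, assuming DeepSpeed is installed (stage and device count are illustrative):

import pytorch_lightning as pl

# ZeRO stage 2 shards optimizer state and gradients across GPUs,
# which helps fit a larger whisper model than a single GPU could hold
trainer = pl.Trainer(
    accelerator="gpu",
    devices=2,
    precision=16,
    strategy="deepspeed_stage_2",
)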
7
ML/Pytorch/more_advanced/finetuning_whisper/test.py
Normal file
@@ -0,0 +1,7 @@
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-tiny", task="transcribe"
)
# encoding the empty string yields the <|startoftranscript|> token,
# which is not the same token as the tokenizer's bos token
encoded_string = tokenizer.encode("")[0]
print(encoded_string)  # should print 50258
print(tokenizer.bos_token_id)  # should print 50257
31
ML/Pytorch/more_advanced/finetuning_whisper/train.py
Normal file
@@ -0,0 +1,31 @@
import torch
import torchvision.datasets as datasets  # Standard datasets
from tqdm import tqdm
from torch import nn, optim
from torchvision import transforms
from torchvision.utils import save_image
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from model import WhisperFinetuning
from dataset import WhisperDataset
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.strategies import DeepSpeedStrategy

torch.set_float32_matmul_precision("medium")

# hyperparameters
lr = 1e-5
batch_size = 32
num_workers = 4
model = WhisperFinetuning(lr)
dm = WhisperDataset(data_dir="data/", batch_size=batch_size, num_workers=num_workers)

if __name__ == "__main__":
    trainer = pl.Trainer(
        max_epochs=1000,
        accelerator="gpu",
        devices=[0],
        precision=16,
    )

    trainer.fit(model, dm)
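This trainer has no logger or callbacks yet; a sketch of wiring in the same TensorBoard logger and checkpointing used by the lightning_vae trainer, assuming training_step logs train_loss as sketched under model.py (directory and run names are illustrative):

logger = TensorBoardLogger("whisper_logs", name="whisper_finetuning")
callbacks = [
    pl.callbacks.LearningRateMonitor(logging_interval="step"),
    pl.callbacks.ModelCheckpoint(monitor="train_loss", save_top_k=1, mode="min", save_last=True),
]
trainer = pl.Trainer(
    max_epochs=1000,
    accelerator="gpu",
    devices=[0],
    precision=16,
    logger=logger,
    callbacks=callbacks,
)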
181
ML/Pytorch/more_advanced/finetuning_whisper/whisper.py
Normal file
@@ -0,0 +1,181 @@
import evaluate
from transformers import Seq2SeqTrainer
from transformers import WhisperForConditionalGeneration
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
from datasets import load_dataset, DatasetDict, Audio

# set so we can only see the first cuda device
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


common_voice = DatasetDict()
common_voice["train"] = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "sv-SE",
    split="train+validation",
    use_auth_token=False,
)
common_voice["test"] = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "sv-SE",
    split="test",
    use_auth_token=False,
)

# common_voice = common_voice.remove_columns(
#     [
#         "accent",
#         "age",
#         "client_id",
#         "down_votes",
#         "gender",
#         "locale",
#         "path",
#         "segment",
#         "up_votes",
#     ]
# )

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-tiny", language="sv", task="transcribe"
)

# sanity check: tokenization should round-trip the transcription
input_str = common_voice["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input: {input_str}")
print(f"Decoded w/ special: {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal: {input_str == decoded_str}")

processor = WhisperProcessor.from_pretrained(
    "openai/whisper-tiny", language="sv", task="transcribe"
)

common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))


def prepare_dataset(example):
    # load and resample audio data from 48 to 16kHz
    audio = example["audio"]

    # compute log-Mel input features from input audio array
    example["input_features"] = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]

    # encode target text to label ids
    example["labels"] = tokenizer(example["sentence"]).input_ids
    return example


common_voice = common_voice.map(prepare_dataset, num_proc=8)


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths
        # and need different padding methods; first treat the audio inputs by
        # simply returning torch tensors
        input_features = [
            {"input_features": feature["input_features"]} for feature in features
        ]
        batch = self.processor.feature_extractor.pad(
            input_features, return_tensors="pt"
        )

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 so it is correctly ignored by the loss
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        # if the bos token was appended in the previous tokenization step,
        # cut it here since it gets appended again later anyway
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
metric = evaluate.load("wer")


def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}
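As a quick illustration of what `compute_metrics` returns: WER counts word-level substitutions, insertions, and deletions against the reference. A toy example with made-up Swedish sentences:

import evaluate

metric = evaluate.load("wer")
# one substituted word out of four reference words -> WER of 25.0
wer = 100 * metric.compute(
    predictions=["det var en katt"], references=["det var en hund"]
)
print(wer)  # 25.0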
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-tiny-swedish",  # change to a repo name of your choice
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=False,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
    dataloader_num_workers=0,
)


trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

trainer.train()