Add PyTorch Lightning code, Whisper finetuning, and a neural collaborative filtering recommender system

Aladdin Persson
2023-02-21 16:25:42 +01:00
parent c646ef65e2
commit 94f6c024fe
51 changed files with 17977 additions and 25 deletions

View File

@@ -0,0 +1,60 @@
# Imports
import torch
import torchvision.datasets as datasets # Standard datasets
import torchvision.transforms as transforms # Transformations we can perform on our dataset for augmentation
from torch.utils.data import DataLoader
import pytorch_lightning as pl
class MNISTDataModule(pl.LightningDataModule):
def __init__(self, batch_size, num_workers):
super().__init__()
self.batch_size = batch_size
self.num_workers = num_workers
def setup(self, stage):
        mnist_full = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
self.mnist_test = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
self.mnist_train, self.mnist_val = torch.utils.data.random_split(
mnist_full, [55000, 5000]
)
def train_dataloader(self):
return DataLoader(
self.mnist_train,
batch_size=self.batch_size,
num_workers=self.num_workers,
persistent_workers=True,
shuffle=True,
)
def val_dataloader(self):
return DataLoader(
self.mnist_val,
batch_size=self.batch_size,
num_workers=self.num_workers,
persistent_workers=True,
shuffle=False,
)
def test_dataloader(self):
return DataLoader(
self.mnist_test,
batch_size=self.batch_size,
num_workers=self.num_workers,
persistent_workers=True,
shuffle=False,
)
# Quick check that the datamodule works
if __name__ == "__main__":
    dm = MNISTDataModule(batch_size=64, num_workers=2)  # any small values work for this check
    dm.setup("fit")
    print(len(dm.mnist_train))
    print(len(dm.mnist_val))
    print(len(dm.mnist_test))

View File

@@ -0,0 +1,92 @@
import torch
import torchvision
from torch import nn
import pytorch_lightning as pl
class VAEpl(pl.LightningModule):
def __init__(self, lr, input_dim=784, h_dim=200, z_dim=20):
super().__init__()
self.lr = lr
self.loss_fn = nn.BCELoss(reduction="sum")
self.input_dim = input_dim
# encoder
self.img_2hid = nn.Linear(input_dim, h_dim)
self.hid_2mu = nn.Linear(h_dim, z_dim)
self.hid_2sigma = nn.Linear(h_dim, z_dim)
# decoder
self.z_2hid = nn.Linear(z_dim, h_dim)
self.hid_2img = nn.Linear(h_dim, input_dim)
self.relu = nn.ReLU()
self.sigmoid = nn.Sigmoid()
def encode(self, x):
h = self.relu(self.img_2hid(x))
mu, sigma = self.hid_2mu(h), self.hid_2sigma(h)
return mu, sigma
def decode(self, z):
h = self.relu(self.z_2hid(z))
return torch.sigmoid(self.hid_2img(h))
def forward(self, x):
mu, sigma = self.encode(x)
epsilon = torch.randn_like(sigma)
z_new = mu + sigma * epsilon
x_reconstructed = self.decode(z_new)
return x_reconstructed, mu, sigma
def training_step(self, batch, batch_idx):
x, _ = batch
x = x.view(-1, self.input_dim)
x_reconstructed, mu, sigma = self.forward(x)
reconstruction_loss = self.loss_fn(x_reconstructed, x)
kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
loss = reconstruction_loss + kl_div
self.log("train_loss", loss, sync_dist=True)
        # Log x and x_reconstructed to TensorBoard every 100 steps so we can
        # follow reconstruction quality as training progresses
if batch_idx % 100 == 0:
# take out the first 8
x = x[:8]
x_reconstructed = x_reconstructed[:8]
grid = torchvision.utils.make_grid(x_reconstructed.view(-1, 1, 28, 28))
self.logger.experiment.add_image("reconstructed", grid, self.global_step)
grid = torchvision.utils.make_grid(x.view(-1, 1, 28, 28))
self.logger.experiment.add_image("original", grid, self.global_step)
return loss
def validation_step(self, batch, batch_idx):
x, _ = batch
x = x.view(-1, self.input_dim)
x_reconstructed, mu, sigma = self.forward(x)
reconstruction_loss = self.loss_fn(x_reconstructed, x)
kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
loss = reconstruction_loss + kl_div
self.log("val_loss", loss, sync_dist=True)
return loss
def test_step(self, batch, batch_idx):
x, _ = batch
x = x.view(-1, self.input_dim)
x_reconstructed, mu, sigma = self.forward(x)
reconstruction_loss = self.loss_fn(x_reconstructed, x)
kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
loss = reconstruction_loss + kl_div
self.log("test_loss", loss, sync_dist=True)
return loss
def configure_optimizers(self):
optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
return optimizer
if __name__ == "__main__":
    batch_size = 8
    x = torch.randn(batch_size, 28 * 28 * 1)
    vae_pl = VAEpl(lr=3e-4)  # lr is required; the value doesn't matter for this shape check
    x_reconstructed, mu, sigma = vae_pl(x)
    print(x_reconstructed.shape)

View File

@@ -0,0 +1,49 @@
import torch
import torchvision.datasets as datasets # Standard datasets
from tqdm import tqdm
from torch import nn, optim
from torchvision import transforms
from torchvision.utils import save_image
from torch.utils.data import DataLoader
from dataset import MNISTDataModule
import pytorch_lightning as pl
from model import VAEpl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.strategies import DeepSpeedStrategy
torch.set_float32_matmul_precision("medium")
"""
GOALS:
* Understand the strategy (deepspeed, ddp, etc) and how to use it
* Setup a config, for scheduler etc instead of configuring it in each sub-module
* Metrics
"""
# Hyperparameters
lr = 3e-4
batch_size = 128
num_workers = 2
model = VAEpl(lr)
dm = MNISTDataModule(batch_size, num_workers)
logger = TensorBoardLogger("my_checkpoint", name="scheduler_autolr_vae_pl_model")
# Callbacks: learning-rate monitor and checkpointing on best val_loss
# (the scheduler on plateau belongs in configure_optimizers; see the sketch after this file)
callbacks = [
    pl.callbacks.LearningRateMonitor(logging_interval="step"),
    pl.callbacks.ModelCheckpoint(monitor="val_loss", save_top_k=1, mode="min", save_last=True),
]
if __name__ == "__main__":
    trainer = pl.Trainer(
        max_epochs=100,
        accelerator="gpu",
        devices=2,
        logger=logger,
        callbacks=callbacks,
        # precision=16,
        strategy=DeepSpeedStrategy(stage=0),
    )
#trainer.tune(model, dm)
trainer.fit(model, dm)
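The GOALS note and the callback comment above mention a plateau scheduler and metrics that this commit does not yet wire up. Below is a minimal sketch, assuming the VAEpl module from this commit; the subclass name and the factor/patience values are illustrative assumptions, not part of the repository.

# Sketch only: attach a ReduceLROnPlateau scheduler keyed to the logged val_loss.
import torch
from model import VAEpl  # the LightningModule defined in this commit

class VAEplWithPlateau(VAEpl):
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="min", factor=0.5, patience=3  # illustrative values
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val_loss",  # must match a metric logged with self.log
                "interval": "epoch",
            },
        }

With the LearningRateMonitor callback above, any learning-rate changes from the scheduler then show up in TensorBoard. For the metrics goal, torchmetrics modules can be registered in __init__ and logged the same way.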

View File

@@ -0,0 +1,41 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
# import save_image from torchvision.utils
from torchvision.utils import save_image
def inference(model, dataset, digit, num_examples=1):
"""
Generates (num_examples) of a particular digit.
Specifically we extract an example of each digit,
then after we have the mu, sigma representation for
each digit we can sample from that.
After we sample we can run the decoder part of the VAE
and generate examples.
"""
images = []
idx = 0
for x, y in dataset:
if y == idx:
images.append(x)
idx += 1
if idx == 10:
break
encodings_digit = []
for d in range(10):
with torch.no_grad():
mu, sigma = model.encode(images[d].view(1, 784))
encodings_digit.append((mu, sigma))
mu, sigma = encodings_digit[digit]
for example in range(num_examples):
epsilon = torch.randn_like(sigma)
z = mu + sigma * epsilon
out = model.decode(z)
out = out.view(-1, 1, 28, 28)
save_image(out, f"generated_{digit}_ex{example}.png")
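The inference helper above is defined but never called in this file. A usage sketch, assuming it sits next to the model.py and the MNIST dataset/ directory used elsewhere in this commit; the checkpoint path and lr value are placeholders, not files produced by this commit.

# Usage sketch (illustrative only).
if __name__ == "__main__":
    import torchvision.datasets as datasets
    import torchvision.transforms as transforms
    from model import VAEpl  # assumed importable from the same directory

    # lr must be passed explicitly since VAEpl does not call save_hyperparameters();
    # the checkpoint path below is a placeholder.
    model = VAEpl.load_from_checkpoint("my_checkpoint/last.ckpt", lr=3e-4, map_location="cpu")
    model.eval()
    mnist = datasets.MNIST(
        root="dataset/", train=True, transform=transforms.ToTensor(), download=True
    )
    for digit in range(10):
        inference(model, mnist, digit, num_examples=5)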

View File

@@ -23,27 +23,7 @@ model = VariationalAutoEncoder(INPUT_DIM, H_DIM, Z_DIM).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR_RATE)
loss_fn = nn.BCELoss(reduction="sum")
# Start Training
for epoch in range(NUM_EPOCHS):
loop = tqdm(enumerate(train_loader))
for i, (x, _) in loop:
# Forward pass
x = x.to(DEVICE).view(x.shape[0], INPUT_DIM)
x_reconstructed, mu, sigma = model(x)
# Compute loss
reconstruction_loss = loss_fn(x_reconstructed, x)
kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
# Backprop
loss = reconstruction_loss + kl_div
optimizer.zero_grad()
loss.backward()
optimizer.step()
loop.set_postfix(loss=loss.item())
model = model.to("cpu")
def inference(digit, num_examples=1):
"""
Generates (num_examples) of a particular digit.
@@ -79,8 +59,3 @@ def inference(digit, num_examples=1):
for idx in range(10):
inference(idx, num_examples=5)

View File

@@ -0,0 +1,120 @@
"""
Create a PyTorch Custom dataset that loads file in data/other.tsv that contains
the path to image audio and text transcription.
"""
import pytorch_lightning as pl
from tqdm import tqdm
import ffmpeg
import os
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
import sys
class CommonVoice(Dataset):
def __init__(self, data_dir, whisper_model="tiny"):
self.sampling_rate = 16_000
self.data_dir = data_dir
self.data = pd.read_csv(
os.path.join(data_dir, "other.tsv"),
sep="\t",
)
self.feature_extractor = WhisperFeatureExtractor.from_pretrained(
f"openai/whisper-{whisper_model}"
)
self.tokenizer = WhisperTokenizer.from_pretrained(
f"openai/whisper-{whisper_model}", language="sv", task="transcribe"
)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
        audio_file_path = os.path.join(
            self.data_dir, "clips", self.data.iloc[idx]["path"]
        )
sentence = self.data.iloc[idx]["sentence"]
text = self.tokenizer(sentence).input_ids
out, _ = (
ffmpeg.input(audio_file_path, threads=0)
.output(
"-", format="s16le", acodec="pcm_s16le", ac=1, ar=self.sampling_rate
)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
# run feature extractor
audio_features = self.feature_extractor(
out, sampling_rate=self.sampling_rate, return_tensors="pt"
)
return audio_features, text
# Create a collator that will pad the audio features and text labels
class DataCollatorSpeechSeq2SeqWithPadding:
def __init__(self, feature_extractor, tokenizer):
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
def __call__(self, batch):
text_features = [{"input_ids": x[1]} for x in batch]
batch_text = self.tokenizer.pad(
text_features, return_tensors="pt",
)
audio_features = [{"input_features": x[0]["input_features"]} for x in batch]
batch_audio = self.feature_extractor.pad(
audio_features, return_tensors="pt",
)
batch_text["input_ids"] = batch_text["input_ids"].masked_fill(
batch_text["attention_mask"].ne(1), -100
)
batch_audio["input_features"] = batch_audio["input_features"].squeeze(1)
        labels = batch_text["input_ids"].clone()
        # If every sequence starts with the <|startoftranscript|> token
        # (tokenizer.encode("")[0], which is not the same as tokenizer.bos_token_id),
        # strip it here since the model prepends it again during training.
        if (labels[:, 0] == self.tokenizer.encode("")[0]).all().cpu().item():
            labels = labels[:, 1:]
batch_text["labels"] = labels
return batch_audio, batch_text
# Put into a lightning datamodule
class WhisperDataset(pl.LightningDataModule):
def __init__(self, data_dir, batch_size=32, num_workers=0, whisper_model="tiny"):
super().__init__()
self.data_dir = data_dir
self.batch_size = batch_size
self.num_workers = num_workers
self.whisper_model = whisper_model
self.sampling_rate = 16_000
def setup(self, stage=None):
self.dataset = CommonVoice(self.data_dir, self.whisper_model)
self.data_collator = DataCollatorSpeechSeq2SeqWithPadding(
self.dataset.feature_extractor, self.dataset.tokenizer
)
def train_dataloader(self):
return DataLoader(
self.dataset,
batch_size=self.batch_size,
shuffle=True,
num_workers=self.num_workers,
collate_fn=self.data_collator,
)
# Quick check that the Lightning datamodule works as intended
if __name__ == "__main__":
    dm = WhisperDataset(data_dir="data/")
    dm.setup()
    for batch in tqdm(dm.train_dataloader()):
        pass

View File

@@ -0,0 +1,34 @@
import torch
import torchvision
from torch import nn
import pytorch_lightning as pl
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
from transformers import WhisperForConditionalGeneration
class WhisperFinetuning(pl.LightningModule):
def __init__(self, lr, whisper_model="tiny"):
super().__init__()
self.lr = lr
self.model = WhisperForConditionalGeneration.from_pretrained(f"openai/whisper-{whisper_model}")
self.model.config.forced_decoder_ids = None
self.model.config.suppress_tokens = []
def training_step(self, batch, batch_idx):
encoder_input = batch[0]["input_features"]
decoder_labels = batch[1]["labels"]
out = self.model(
input_features=encoder_input,
labels=decoder_labels,
)
        loss = out["loss"]
        # Log so the finetuning loss is visible in the progress bar and logger
        self.log("train_loss", loss, prog_bar=True)
        return loss
def configure_optimizers(self):
optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
return optimizer
if __name__ == "__main__":
pass

View File

@@ -0,0 +1,9 @@
Goal: rewrite the Hugging Face Whisper finetuning code to use PyTorch Lightning
1. Load the dataset using a Lightning DataModule
   * either integrate the Hugging Face data loading, or write it ourselves inside the DataModule
2. Load the model using a LightningModule
3. Train the model using the Lightning Trainer
(4. See if we can use sharded training with the Lightning Trainer to finetune a larger Whisper model
    than would fit on a single GPU; see the sketch below)
End goal: finetune the model on our own dataset for some cool application
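A minimal sketch of step 4, assuming the WhisperFinetuning and WhisperDataset classes from this commit and a machine with two GPUs; the whisper_model size, batch size, and DeepSpeed stage are illustrative assumptions rather than settings used in this commit.

# Sketch for step 4: sharded finetuning of a larger Whisper model (illustrative values).
import pytorch_lightning as pl
from pytorch_lightning.strategies import DeepSpeedStrategy
from model import WhisperFinetuning
from dataset import WhisperDataset

if __name__ == "__main__":
    model = WhisperFinetuning(lr=1e-5, whisper_model="medium")
    dm = WhisperDataset(data_dir="data/", batch_size=8, num_workers=4, whisper_model="medium")
    trainer = pl.Trainer(
        max_epochs=1,
        accelerator="gpu",
        devices=2,
        precision=16,
        # ZeRO stage 2 shards optimizer state and gradients across the GPUs
        strategy=DeepSpeedStrategy(stage=2),
    )
    trainer.fit(model, dm)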

View File

@@ -0,0 +1,7 @@
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-tiny", task="transcribe"
)

# encode("") is prefixed with the special tokens, so its first id is
# <|startoftranscript|>, which is NOT the same as tokenizer.bos_token_id
# (<|endoftext|>). The data collator in dataset.py relies on this distinction.
first_token_id = tokenizer.encode("")[0]
print(first_token_id)  # should print 50258
print(tokenizer.bos_token_id)  # should print 50257

View File

@@ -0,0 +1,31 @@
import torch
import torchvision.datasets as datasets # Standard datasets
from tqdm import tqdm
from torch import nn, optim
from torchvision import transforms
from torchvision.utils import save_image
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from model import WhisperFinetuning
from dataset import WhisperDataset
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.strategies import DeepSpeedStrategy
torch.set_float32_matmul_precision("medium")
# Hyperparameters
lr = 1e-5
batch_size = 32
num_workers = 4
model = WhisperFinetuning(lr)
dm = WhisperDataset(data_dir="data/", batch_size=batch_size, num_workers=num_workers)
if __name__ == "__main__":
trainer = pl.Trainer(
max_epochs=1000,
accelerator="gpu",
devices=[0],
precision=16,
)
trainer.fit(model, dm)

View File

@@ -0,0 +1,181 @@
import evaluate
from transformers import Seq2SeqTrainer
from transformers import WhisperForConditionalGeneration
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
from datasets import load_dataset, DatasetDict, Audio
# Restrict training to the first CUDA device
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
common_voice = DatasetDict()
common_voice["train"] = load_dataset(
"mozilla-foundation/common_voice_11_0",
"sv-SE",
split="train+validation",
use_auth_token=False,
)
common_voice["test"] = load_dataset(
"mozilla-foundation/common_voice_11_0",
"sv-SE",
split="test",
use_auth_token=False,
)
# common_voice = common_voice.remove_columns(
# [
# "accent",
# "age",
# "client_id",
# "down_votes",
# "gender",
# "locale",
# "path",
# "segment",
# "up_votes",
# ]
# )
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
tokenizer = WhisperTokenizer.from_pretrained(
"openai/whisper-tiny", language="sv", task="transcribe"
)
input_str = common_voice["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)
print(f"Input: {input_str}")
print(f"Decoded w/ special: {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal: {input_str == decoded_str}")
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-tiny", language="sv", task="transcribe"
)
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
def prepare_dataset(example):
    # load and resample the audio data from 48 kHz to 16 kHz
audio = example["audio"]
# compute log-Mel input features from input audio array
example["input_features"] = feature_extractor(
audio["array"], sampling_rate=audio["sampling_rate"]
).input_features[0]
# encode target text to label ids
example["labels"] = tokenizer(example["sentence"]).input_ids
return example
common_voice = common_voice.map(prepare_dataset, num_proc=8)
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
processor: Any
def __call__(
self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
) -> Dict[str, torch.Tensor]:
# split inputs and labels since they have to be of different lengths
# and need different padding methods first treat the audio inputs by
# simply returning torch tensors
input_features = [
{"input_features": feature["input_features"]} for feature in features
]
batch = self.processor.feature_extractor.pad(
input_features, return_tensors="pt"
)
# get the tokenized label sequences
label_features = [{"input_ids": feature["labels"]} for feature in features]
# pad the labels to max length
labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
# replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(
labels_batch.attention_mask.ne(1), -100
)
        # if the bos token was added in the previous tokenization step,
        # cut it here since it is appended again later anyway
if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
labels = labels[:, 1:]
batch["labels"] = labels
return batch
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
metric = evaluate.load("wer")
def compute_metrics(pred):
pred_ids = pred.predictions
label_ids = pred.label_ids
# replace -100 with the pad_token_id
label_ids[label_ids == -100] = tokenizer.pad_token_id
# we do not want to group tokens when computing the metrics
pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
wer = 100 * metric.compute(predictions=pred_str, references=label_str)
return {"wer": wer}
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
from transformers import Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
output_dir="./whisper-tiny-swedish", # change to a repo name of your choice
per_device_train_batch_size=32,
gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size
learning_rate=1e-5,
warmup_steps=500,
max_steps=4000,
gradient_checkpointing=False,
fp16=True,
evaluation_strategy="steps",
per_device_eval_batch_size=8,
predict_with_generate=True,
generation_max_length=225,
save_steps=1000,
eval_steps=1000,
logging_steps=25,
report_to=["tensorboard"],
load_best_model_at_end=True,
metric_for_best_model="wer",
greater_is_better=False,
push_to_hub=False,
dataloader_num_workers=0,
)
trainer = Seq2SeqTrainer(
args=training_args,
model=model,
train_dataset=common_voice["train"],
eval_dataset=common_voice["test"],
data_collator=data_collator,
compute_metrics=compute_metrics,
tokenizer=processor.feature_extractor,
)
trainer.train()