add lightning code, finetuning whisper, recommender system neural collaborative filtering

2026-04-10 12:33:44 +00:00 · 2023-02-21 16:25:42 +01:00
parent c646ef65e2
commit 94f6c024fe
51 changed files with 17977 additions and 25 deletions
--- a/ML/Pytorch/more_advanced/finetuning_whisper/dataset.py
+++ b/ML/Pytorch/more_advanced/finetuning_whisper/dataset.py
@@ -0,0 +1,120 @@
+"""
+Create a custom PyTorch dataset that loads the file data/other.tsv, which contains
+the path to each audio clip and its text transcription.
+"""
+import pytorch_lightning as pl
+from tqdm import tqdm
+import ffmpeg
+import os
+import torch
+import numpy as np
+from torch.utils.data import Dataset, DataLoader
+import pandas as pd
+from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
+import sys 
+
+class CommonVoice(Dataset):
+    """Common Voice clips paired with tokenized transcriptions for Whisper.
+
+    Reads ``other.tsv`` from ``data_dir``. For each row, the clip named in the
+    ``path`` column (looked up under ``data_dir/clips``) is decoded with ffmpeg
+    and the ``sentence`` column is tokenized.
+
+    Args:
+        data_dir: Directory holding ``other.tsv`` and the ``clips`` folder.
+            A trailing path separator is optional.
+        whisper_model: Size suffix of the ``openai/whisper-*`` checkpoint whose
+            feature extractor and tokenizer are loaded.
+    """
+
+    def __init__(self, data_dir, whisper_model="tiny"):
+        self.sampling_rate = 16_000
+        self.data_dir = data_dir
+        self.data = pd.read_csv(
+            os.path.join(data_dir, "other.tsv"),
+            sep="\t",
+        )
+        self.feature_extractor = WhisperFeatureExtractor.from_pretrained(
+            f"openai/whisper-{whisper_model}"
+        )
+        self.tokenizer = WhisperTokenizer.from_pretrained(
+            f"openai/whisper-{whisper_model}", language="sv", task="transcribe"
+        )
+
+    def __len__(self):
+        """Return the number of rows read from ``other.tsv``."""
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        """Return ``(audio_features, token_ids)`` for row ``idx``.
+
+        ``audio_features`` is the feature extractor output (requested with
+        ``return_tensors="pt"``); ``token_ids`` is the tokenizer's
+        ``input_ids`` for the row's sentence.
+        """
+        row = self.data.iloc[idx]
+        # Join path components instead of concatenating strings, so data_dir
+        # works with or without a trailing separator ("data" and "data/").
+        audio_file_path = os.path.join(self.data_dir, "clips", row["path"])
+        text = self.tokenizer(row["sentence"]).input_ids
+
+        # Let ffmpeg decode the clip to raw mono 16-bit little-endian PCM at
+        # the target sampling rate and hand the bytes back through stdout.
+        out, _ = (
+            ffmpeg.input(audio_file_path, threads=0)
+            .output(
+                "-", format="s16le", acodec="pcm_s16le", ac=1, ar=self.sampling_rate
+            )
+            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+        )
+        # Scale int16 samples to float32 by dividing by 2**15.
+        out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+
+        # run feature extractor
+        audio_features = self.feature_extractor(
+            out, sampling_rate=self.sampling_rate, return_tensors="pt"
+        )
+
+        return audio_features, text
+
+
+# Create a collator that will pad the audio features and text labels
+class DataCollatorSpeechSeq2SeqWithPadding:
+    """Collate ``(audio_features, token_ids)`` samples into padded batches.
+
+    Calling the collator returns ``(batch_audio, batch_text)``:
+    ``batch_audio`` comes from ``feature_extractor.pad`` and ``batch_text``
+    from ``tokenizer.pad``, with padded positions of ``input_ids`` replaced by
+    -100 and an extra ``labels`` entry.
+    """
+
+    def __init__(self, feature_extractor, tokenizer):
+        self.tokenizer = tokenizer
+        self.feature_extractor = feature_extractor
+
+    def __call__(self, batch):
+        # --- audio side ---------------------------------------------------
+        batch_audio = self.feature_extractor.pad(
+            [{"input_features": sample[0]["input_features"]} for sample in batch],
+            return_tensors="pt",
+        )
+        # Drop the size-1 axis at position 1 (presumably the per-sample batch
+        # axis created by return_tensors="pt" in the dataset -- TODO confirm).
+        batch_audio["input_features"] = batch_audio["input_features"].squeeze(1)
+
+        # --- text side ----------------------------------------------------
+        batch_text = self.tokenizer.pad(
+            [{"input_ids": sample[1]} for sample in batch],
+            return_tensors="pt",
+        )
+        # Every position whose attention mask is not 1 becomes -100.
+        is_padding = batch_text["attention_mask"].ne(1)
+        batch_text["input_ids"] = batch_text["input_ids"].masked_fill(is_padding, -100)
+
+        labels = batch_text["input_ids"].clone()
+        # First id the tokenizer emits for an empty string, i.e. the leading
+        # special token; strip it only when every sequence starts with it.
+        leading_id = self.tokenizer.encode("")[0]
+        all_start_with_leading = (labels[:, 0] == leading_id).all().cpu().item()
+        if all_start_with_leading:
+            labels = labels[:, 1:]
+
+        batch_text["labels"] = labels
+        return batch_audio, batch_text
+
+
+# Put into a lightning datamodule 
+class WhisperDataset(pl.LightningDataModule):
+    """Lightning datamodule serving ``CommonVoice`` as a training loader.
+
+    Args:
+        data_dir: Directory passed on to ``CommonVoice``.
+        batch_size: Batch size of the training ``DataLoader``.
+        num_workers: Worker count of the training ``DataLoader``.
+        whisper_model: Checkpoint size suffix passed on to ``CommonVoice``.
+    """
+
+    def __init__(self, data_dir, batch_size=32, num_workers=0, whisper_model="tiny"):
+        super().__init__()
+        self.sampling_rate = 16_000
+        self.whisper_model = whisper_model
+        self.num_workers = num_workers
+        self.batch_size = batch_size
+        self.data_dir = data_dir
+
+    def setup(self, stage=None):
+        """Create the dataset and a collator sharing its extractor/tokenizer."""
+        dataset = CommonVoice(self.data_dir, self.whisper_model)
+        self.dataset = dataset
+        self.data_collator = DataCollatorSpeechSeq2SeqWithPadding(
+            feature_extractor=dataset.feature_extractor,
+            tokenizer=dataset.tokenizer,
+        )
+
+    def train_dataloader(self):
+        """Return a shuffled ``DataLoader`` over the dataset built in ``setup``."""
+        loader_options = {
+            "batch_size": self.batch_size,
+            "shuffle": True,
+            "num_workers": self.num_workers,
+            "collate_fn": self.data_collator,
+        }
+        return DataLoader(self.dataset, **loader_options)
+
+
+# Test if lightning datamodule working as intended 
+if __name__ == "__main__":
+    # Smoke test: build the datamodule and walk the training loader once so
+    # that loading, decoding and collation errors surface immediately.
+    dm = WhisperDataset(data_dir="data/")
+    dm.setup()
+    # tqdm is already imported at the top of this module; no re-import needed.
+    for _ in tqdm(dm.train_dataloader()):
+        pass
--- a/ML/Pytorch/more_advanced/finetuning_whisper/model.py
+++ b/ML/Pytorch/more_advanced/finetuning_whisper/model.py
@@ -0,0 +1,34 @@
+import torch
+import torchvision
+from torch import nn
+import pytorch_lightning as pl
+from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
+from transformers import WhisperForConditionalGeneration
+
+
+class WhisperFinetuning(pl.LightningModule):
+    """Lightning wrapper that fine-tunes ``WhisperForConditionalGeneration``.
+
+    Args:
+        lr: Learning rate handed to the Adam optimizer.
+        whisper_model: Size suffix of the ``openai/whisper-*`` checkpoint.
+    """
+
+    def __init__(self, lr, whisper_model="tiny"):
+        super().__init__()
+        self.lr = lr
+        self.model = WhisperForConditionalGeneration.from_pretrained(
+            f"openai/whisper-{whisper_model}"
+        )
+        # Clear the forced decoder ids and the suppressed-token list that ship
+        # with the pretrained config.
+        self.model.config.forced_decoder_ids = None
+        self.model.config.suppress_tokens = []
+
+    def training_step(self, batch, batch_idx):
+        """Run one forward pass and return the model's loss.
+
+        ``batch`` is an ``(audio, text)`` pair; ``audio["input_features"]``
+        feeds the encoder and ``text["labels"]`` are the decoder targets.
+        """
+        encoder_input = batch[0]["input_features"]
+        decoder_labels = batch[1]["labels"]
+
+        out = self.model(
+            input_features=encoder_input,
+            labels=decoder_labels,
+        )
+        loss = out["loss"]
+        # Report the loss so it shows up in the progress bar and the logger;
+        # it was previously computed but never recorded.
+        self.log("train_loss", loss, prog_bar=True)
+        return loss
+
+    def configure_optimizers(self):
+        """Return an Adam optimizer over all parameters with ``self.lr``."""
+        return torch.optim.Adam(self.parameters(), lr=self.lr)
+
+
+if __name__ == "__main__":
+    # Nothing to run directly; this module only defines WhisperFinetuning.
+    pass
--- a/ML/Pytorch/more_advanced/finetuning_whisper/steps.txt
+++ b/ML/Pytorch/more_advanced/finetuning_whisper/steps.txt
@@ -0,0 +1,9 @@
+Goal: re-write the code of huggingface whisper finetuning to use pytorch lightning
+1. load the dataset using lightning datamodule
+* integrate huggingface loading data, or we can write it ourselves and use lightning datamodule
+2. load the model using lightning module
+3. train the model using lightning trainer
+(4. See if we can use sharded training with the lightning trainer to maybe finetune a large whisper model
+that we couldn't fit on a single GPU)
+
+End goal: Finetune the model on our own dataset for some cool application
--- a/ML/Pytorch/more_advanced/finetuning_whisper/test.py
+++ b/ML/Pytorch/more_advanced/finetuning_whisper/test.py
@@ -0,0 +1,7 @@
+from transformers import WhisperTokenizer
+
+# Plain string literal: the previous f-string had no placeholders.
+tokenizer = WhisperTokenizer.from_pretrained(
+    "openai/whisper-tiny", task="transcribe"
+)
+# Compare the first id produced when encoding an empty string (the leading
+# special token) with the tokenizer's bos_token_id: they are expected to differ.
+encoded_string = tokenizer.encode("")[0]
+print(encoded_string)  # should print 50258
+print(tokenizer.bos_token_id)  # should print 50257
--- a/ML/Pytorch/more_advanced/finetuning_whisper/train.py
+++ b/ML/Pytorch/more_advanced/finetuning_whisper/train.py
@@ -0,0 +1,31 @@
+import torch
+import torchvision.datasets as datasets  # Standard datasets
+from tqdm import tqdm
+from torch import nn, optim
+from torchvision import transforms
+from torchvision.utils import save_image
+from torch.utils.data import DataLoader
+import pytorch_lightning as pl
+from model import WhisperFinetuning
+from dataset import  WhisperDataset
+from pytorch_lightning.loggers import TensorBoardLogger
+from pytorch_lightning.strategies import DeepSpeedStrategy
+torch.set_float32_matmul_precision("medium")
+
+# Hyperparameters
+# things to add
+lr = 1e-5
+batch_size = 32
+num_workers = 4
+
+if __name__ == "__main__":
+    # Build the model and datamodule under the guard: with num_workers > 0,
+    # DataLoader workers started via "spawn" re-import this module, and
+    # module-level construction would reload the checkpoint in every worker.
+    model = WhisperFinetuning(lr)
+    dm = WhisperDataset(
+        data_dir="data/", batch_size=batch_size, num_workers=num_workers
+    )
+
+    trainer = pl.Trainer(
+        max_epochs=1000,
+        accelerator="gpu",
+        devices=[0],
+        precision=16,
+    )
+
+    trainer.fit(model, dm)
+ 
--- a/ML/Pytorch/more_advanced/finetuning_whisper/whisper.py
+++ b/ML/Pytorch/more_advanced/finetuning_whisper/whisper.py
@@ -0,0 +1,181 @@
+import evaluate
+from transformers import Seq2SeqTrainer
+from transformers import WhisperForConditionalGeneration
+import torch
+from dataclasses import dataclass
+from typing import Any, Dict, List, Union
+from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
+from datasets import load_dataset, DatasetDict, Audio
+# set so we only can see first cuda device 
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+
+# Swedish Common Voice 11.0: "train" merges the train and validation splits,
+# "test" is the test split.
+common_voice = DatasetDict()
+for split_key, split_expr in (("train", "train+validation"), ("test", "test")):
+    common_voice[split_key] = load_dataset(
+        "mozilla-foundation/common_voice_11_0",
+        "sv-SE",
+        split=split_expr,
+        use_auth_token=False,
+    )
+
+# common_voice = common_voice.remove_columns(
+#     [
+#         "accent",
+#         "age",
+#         "client_id",
+#         "down_votes",
+#         "gender",
+#         "locale",
+#         "path",
+#         "segment",
+#         "up_votes",
+#     ]
+# )
+
+feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
+tokenizer = WhisperTokenizer.from_pretrained(
+    "openai/whisper-tiny", language="sv", task="transcribe"
+)
+
+# Sanity check: encode the first training sentence and confirm that decoding
+# without special tokens gives the original text back.
+input_str = common_voice["train"][0]["sentence"]
+labels = tokenizer(input_str).input_ids
+decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
+decoded_str = tokenizer.decode(labels, skip_special_tokens=True)
+
+print(f"Input:                 {input_str}")
+print(f"Decoded w/ special:    {decoded_with_special}")
+print(f"Decoded w/out special: {decoded_str}")
+print(f"Are equal:             {input_str == decoded_str}")
+
+# Load the processor from the same checkpoint as the feature extractor,
+# tokenizer and model used in this script (it previously pointed at
+# "openai/whisper-small").
+processor = WhisperProcessor.from_pretrained(
+    "openai/whisper-tiny", language="sv", task="transcribe"
+)
+
+# Have the "audio" column decoded at 16 kHz.
+common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
+
+
+def prepare_dataset(example):
+    """Attach ``input_features`` and ``labels`` to one dataset example.
+
+    ``input_features`` is the first entry of the feature extractor output for
+    the example's audio array; ``labels`` are the tokenizer ids of its
+    ``sentence``. The example is modified and returned.
+    """
+    # Reading the "audio" column yields the waveform plus its sampling rate
+    # (presumably already resampled to 16 kHz by the column cast -- confirm).
+    waveform = example["audio"]
+
+    # log-Mel features for the waveform
+    extracted = feature_extractor(
+        waveform["array"], sampling_rate=waveform["sampling_rate"]
+    )
+    example["input_features"] = extracted.input_features[0]
+
+    # target text -> label ids
+    example["labels"] = tokenizer(example["sentence"]).input_ids
+    return example
+
+
+# Apply the preprocessing to every split using 8 worker processes.
+common_voice = common_voice.map(prepare_dataset, num_proc=8)
+
+
+@dataclass
+class DataCollatorSpeechSeq2SeqWithPadding:
+    """Pad audio features and label ids of a list of examples into one batch.
+
+    Attributes are documented inline; calling the collator returns the padded
+    feature batch with an added ``labels`` tensor in which padded positions
+    are set to -100.
+    """
+
+    # Processor exposing ``feature_extractor`` and ``tokenizer``.
+    processor: Any
+
+    def __call__(
+        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
+    ) -> Dict[str, torch.Tensor]:
+        # split inputs and labels since they have to be of different lengths
+        # and need different padding methods first treat the audio inputs by
+        # simply returning torch tensors
+        input_features = [
+            {"input_features": feature["input_features"]} for feature in features
+        ]
+        batch = self.processor.feature_extractor.pad(
+            input_features, return_tensors="pt"
+        )
+
+        # get the tokenized label sequences
+        label_features = [{"input_ids": feature["labels"]} for feature in features]
+        # pad the labels to max length
+        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
+
+        # replace padding with -100 to ignore loss correctly
+        labels = labels_batch["input_ids"].masked_fill(
+            labels_batch.attention_mask.ne(1), -100
+        )
+
+        # The tokenizer prepends <|startoftranscript|>, whose id differs from
+        # bos_token_id, so a bos_token_id comparison never matches. Compare
+        # against the start-of-transcript id and cut it when every sequence
+        # starts with it, since the model prepends its decoder start token
+        # again when it shifts the labels.
+        start_token_id = self.processor.tokenizer.convert_tokens_to_ids(
+            "<|startoftranscript|>"
+        )
+        if (labels[:, 0] == start_token_id).all().cpu().item():
+            labels = labels[:, 1:]
+
+        batch["labels"] = labels
+        return batch
+
+
+# Collator used by the trainer, and the word-error-rate metric.
+data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
+metric = evaluate.load("wer")
+
+
+def compute_metrics(pred):
+    """Return ``{"wer": ...}`` (in percent) for a prediction output.
+
+    ``pred.label_ids`` is modified in place: every -100 entry is replaced by
+    the tokenizer's pad token id before decoding.
+    """
+    hypothesis_ids = pred.predictions
+    reference_ids = pred.label_ids
+
+    # -100 marks ignored positions; swap in the pad id so decoding works
+    reference_ids[reference_ids == -100] = tokenizer.pad_token_id
+
+    # decode both sides without special tokens
+    hypotheses = tokenizer.batch_decode(hypothesis_ids, skip_special_tokens=True)
+    references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)
+
+    score = metric.compute(predictions=hypotheses, references=references)
+    return {"wer": 100 * score}
+
+# Load the pretrained model and clear the forced decoder ids and the
+# suppressed-token list on its config.
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
+model.config.forced_decoder_ids = None
+model.config.suppress_tokens = []
+
+from transformers import Seq2SeqTrainingArguments
+
+# Training configuration: step-based run (max_steps) with evaluation and
+# checkpointing every 1000 steps.
+training_args = Seq2SeqTrainingArguments(
+    output_dir="./whisper-tiny-swedish",  # change to a repo name of your choice
+    per_device_train_batch_size=32,
+    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
+    learning_rate=1e-5,
+    warmup_steps=500,
+    max_steps=4000,
+    gradient_checkpointing=False,
+    fp16=True,
+    evaluation_strategy="steps",
+    per_device_eval_batch_size=8,
+    # evaluate by generating text so compute_metrics can decode predictions
+    predict_with_generate=True,
+    generation_max_length=225,
+    save_steps=1000,
+    eval_steps=1000,
+    logging_steps=25,
+    report_to=["tensorboard"],
+    # the best checkpoint is the one with the lowest "wer"
+    load_best_model_at_end=True,
+    metric_for_best_model="wer",
+    greater_is_better=False,
+    push_to_hub=False,
+    dataloader_num_workers=0,
+)
+
+
+# NOTE(review): the feature extractor, not the tokenizer, is passed as
+# `tokenizer` -- confirm this is intended.
+trainer = Seq2SeqTrainer(
+    args=training_args,
+    model=model,
+    train_dataset=common_voice["train"],
+    eval_dataset=common_voice["test"],
+    data_collator=data_collator,
+    compute_metrics=compute_metrics,
+    tokenizer=processor.feature_extractor,
+)
+
+# Start fine-tuning.
+trainer.train()