diff --git a/ML/Pytorch/Basics/lightning_simple_CNN.py b/ML/Pytorch/Basics/lightning_simple_CNN.py
new file mode 100644
index 0000000..6d185a5
--- /dev/null
+++ b/ML/Pytorch/Basics/lightning_simple_CNN.py
@@ -0,0 +1,190 @@
+"""
+Simple PyTorch Lightning example: a small CNN trained on MNIST with a LightningModule and LightningDataModule.
+"""
+
+# Imports
+import torch
+import torch.nn.functional as F # Parameterless functions, like (some) activation functions
+import torchvision.datasets as datasets # Standard datasets
+import torchvision.transforms as transforms # Transformations we can perform on our dataset for augmentation
+from torch import optim # For optimizers like SGD, Adam, etc.
+from torch import nn # All neural network modules
+from torch.utils.data import (
+ DataLoader,
+) # Gives easier dataset management by creating mini-batches etc.
+from tqdm import tqdm # For nice progress bar!
+import pytorch_lightning as pl
+import torchmetrics
+from pytorch_lightning.callbacks import Callback, EarlyStopping
+
+
+precision = "medium"
+torch.set_float32_matmul_precision(precision)
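+# "medium" trades a bit of float32 matmul precision for speed on recent GPUs (lower-precision matmul kernels)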
+criterion = nn.CrossEntropyLoss()
+
+
+## use 20% of training data for validation
+# train_set_size = int(len(train_dataset) * 0.8)
+# valid_set_size = len(train_dataset) - train_set_size
+#
+## split the train set into two
+# seed = torch.Generator().manual_seed(42)
+# train_dataset, val_dataset = torch.utils.data.random_split(
+# train_dataset, [train_set_size, valid_set_size], generator=seed
+# )
+
+
+class CNNLightning(pl.LightningModule):
+    def __init__(self, lr=3e-4, in_channels=1, num_classes=10):
+        super().__init__()
+        self.lr = lr
+        self.train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
+        self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
+        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
+ self.conv1 = nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=8,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ )
+ self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
+ self.conv2 = nn.Conv2d(
+ in_channels=8,
+ out_channels=16,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ )
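+        # Two conv+pool stages halve 28x28 MNIST twice (28 -> 14 -> 7), hence 16 * 7 * 7 inputs to fc1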
+        self.fc1 = nn.Linear(16 * 7 * 7, num_classes)
+
+ def training_step(self, batch, batch_idx):
+ x, y = batch
+ y_hat = self._common_step(x, batch_idx)
+ loss = criterion(y_hat, y)
+        self.train_acc(y_hat, y)
+ self.log(
+ "train_acc_step",
+ self.train_acc,
+ on_step=True,
+ on_epoch=False,
+ prog_bar=True,
+ )
+ return loss
+
+ def training_epoch_end(self, outputs):
+ self.train_acc.reset()
+
+ def test_step(self, batch, batch_idx):
+ x, y = batch
+ y_hat = self._common_step(x, batch_idx)
+ loss = F.cross_entropy(y_hat, y)
+ accuracy = self.test_acc(y_hat, y)
+ self.log("test_loss", loss, on_step=True)
+ self.log("test_acc", accuracy, on_step=True)
+
+ def validation_step(self, batch, batch_idx):
+ x, y = batch
+ y_hat = self._common_step(x, batch_idx)
+ loss = F.cross_entropy(y_hat, y)
+        accuracy = self.val_acc(y_hat, y)
+ self.log("val_loss", loss, on_step=True)
+ self.log("val_acc", accuracy, on_step=True)
+
+ def predict_step(self, batch, batch_idx):
+ x, y = batch
+        y_hat = self._common_step(x, batch_idx)
+ return y_hat
+
+ def _common_step(self, x, batch_idx):
+ x = self.pool(F.relu(self.conv1(x)))
+ x = self.pool(F.relu(self.conv2(x)))
+ x = x.reshape(x.shape[0], -1)
+ y_hat = self.fc1(x)
+ return y_hat
+
+ def configure_optimizers(self):
+ optimizer = optim.Adam(self.parameters(), lr=self.lr)
+ return optimizer
+
+
+class MNISTDataModule(pl.LightningDataModule):
+ def __init__(self, batch_size=512):
+ super().__init__()
+ self.batch_size = batch_size
+
+ def setup(self, stage):
+        mnist_full = datasets.MNIST(
+ root="dataset/", train=True, transform=transforms.ToTensor(), download=True
+ )
+ self.mnist_test = datasets.MNIST(
+ root="dataset/", train=False, transform=transforms.ToTensor(), download=True
+ )
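+        # MNIST's training split has 60,000 images; keep 55,000 for training and 5,000 for validation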
+ self.mnist_train, self.mnist_val = torch.utils.data.random_split(
+ mnist_full, [55000, 5000]
+ )
+
+ def train_dataloader(self):
+ return DataLoader(
+ self.mnist_train,
+ batch_size=self.batch_size,
+ num_workers=6,
+ shuffle=True,
+ )
+
+ def val_dataloader(self):
+ return DataLoader(
+ self.mnist_val, batch_size=self.batch_size, num_workers=2, shuffle=False
+ )
+
+ def test_dataloader(self):
+ return DataLoader(
+ self.mnist_test, batch_size=self.batch_size, num_workers=2, shuffle=False
+ )
+
+
+class MyPrintingCallback(Callback):
+ def on_train_start(self, trainer, pl_module):
+ print("Training is starting")
+
+ def on_train_end(self, trainer, pl_module):
+ print("Training is ending")
+
+
+# Set device (informational only; the Lightning Trainer handles device placement)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Train and evaluate
+if __name__ == "__main__":
+ # Initialize network
+ model_lightning = CNNLightning()
+
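+    # Trainer setup: 16-bit mixed precision on two GPUs with DeepSpeed ZeRO stage 1,
+    # early stopping on the "val_loss" logged in validation_step, and a simple profiler.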
+ trainer = pl.Trainer(
+ #fast_dev_run=True,
+ # overfit_batches=3,
+ max_epochs=5,
+ precision=16,
+ accelerator="gpu",
+ devices=[0,1],
+ callbacks=[EarlyStopping(monitor="val_loss", mode="min")],
+ auto_lr_find=True,
+ enable_model_summary=True,
+ profiler="simple",
+ strategy="deepspeed_stage_1",
+ # accumulate_grad_batches=2,
+ # auto_scale_batch_size="binsearch",
+ # log_every_n_steps=1,
+ )
+
+ dm = MNISTDataModule()
+
+    # Run trainer.tune first to find a good learning rate (auto_lr_find); batch-size scaling is commented out above
+ trainer.tune(model_lightning, dm)
+
+ trainer.fit(
+ model=model_lightning,
+ datamodule=dm,
+ )
+
+ # test model on test loader from LightningDataModule
+ trainer.test(model=model_lightning, datamodule=dm)
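+
+    # A possible next step (sketch): reuse the test dataloader for batch predictions
+    # predictions = trainer.predict(model_lightning, dataloaders=dm.test_dataloader())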
diff --git a/ML/Pytorch/huggingface/.ipynb_checkpoints/finetune_t5_lightning-checkpoint.ipynb b/ML/Pytorch/huggingface/.ipynb_checkpoints/finetune_t5_lightning-checkpoint.ipynb
new file mode 100644
index 0000000..b763a5c
--- /dev/null
+++ b/ML/Pytorch/huggingface/.ipynb_checkpoints/finetune_t5_lightning-checkpoint.ipynb
@@ -0,0 +1,3487 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "87ef8027",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from jupyterthemes.stylefx import set_nb_theme\n",
+ "set_nb_theme('chesterish')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "225eab36",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.simplefilter(\"ignore\")\n",
+ "\n",
+ "import os\n",
+ "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\n",
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n",
+ "\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "\n",
+ "import datasets \n",
+ "import pytorch_lightning as pl\n",
+ "\n",
+ "from datasets import load_dataset, load_metric\n",
+ "\n",
+ "from transformers import (\n",
+ " AutoModel,\n",
+ " AutoModelForSeq2SeqLM,\n",
+ " AutoTokenizer,\n",
+ " DataCollatorForSeq2Seq,\n",
+ " Seq2SeqTrainingArguments,\n",
+ " Seq2SeqTrainer,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "9f7d2829",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Define the LightningDataModule\n",
+ "class MyDataModule(pl.LightningDataModule):\n",
+ " def __init__(self, batch_size):\n",
+ " super().__init__()\n",
+ " self.batch_size = batch_size\n",
+ " \n",
+ " def prepare_data(self):\n",
+ " # Download and preprocess the data\n",
+ " load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train\")\n",
+ " load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"validation[:10%]\")\n",
+ " \n",
+ " def setup(self, stage=None):\n",
+ " # Load and preprocess the data\n",
+ " train_data = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train\")\n",
+ " val_data = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"validation[:10%]\")\n",
+ "\n",
+ " self.train_ds = train_data.map(\n",
+ " self.preprocess_function, \n",
+ " batched=True, \n",
+ " batch_size=self.batch_size, \n",
+ " remove_columns=[\"article\", \"highlights\", \"id\"]\n",
+ " )\n",
+ "\n",
+ " self.val_ds = val_data.map(\n",
+ " self.preprocess_function, \n",
+ " batched=True, \n",
+ " batch_size=self.batch_size,\n",
+ " remove_columns=[\"article\", \"highlights\", \"id\"]\n",
+ " )\n",
+ "\n",
+ " def preprocess_function(self, batch):\n",
+ " inputs = tokenizer(batch[\"article\"], padding=\"max_length\", truncation=True, max_length=512)\n",
+ " outputs = tokenizer(batch[\"highlights\"], padding=\"max_length\", truncation=True, max_length=128)\n",
+ " batch[\"input_ids\"] = inputs.input_ids\n",
+ " batch[\"attention_mask\"] = inputs.attention_mask\n",
+ " batch[\"labels\"] = outputs.input_ids.copy()\n",
+ " return batch\n",
+ "\n",
+ " def train_dataloader(self):\n",
+ " return torch.utils.data.DataLoader(self.train_ds, batch_size=self.batch_size)\n",
+ "\n",
+ " def val_dataloader(self):\n",
+ " return torch.utils.data.DataLoader(self.val_ds, batch_size=self.batch_size)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "a99bdbb0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class MyLightningModule(pl.LightningModule):\n",
+ " def __init__(self, model_name, learning_rate, weight_decay, batch_size):\n",
+ " super().__init__()\n",
+ " self.model_name = model_name\n",
+ " self.learning_rate = learning_rate\n",
+ " self.weight_decay = weight_decay\n",
+ " self.batch_size = batch_size\n",
+ " \n",
+ " # Load the pre-trained model and tokenizer\n",
+ " self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)\n",
+ "\n",
+ " # Load the ROUGE metric\n",
+ " self.metric = load_metric(\"rouge\")\n",
+ "\n",
+ " def forward(self, input_ids, attention_mask, labels=None):\n",
+ " output = self.model(\n",
+ " input_ids=input_ids,\n",
+ " attention_mask=attention_mask,\n",
+ " labels=labels,\n",
+ " )\n",
+ " return output.loss, output.logits\n",
+ " \n",
+ " def training_step(self, batch, batch_idx):\n",
+ " input_ids = batch[\"input_ids\"]\n",
+ " attention_mask = batch[\"attention_mask\"]\n",
+ " labels = batch[\"labels\"]\n",
+ " \n",
+ " loss, logits = self(input_ids, attention_mask, labels)\n",
+ " self.log('train_loss', loss, on_epoch=True, on_step=True)\n",
+ " return {'loss': loss, 'logits': logits}\n",
+ " \n",
+ " def validation_step(self, batch, batch_idx):\n",
+ " input_ids = batch[\"input_ids\"]\n",
+ " attention_mask = batch[\"attention_mask\"]\n",
+ " labels = batch[\"labels\"]\n",
+ " loss, logits = self(input_ids, attention_mask, labels)\n",
+ " self.log('val_loss', loss, on_epoch=True, on_step=False)\n",
+ " return {'loss': loss, 'logits': logits}\n",
+ " \n",
+ " def validation_epoch_end(self, outputs):\n",
+ " decoded_preds = []\n",
+ " decoded_labels = []\n",
+ " for output in outputs:\n",
+ " logits = output['logits']\n",
+ " labels = output['labels']\n",
+ " decoded_preds += self.tokenizer.batch_decode(logits, skip_special_tokens=True)\n",
+ " decoded_labels += self.tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
+ " \n",
+ " scores = self.metric.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=[\"rouge1\"])[\"rouge1\"].mid\n",
+ " \n",
+ " self.log('rouge1_precision', scores.precision, prog_bar=True)\n",
+ " self.log('rouge1_recall', scores.recall, prog_bar=True)\n",
+ " self.log('rouge1_fmeasure', scores.fmeasure, prog_bar=True)\n",
+ " \n",
+ " def configure_optimizers(self):\n",
+ " optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)\n",
+ " return optimizer\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "3c28da7c",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "TypeError",
+ "evalue": "Trainer.__init__() got an unexpected keyword argument 'num_epochs'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[20], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m model \u001b[38;5;241m=\u001b[39m MyLightningModule(model_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mt5-small\u001b[39m\u001b[38;5;124m\"\u001b[39m, learning_rate\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-5\u001b[39m, weight_decay\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-4\u001b[39m, batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16\u001b[39m)\n\u001b[0;32m----> 2\u001b[0m trainer \u001b[38;5;241m=\u001b[39m \u001b[43mpl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mTrainer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevices\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_epochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdeterministic\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogger\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m dm \u001b[38;5;241m=\u001b[39m MyDataModule(batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16\u001b[39m)\n\u001b[1;32m 4\u001b[0m trainer\u001b[38;5;241m.\u001b[39mfit(model, datamodule\u001b[38;5;241m=\u001b[39mdm)\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/pytorch_lightning/utilities/argparse.py:348\u001b[0m, in \u001b[0;36m_defaults_from_env_vars..insert_env_defaults\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 345\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(\u001b[38;5;28mlist\u001b[39m(env_variables\u001b[38;5;241m.\u001b[39mitems()) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlist\u001b[39m(kwargs\u001b[38;5;241m.\u001b[39mitems()))\n\u001b[1;32m 347\u001b[0m \u001b[38;5;66;03m# all args were already moved to kwargs\u001b[39;00m\n\u001b[0;32m--> 348\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+ "\u001b[0;31mTypeError\u001b[0m: Trainer.__init__() got an unexpected keyword argument 'num_epochs'"
+ ]
+ }
+ ],
+ "source": [
+ "model = MyLightningModule(model_name=\"t5-small\", learning_rate=1e-5, weight_decay=1e-4, batch_size=16)\n",
+ "trainer = pl.Trainer(devices=[0], num_epochs=10, deterministic=True, logger=False)\n",
+ "dm = MyDataModule(batch_size=16)\n",
+ "trainer.fit(model, datamodule=dm)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "55729d94",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/ML/Pytorch/huggingface/.ipynb_checkpoints/finetuning_t5_small_cnndaily-checkpoint.ipynb b/ML/Pytorch/huggingface/.ipynb_checkpoints/finetuning_t5_small_cnndaily-checkpoint.ipynb
new file mode 100644
index 0000000..8cfe998
--- /dev/null
+++ b/ML/Pytorch/huggingface/.ipynb_checkpoints/finetuning_t5_small_cnndaily-checkpoint.ipynb
@@ -0,0 +1,3585 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "bd8e3b95",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from jupyterthemes.stylefx import set_nb_theme\n",
+ "set_nb_theme('chesterish')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "8c2a24cb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\n",
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "f45eb6b0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/mrbean/.conda/envs/whisper_lightning/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n",
+ "2023-02-21 15:40:52.888700: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-02-21 15:40:53.473104: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
+ "2023-02-21 15:40:53.473149: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
+ "2023-02-21 15:40:53.473154: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import torch\n",
+ "\n",
+ "import datasets \n",
+ "\n",
+ "from datasets import load_dataset, load_metric\n",
+ "\n",
+ "from transformers import (\n",
+ " AutoModel,\n",
+ " AutoModelForMaskedLM,\n",
+ " AutoModelForSeq2SeqLM,\n",
+ " AutoModelForTokenClassification,\n",
+ " AutoTokenizer,\n",
+ " DataCollatorForSeq2Seq,\n",
+ " pipeline,\n",
+ " Seq2SeqTrainingArguments,\n",
+ " Seq2SeqTrainer,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "7fc4eb40",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/mrbean/.conda/envs/whisper_lightning/lib/python3.10/site-packages/transformers/models/t5/tokenization_t5_fast.py:155: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n",
+ "For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n",
+ "- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.\n",
+ "- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n",
+ "- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load the pre-trained model and tokenizer\n",
+ "model_name = \"t5-small\"\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+ "model = AutoModelForSeq2SeqLM.from_pretrained(model_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "363045f5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Found cached dataset cnn_dailymail (/home/mrbean/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)\n",
+ "Found cached dataset cnn_dailymail (/home/mrbean/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)\n",
+ "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1122/1122 [02:06<00:00, 8.88ba/s]\n",
+ "Loading cached processed dataset at /home/mrbean/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de/cache-2d3b7edd75fb1188.arrow\n"
+ ]
+ }
+ ],
+ "source": [
+ "def preprocess_function(batch):\n",
+ " inputs = tokenizer(batch[\"article\"], padding=\"max_length\", truncation=True, max_length=512)\n",
+ " outputs = tokenizer(batch[\"highlights\"], padding=\"max_length\", truncation=True, max_length=128)\n",
+ " batch[\"input_ids\"] = inputs.input_ids\n",
+ " batch[\"attention_mask\"] = inputs.attention_mask\n",
+ " batch[\"labels\"] = outputs.input_ids.copy()\n",
+ " return batch\n",
+ "\n",
+ "# Load the dataset\n",
+ "train_data = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train\")\n",
+ "val_data = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"validation[:10%]\")\n",
+ "\n",
+ "train_ds = train_data.map(\n",
+ " preprocess_function, \n",
+ " batched=True, \n",
+ " batch_size=256, \n",
+ " remove_columns=[\"article\", \"highlights\", \"id\"]\n",
+ ")\n",
+ "\n",
+ "val_ds = val_data.map(\n",
+ " preprocess_function, \n",
+ " batched=True, \n",
+ " batch_size=256, \n",
+ " remove_columns=[\"article\", \"highlights\", \"id\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "6faa8c86",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_478601/1088570042.py:23: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n",
+ " metric = load_metric(\"rouge\")\n",
+ "max_steps is given, it will override any value given in num_train_epochs\n",
+ "Using cuda_amp half precision backend\n",
+ "The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: id, article, highlights. If id, article, highlights are not expected by `T5ForConditionalGeneration.forward`, you can safely ignore this message.\n",
+ "/home/mrbean/.conda/envs/whisper_lightning/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+ " warnings.warn(\n",
+ "***** Running training *****\n",
+ " Num examples = 0\n",
+ " Num Epochs = 1\n",
+ " Instantaneous batch size per device = 16\n",
+ " Total train batch size (w. parallel, distributed & accumulation) = 16\n",
+ " Gradient Accumulation steps = 1\n",
+ " Total optimization steps = 5000\n",
+ " Number of trainable parameters = 60506624\n"
+ ]
+ },
+ {
+ "ename": "IndexError",
+ "evalue": "Invalid key: 90427 is out of bounds for size 0",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[6], line 47\u001b[0m\n\u001b[1;32m 36\u001b[0m trainer \u001b[38;5;241m=\u001b[39m Seq2SeqTrainer(\n\u001b[1;32m 37\u001b[0m model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m 38\u001b[0m args\u001b[38;5;241m=\u001b[39mtraining_args,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 43\u001b[0m compute_metrics\u001b[38;5;241m=\u001b[39mcompute_metrics,\n\u001b[1;32m 44\u001b[0m )\n\u001b[1;32m 46\u001b[0m \u001b[38;5;66;03m# Start the training\u001b[39;00m\n\u001b[0;32m---> 47\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/transformers/trainer.py:1539\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1534\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_wrapped \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\n\u001b[1;32m 1536\u001b[0m inner_training_loop \u001b[38;5;241m=\u001b[39m find_executable_batch_size(\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inner_training_loop, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_train_batch_size, args\u001b[38;5;241m.\u001b[39mauto_find_batch_size\n\u001b[1;32m 1538\u001b[0m )\n\u001b[0;32m-> 1539\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1540\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1541\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1542\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1544\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/transformers/trainer.py:1761\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 1758\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_load_rng_state(resume_from_checkpoint)\n\u001b[1;32m 1760\u001b[0m step \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m\n\u001b[0;32m-> 1761\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m step, inputs \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(epoch_iterator):\n\u001b[1;32m 1762\u001b[0m \n\u001b[1;32m 1763\u001b[0m \u001b[38;5;66;03m# Skip past any already trained steps if resuming training\u001b[39;00m\n\u001b[1;32m 1764\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m steps_trained_in_current_epoch \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 1765\u001b[0m steps_trained_in_current_epoch \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/torch/utils/data/dataloader.py:628\u001b[0m, in \u001b[0;36m_BaseDataLoaderIter.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 625\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampler_iter \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 626\u001b[0m \u001b[38;5;66;03m# TODO(https://github.com/pytorch/pytorch/issues/76750)\u001b[39;00m\n\u001b[1;32m 627\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reset() \u001b[38;5;66;03m# type: ignore[call-arg]\u001b[39;00m\n\u001b[0;32m--> 628\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_next_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 630\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dataset_kind \u001b[38;5;241m==\u001b[39m _DatasetKind\u001b[38;5;241m.\u001b[39mIterable \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 632\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called:\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/torch/utils/data/dataloader.py:671\u001b[0m, in \u001b[0;36m_SingleProcessDataLoaderIter._next_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 669\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_next_data\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 670\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_next_index() \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[0;32m--> 671\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset_fetcher\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfetch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[1;32m 672\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory:\n\u001b[1;32m 673\u001b[0m data \u001b[38;5;241m=\u001b[39m _utils\u001b[38;5;241m.\u001b[39mpin_memory\u001b[38;5;241m.\u001b[39mpin_memory(data, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory_device)\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:58\u001b[0m, in \u001b[0;36m_MapDatasetFetcher.fetch\u001b[0;34m(self, possibly_batched_index)\u001b[0m\n\u001b[1;32m 56\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39m__getitems__(possibly_batched_index)\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 58\u001b[0m data \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[idx] \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m possibly_batched_index]\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 60\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[possibly_batched_index]\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:58\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 56\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39m__getitems__(possibly_batched_index)\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 58\u001b[0m data \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdataset\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m possibly_batched_index]\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 60\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[possibly_batched_index]\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/datasets/arrow_dataset.py:2601\u001b[0m, in \u001b[0;36mDataset.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2599\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, key): \u001b[38;5;66;03m# noqa: F811\u001b[39;00m\n\u001b[1;32m 2600\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools).\"\"\"\u001b[39;00m\n\u001b[0;32m-> 2601\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_getitem\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2602\u001b[0m \u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2603\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/datasets/arrow_dataset.py:2585\u001b[0m, in \u001b[0;36mDataset._getitem\u001b[0;34m(self, key, **kwargs)\u001b[0m\n\u001b[1;32m 2583\u001b[0m format_kwargs \u001b[38;5;241m=\u001b[39m format_kwargs \u001b[38;5;28;01mif\u001b[39;00m format_kwargs \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m {}\n\u001b[1;32m 2584\u001b[0m formatter \u001b[38;5;241m=\u001b[39m get_formatter(format_type, features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mformat_kwargs)\n\u001b[0;32m-> 2585\u001b[0m pa_subtable \u001b[38;5;241m=\u001b[39m \u001b[43mquery_table\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindices\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_indices\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_indices\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 2586\u001b[0m formatted_output \u001b[38;5;241m=\u001b[39m format_table(\n\u001b[1;32m 2587\u001b[0m pa_subtable, key, formatter\u001b[38;5;241m=\u001b[39mformatter, format_columns\u001b[38;5;241m=\u001b[39mformat_columns, output_all_columns\u001b[38;5;241m=\u001b[39moutput_all_columns\n\u001b[1;32m 2588\u001b[0m )\n\u001b[1;32m 2589\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m formatted_output\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/datasets/formatting/formatting.py:588\u001b[0m, in \u001b[0;36mquery_table\u001b[0;34m(table, key, indices)\u001b[0m\n\u001b[1;32m 586\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 587\u001b[0m size \u001b[38;5;241m=\u001b[39m indices\u001b[38;5;241m.\u001b[39mnum_rows \u001b[38;5;28;01mif\u001b[39;00m indices \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m table\u001b[38;5;241m.\u001b[39mnum_rows\n\u001b[0;32m--> 588\u001b[0m \u001b[43m_check_valid_index_key\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 589\u001b[0m \u001b[38;5;66;03m# Query the main table\u001b[39;00m\n\u001b[1;32m 590\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m indices \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/datasets/formatting/formatting.py:531\u001b[0m, in \u001b[0;36m_check_valid_index_key\u001b[0;34m(key, size)\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mint\u001b[39m):\n\u001b[1;32m 530\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (key \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m key \u001b[38;5;241m+\u001b[39m size \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m0\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (key \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m size):\n\u001b[0;32m--> 531\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIndexError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid key: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m is out of bounds for size \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msize\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 533\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mslice\u001b[39m):\n",
+ "\u001b[0;31mIndexError\u001b[0m: Invalid key: 90427 is out of bounds for size 0"
+ ]
+ }
+ ],
+ "source": [
+ "class MyLightningModule(pl.LightningModule):\n",
+ " def __init__(self, model_name, learning_rate, weight_decay, batch_size, num_training_steps):\n",
+ " super().__init__()\n",
+ " self.model_name = model_name\n",
+ " self.learning_rate = learning_rate\n",
+ " self.weight_decay = weight_decay\n",
+ " self.batch_size = batch_size\n",
+ " self.num_training_steps = num_training_steps\n",
+ " \n",
+ " # Load the pre-trained model and tokenizer\n",
+ " self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)\n",
+ " self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)\n",
+ "\n",
+ " def forward(self, input_ids, attention_mask, labels=None):\n",
+ " output = self.model(\n",
+ " input_ids=input_ids,\n",
+ " attention_mask=attention_mask,\n",
+ " labels=labels,\n",
+ " )\n",
+ " return output.loss, output.logits\n",
+ " \n",
+ " def training_step(self, batch, batch_idx):\n",
+ " input_ids = batch[\"input_ids\"]\n",
+ " attention_mask = batch[\"attention_mask\"]\n",
+ " labels = batch[\"labels\"]\n",
+ " \n",
+ " loss\n",
+ "\n",
+ "# Define the data collator\n",
+ "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n",
+ "\n",
+ "# Initialize the trainer arguments\n",
+ "training_args = Seq2SeqTrainingArguments(\n",
+ " output_dir=\"./results\",\n",
+ " learning_rate=1e-5,\n",
+ " per_device_train_batch_size=16,\n",
+ " per_device_eval_batch_size=16,\n",
+ " max_steps=5000,\n",
+ " weight_decay=1e-4,\n",
+ " push_to_hub=False,\n",
+ " evaluation_strategy = \"steps\",\n",
+ " eval_steps = 50,\n",
+ " generation_max_length=128,\n",
+ " predict_with_generate=True,\n",
+ " logging_steps=100,\n",
+ " gradient_accumulation_steps=1,\n",
+ " fp16=True,\n",
+ ")\n",
+ "\n",
+ "# Load the ROUGE metric\n",
+ "metric = load_metric(\"rouge\")\n",
+ "\n",
+ "# Define the evaluation function\n",
+ "def compute_metrics(pred):\n",
+ " labels = pred.label_ids\n",
+ " preds = pred.predictions\n",
+ " decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
+ " decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
+ " scores = metric.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=[\"rouge1\"])[\"rouge1\"].mid\n",
+ " return {\"rouge1_precision\": scores.precision, \"rouge1_recall\": scores.recall, \"rouge1_fmeasure\": scores.fmeasure}\n",
+ "\n",
+ "\n",
+ "# Initialize the trainer\n",
+ "trainer = Seq2SeqTrainer(\n",
+ " model=model,\n",
+ " args=training_args,\n",
+ " train_dataset=train_data,\n",
+ " eval_dataset=val_data,\n",
+ " data_collator=data_collator,\n",
+ " tokenizer=tokenizer,\n",
+ " compute_metrics=compute_metrics,\n",
+ ")\n",
+ "\n",
+ "# Start the training\n",
+ "trainer.train()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1b0f9a76",
+ "metadata": {},
+ "source": [
+ "# Steps:\n",
+ "1. Rewrite code to be more general\n",
+ "\n",
+ "a) Data loading should be from disk rather than their load_dataset, and should be on the fly\n",
+ "\n",
+ "b) Rewrite to Lightning code, Trainer etc using Lightning, compute metric fine that we use huggingface"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ff03c8bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aafc4b27",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/ML/Pytorch/huggingface/.ipynb_checkpoints/learning-checkpoint.ipynb b/ML/Pytorch/huggingface/.ipynb_checkpoints/learning-checkpoint.ipynb
new file mode 100644
index 0000000..1a9ac79
--- /dev/null
+++ b/ML/Pytorch/huggingface/.ipynb_checkpoints/learning-checkpoint.ipynb
@@ -0,0 +1,580 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fc8e5ea0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "print(torch.cuda.is_available())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d8a1e039",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import pipeline\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6ad73024",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "classifier = pipeline(\"zero-shot-classification\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "04f7e02c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "classifier(\n",
+ " \"This is a course about the Transformers library\",\n",
+ " candidate_labels=[\"machine learning\", \"gym\", \"food\"],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6fb246c2",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "from transformers import pipeline\n",
+ "generator = pipeline(task=\"text-generation\", model=\"bigscience/bloom-1b7\", device=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c4e174f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoModelForTokenClassification, AutoModel, AutoTokenizer\n",
+ "import torch\n",
+ "\n",
+ "# Define input text and pre-trained model checkpoint\n",
+ "text = \"My name is wolfgang and I live in berlin\"\n",
+ "checkpoint = \"Jean-Baptiste/roberta-large-ner-english\"\n",
+ "\n",
+ "# Instantiate tokenizer and encode input text\n",
+ "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
+ "inputs = tokenizer(text, padding=True, truncation=True, return_tensors=\"pt\")\n",
+ "\n",
+ "# Instantiate model and generate output\n",
+ "model = AutoModel.from_pretrained(checkpoint)\n",
+ "outputs = model(**inputs)\n",
+ "print(outputs[0].shape)\n",
+ "\n",
+ "# Instantiate token classification model and generate predictions\n",
+ "model = AutoModelForTokenClassification.from_pretrained(checkpoint)\n",
+ "outputs = model(**inputs)\n",
+ "predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)\n",
+ "print(predictions)\n",
+ "print(model.config.id2label)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8212bbaa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')\n",
+ "model = AutoModelForMaskedLM.from_pretrained(\"xlm-roberta-large\")\n",
+ "\n",
+ "# prepare input\n",
+ "text = \"Replace me by any text you'd like.\"\n",
+ "encoded_input = tokenizer(text, return_tensors='pt')\n",
+ "\n",
+ "# forward pass\n",
+ "output = model(**encoded_input)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "314cba41",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
+ "\n",
+ "# Load the pre-trained tokenizer and model\n",
+ "tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')\n",
+ "model = AutoModelForMaskedLM.from_pretrained(\"xlm-roberta-large\")\n",
+ "\n",
+ "# Define the input sentence with a masked token\n",
+ "text = \"I want to a new car tomorrow.\"\n",
+ "\n",
+ "# Tokenize the input sentence, replacing the masked token with a special [MASK] token\n",
+ "encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')\n",
+ "\n",
+ "print(output.logits.shape)\n",
+ "print(encoded_input['input_ids'][0].tolist().index(tokenizer.mask_token_id))\n",
+ "\n",
+ "# Extract the predicted probabilities for the masked token\n",
+ "predicted_probabilities = output.logits[0, encoded_input['input_ids'][0].tolist().index(tokenizer.mask_token_id)]\n",
+ "predicted_probabilities = torch.nn.functional.softmax(predicted_probabilities, dim=-1)\n",
+ "\n",
+ "# Get the top-k most probable predictions for the masked token\n",
+ "k = 5\n",
+ "top_k = torch.topk(predicted_probabilities, k)\n",
+ "for i in range(k):\n",
+ " token = tokenizer.convert_ids_to_tokens(top_k.indices[i].item())\n",
+ " score = top_k.values[i].item()\n",
+ " print(f\"Prediction {i+1}: '{token}' with probability {score:.5f}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6187e77e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
+ "\n",
+ "sequences = [\n",
+ " \"Using a Transformer network is simple\",\n",
+ " \"The quick brown fox jumps over the lazy dog\",\n",
+ " \"To be or not to be, that is the question\"\n",
+ "]\n",
+ "\n",
+ "# Tokenize the input sequences and convert them to padded and truncated integer token IDs\n",
+ "inputs = tokenizer(\n",
+ " sequences,\n",
+ " padding=True,\n",
+ " truncation=True,\n",
+ " return_tensors=\"pt\"\n",
+ ")\n",
+ "\n",
+ "# Print the resulting input IDs and attention masks\n",
+ "print(inputs['input_ids'])\n",
+ "print(inputs['attention_mask'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fc259c5a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "43466db6",
+ "metadata": {},
+ "source": [
+ "Huggingface:\n",
+ "\n",
+ "1. Understanding how to use the Pipeline (probably most useful) for various tasks, easy to use, and the different subtasks it can do like translation, QA, zero shot, sentiment analysis, token classification, etc. \n",
+ "2. Understood how pipeline works in more detail by using AutoModel for various tasks as well as AutoTokenizer\n",
+ "3. Load dataset\n",
+ "4. How to finetune\n",
+ "5. How to evaluate\n",
+ "6. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "97c474f2",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3ed5d8c2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification\n",
+ "\n",
+ "# Same as before\n",
+ "checkpoint = \"bert-base-uncased\"\n",
+ "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
+ "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
+ "sequences = [\n",
+ " \"I've been waiting for a HuggingFace course my whole life.\",\n",
+ " \"This course is amazing!\",\n",
+ "]\n",
+ "batch = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"pt\")\n",
+ "\n",
+ "# This is new\n",
+ "batch[\"labels\"] = torch.tensor([1, 1])\n",
+ "\n",
+ "optimizer = AdamW(model.parameters())\n",
+ "loss = model(**batch).loss\n",
+ "loss.backward()\n",
+ "optimizer.step()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c598624f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datasets import load_dataset\n",
+ "raw_datasets = load_dataset(\"glue\", \"mrpc\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cd296227",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "raw_train_dataset = raw_datasets[\"train\"]\n",
+ "raw_train_dataset[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e462947a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datasets import load_dataset\n",
+ "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
+ "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
+ "\n",
+ "checkpoint = \"bert-base-uncased\"\n",
+ "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
+ "\n",
+ "def tokenize_function(example):\n",
+ " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)\n",
+ "\n",
+ "\n",
+ "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
+ "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
+ "\n",
+ "\n",
+ "from transformers import TrainingArguments\n",
+ "training_args = TrainingArguments(\"test-trainer\")\n",
+ "\n",
+ "from transformers import AutoModelForSequenceClassification\n",
+ "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
+ "\n",
+ "import numpy as np\n",
+ "import evaluate\n",
+ "\n",
+ "def compute_metrics(eval_preds):\n",
+ " metric = evaluate.load(\"glue\", \"mrpc\")\n",
+ " logits, labels = eval_preds\n",
+ " predictions = np.argmax(logits, axis=-1)\n",
+ " return metric.compute(predictions=predictions, references=labels)\n",
+ "\n",
+ "training_args = TrainingArguments(\"test-trainer\", evaluation_strategy=\"epoch\")\n",
+ "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
+ "\n",
+ "trainer = Trainer(\n",
+ " model,\n",
+ " training_args,\n",
+ " train_dataset=tokenized_datasets[\"train\"],\n",
+ " eval_dataset=tokenized_datasets[\"validation\"],\n",
+ " data_collator=data_collator,\n",
+ " tokenizer=tokenizer,\n",
+ " compute_metrics=compute_metrics,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e2795dc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import TrainingArguments\n",
+ "training_args = TrainingArguments(\"test-trainer\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3af29cd5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoModelForSequenceClassification\n",
+ "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "817f644e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import evaluate"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "42819a6c",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "def compute_metrics(eval_preds):\n",
+ " metric = evaluate.load(\"glue\", \"mrpc\")\n",
+ " logits, labels = eval_preds\n",
+ " predictions = np.argmax(logits, axis=-1)\n",
+ " return metric.compute(predictions=predictions, references=labels)\n",
+ "\n",
+ "training_args = TrainingArguments(\"test-trainer\", evaluation_strategy=\"epoch\")\n",
+ "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
+ "\n",
+ "trainer = Trainer(\n",
+ " model,\n",
+ " training_args,\n",
+ " train_dataset=tokenized_datasets[\"train\"],\n",
+ " eval_dataset=tokenized_datasets[\"validation\"],\n",
+ " data_collator=data_collator,\n",
+ " tokenizer=tokenizer,\n",
+ " compute_metrics=compute_metrics,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eb5986b0",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer\n",
+ "from datasets import load_dataset\n",
+ "batch_size=32\n",
+ "\n",
+ "# Define the generator function to preprocess the data in batches\n",
+ "def preprocess_generator(examples):\n",
+ " for i in range(0, len(examples[\"article\"]), batch_size):\n",
+ " batch = examples[\"article\"][i:i+batch_size]\n",
+ " targets = examples[\"highlights\"][i:i+batch_size]\n",
+ " model_inputs = tokenizer(batch, max_length=512, padding=\"max_length\", truncation=True)\n",
+ " with tokenizer.as_target_tokenizer():\n",
+ " model_targets = tokenizer(targets, max_length=128, padding=\"max_length\", truncation=True)\n",
+ " model_inputs[\"labels\"] = model_targets[\"input_ids\"]\n",
+ " yield model_inputs\n",
+ "\n",
+ "def preprocess_function(examples):\n",
+ " articles = [ex for ex in examples[\"article\"]]\n",
+ " summaries = [ex for ex in examples[\"highlights\"]]\n",
+ "\n",
+ " model_inputs = tokenizer(articles, max_length=512, padding=\"max_length\", truncation=True)\n",
+ " with tokenizer.as_target_tokenizer():\n",
+ " model_targets = tokenizer(summaries, max_length=128, padding=\"max_length\", truncation=True)\n",
+ " \n",
+ " model_inputs[\"labels\"] = model_targets[\"input_ids\"]\n",
+ " return model_inputs\n",
+ " \n",
+ "# Load the dataset\n",
+ "raw_datasets = load_dataset(\"cnn_dailymail\", \"3.0.0\")\n",
+ "preprocessed_datasets = raw_datasets.map(preprocess_function, batched=True, num_proc=4)\n",
+ "\n",
+ "# Load the pre-trained model and tokenizer\n",
+ "model_name = \"t5-small\"\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+ "model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
+ "\n",
+ "# Define the data collator\n",
+ "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n",
+ "\n",
+ "# Initialize the trainer arguments\n",
+ "training_args = Seq2SeqTrainingArguments(\n",
+ " output_dir=\"./results\",\n",
+ " evaluation_strategy = \"epoch\",\n",
+ " learning_rate=2e-5,\n",
+ " per_device_train_batch_size=batch_size,\n",
+ " max_steps=1000,\n",
+ " weight_decay=0.01,\n",
+ " push_to_hub=False,\n",
+ ")\n",
+ "\n",
+ "# Initialize the trainer\n",
+ "trainer = Seq2SeqTrainer(\n",
+ " model=model,\n",
+ " args=training_args,\n",
+ " train_dataset=train_ds,\n",
+ " data_collator=data_collator,\n",
+ " tokenizer=tokenizer,\n",
+ ")\n",
+ "\n",
+ "# Start the training\n",
+ "trainer.train()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7d62583e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datasets import load_metric"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d310a7b3",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "preprocessed_datasets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "99d422cc",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "# Load the pre-trained model and tokenizer\n",
+ "model_name = \"t5-small\"\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+ "model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
+ "\n",
+ "# Define the data collator\n",
+ "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n",
+ "\n",
+ "# Initialize the trainer arguments\n",
+ "training_args = Seq2SeqTrainingArguments(\n",
+ " output_dir=\"./results\",\n",
+ " learning_rate=2e-5,\n",
+ " per_device_train_batch_size=batch_size,\n",
+ " max_steps=5000,\n",
+ " weight_decay=0.01,\n",
+ " push_to_hub=False,\n",
+ " evaluation_strategy = \"steps\",\n",
+ " eval_steps = 50,\n",
+ ")\n",
+ "\n",
+ "# Load the ROUGE metric\n",
+ "metric = load_metric(\"rouge\")\n",
+ "\n",
+ "# Define the evaluation function\n",
+ "def compute_metrics(pred):\n",
+ " labels = pred.label_ids\n",
+ " preds = pred.predictions\n",
+ " \n",
+ " decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
+ " decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
+ " \n",
+ " scores = metric.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=[\"rouge1\"])[\"rouge1\"].mid\n",
+ " \n",
+ " return {\"rouge1_precision\": scores.precision, \"rouge1_recall\": scores.recall, \"rouge1_fmeasure\": scores.fmeasure}\n",
+ "\n",
+ "\n",
+ "# Initialize the trainer\n",
+ "trainer = Seq2SeqTrainer(\n",
+ " model=model,\n",
+ " args=training_args,\n",
+ " train_dataset=preprocessed_datasets[\"train\"],\n",
+ " eval_dataset=preprocessed_datasets[\"validation\"],\n",
+ " data_collator=data_collator,\n",
+ " tokenizer=tokenizer,\n",
+ " compute_metrics=compute_metrics,\n",
+ ")\n",
+ "\n",
+ "# Start the training\n",
+ "trainer.train()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a5e97b57",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install nltk\n",
+ "!pip install rouge_score"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "558c3e66",
+ "metadata": {},
+ "source": [
+ "# Goal:\n",
+ "\n",
+ "1. Implement full training from dataloading (dailycnn dataset), to model training, evaluation, etc, using HF. \n",
+ "* Right now: stuck on on the fly dataset loading, we don't want to cache because this would take a lot of disk space etc.\n",
+ "\n",
+ "2. After we get step 1) working, we want to go deeper on every step, so download the dataset and load it as a custom dataset rather than using huggingface simple API, in order to make it more general. Compare with loading the ds as a custom HF dataset or using pytorch class together with lightning. Speed difference? Convenience? Also we want to use the lightning Trainer so see how we can integrate that. And then compare HF to the lightning + hf model approach and see what we like the most."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "624d49ca",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/ML/Pytorch/huggingface/finetune_t5_lightning.ipynb b/ML/Pytorch/huggingface/finetune_t5_lightning.ipynb
new file mode 100644
index 0000000..b763a5c
--- /dev/null
+++ b/ML/Pytorch/huggingface/finetune_t5_lightning.ipynb
@@ -0,0 +1,3487 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "87ef8027",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from jupyterthemes.stylefx import set_nb_theme\n",
+ "set_nb_theme('chesterish')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "225eab36",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.simplefilter(\"ignore\")\n",
+ "\n",
+ "import os\n",
+ "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\n",
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n",
+ "\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "\n",
+ "import datasets \n",
+ "import pytorch_lightning as pl\n",
+ "\n",
+ "from datasets import load_dataset, load_metric\n",
+ "\n",
+ "from transformers import (\n",
+ " AutoModel,\n",
+ " AutoModelForSeq2SeqLM,\n",
+ " AutoTokenizer,\n",
+ " DataCollatorForSeq2Seq,\n",
+ " Seq2SeqTrainingArguments,\n",
+ " Seq2SeqTrainer,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "9f7d2829",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Define the LightningDataModule\n",
+ "class MyDataModule(pl.LightningDataModule):\n",
+ " def __init__(self, batch_size):\n",
+ " super().__init__()\n",
+ " self.batch_size = batch_size\n",
+ " \n",
+ " def prepare_data(self):\n",
+ " # Download and preprocess the data\n",
+ " load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train\")\n",
+ " load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"validation[:10%]\")\n",
+ " \n",
+ " def setup(self, stage=None):\n",
+ " # Load and preprocess the data\n",
+ " train_data = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train\")\n",
+ " val_data = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"validation[:10%]\")\n",
+ "\n",
+ " self.train_ds = train_data.map(\n",
+ " self.preprocess_function, \n",
+ " batched=True, \n",
+ " batch_size=self.batch_size, \n",
+ " remove_columns=[\"article\", \"highlights\", \"id\"]\n",
+ " )\n",
+ "\n",
+ " self.val_ds = val_data.map(\n",
+ " self.preprocess_function, \n",
+ " batched=True, \n",
+ " batch_size=self.batch_size,\n",
+ " remove_columns=[\"article\", \"highlights\", \"id\"]\n",
+ " )\n",
+ "\n",
+ " def preprocess_function(self, batch):\n",
+ " inputs = tokenizer(batch[\"article\"], padding=\"max_length\", truncation=True, max_length=512)\n",
+ " outputs = tokenizer(batch[\"highlights\"], padding=\"max_length\", truncation=True, max_length=128)\n",
+ " batch[\"input_ids\"] = inputs.input_ids\n",
+ " batch[\"attention_mask\"] = inputs.attention_mask\n",
+ " batch[\"labels\"] = outputs.input_ids.copy()\n",
+ " return batch\n",
+ "\n",
+ " def train_dataloader(self):\n",
+ " return torch.utils.data.DataLoader(self.train_ds, batch_size=self.batch_size)\n",
+ "\n",
+ " def val_dataloader(self):\n",
+ " return torch.utils.data.DataLoader(self.val_ds, batch_size=self.batch_size)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "a99bdbb0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class MyLightningModule(pl.LightningModule):\n",
+ " def __init__(self, model_name, learning_rate, weight_decay, batch_size):\n",
+ " super().__init__()\n",
+ " self.model_name = model_name\n",
+ " self.learning_rate = learning_rate\n",
+ " self.weight_decay = weight_decay\n",
+ " self.batch_size = batch_size\n",
+ " \n",
+ " # Load the pre-trained model and tokenizer\n",
+ " self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)\n",
+ "\n",
+ " # Load the ROUGE metric\n",
+ " self.metric = load_metric(\"rouge\")\n",
+ "\n",
+ " def forward(self, input_ids, attention_mask, labels=None):\n",
+ " output = self.model(\n",
+ " input_ids=input_ids,\n",
+ " attention_mask=attention_mask,\n",
+ " labels=labels,\n",
+ " )\n",
+ " return output.loss, output.logits\n",
+ " \n",
+ " def training_step(self, batch, batch_idx):\n",
+ " input_ids = batch[\"input_ids\"]\n",
+ " attention_mask = batch[\"attention_mask\"]\n",
+ " labels = batch[\"labels\"]\n",
+ " \n",
+ " loss, logits = self(input_ids, attention_mask, labels)\n",
+ " self.log('train_loss', loss, on_epoch=True, on_step=True)\n",
+ " return {'loss': loss, 'logits': logits}\n",
+ " \n",
+ " def validation_step(self, batch, batch_idx):\n",
+ " input_ids = batch[\"input_ids\"]\n",
+ " attention_mask = batch[\"attention_mask\"]\n",
+ " labels = batch[\"labels\"]\n",
+ " loss, logits = self(input_ids, attention_mask, labels)\n",
+ " self.log('val_loss', loss, on_epoch=True, on_step=False)\n",
+ " return {'loss': loss, 'logits': logits}\n",
+ " \n",
+ " def validation_epoch_end(self, outputs):\n",
+ " decoded_preds = []\n",
+ " decoded_labels = []\n",
+ " for output in outputs:\n",
+ " logits = output['logits']\n",
+ " labels = output['labels']\n",
+ " decoded_preds += self.tokenizer.batch_decode(logits, skip_special_tokens=True)\n",
+ " decoded_labels += self.tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
+ " \n",
+ " scores = self.metric.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=[\"rouge1\"])[\"rouge1\"].mid\n",
+ " \n",
+ " self.log('rouge1_precision', scores.precision, prog_bar=True)\n",
+ " self.log('rouge1_recall', scores.recall, prog_bar=True)\n",
+ " self.log('rouge1_fmeasure', scores.fmeasure, prog_bar=True)\n",
+ " \n",
+ " def configure_optimizers(self):\n",
+ " optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)\n",
+ " return optimizer\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "3c28da7c",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "TypeError",
+ "evalue": "Trainer.__init__() got an unexpected keyword argument 'num_epochs'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[20], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m model \u001b[38;5;241m=\u001b[39m MyLightningModule(model_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mt5-small\u001b[39m\u001b[38;5;124m\"\u001b[39m, learning_rate\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-5\u001b[39m, weight_decay\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-4\u001b[39m, batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16\u001b[39m)\n\u001b[0;32m----> 2\u001b[0m trainer \u001b[38;5;241m=\u001b[39m \u001b[43mpl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mTrainer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevices\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_epochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdeterministic\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogger\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m dm \u001b[38;5;241m=\u001b[39m MyDataModule(batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16\u001b[39m)\n\u001b[1;32m 4\u001b[0m trainer\u001b[38;5;241m.\u001b[39mfit(model, datamodule\u001b[38;5;241m=\u001b[39mdm)\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/pytorch_lightning/utilities/argparse.py:348\u001b[0m, in \u001b[0;36m_defaults_from_env_vars..insert_env_defaults\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 345\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(\u001b[38;5;28mlist\u001b[39m(env_variables\u001b[38;5;241m.\u001b[39mitems()) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlist\u001b[39m(kwargs\u001b[38;5;241m.\u001b[39mitems()))\n\u001b[1;32m 347\u001b[0m \u001b[38;5;66;03m# all args were already moved to kwargs\u001b[39;00m\n\u001b[0;32m--> 348\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+ "\u001b[0;31mTypeError\u001b[0m: Trainer.__init__() got an unexpected keyword argument 'num_epochs'"
+ ]
+ }
+ ],
+ "source": [
+ "model = MyLightningModule(model_name=\"t5-small\", learning_rate=1e-5, weight_decay=1e-4, batch_size=16)\n",
+ "trainer = pl.Trainer(devices=[0], num_epochs=10, deterministic=True, logger=False)\n",
+ "dm = MyDataModule(batch_size=16)\n",
+ "trainer.fit(model, datamodule=dm)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "55729d94",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/ML/Pytorch/huggingface/finetuning_t5_small_cnndaily.ipynb b/ML/Pytorch/huggingface/finetuning_t5_small_cnndaily.ipynb
new file mode 100644
index 0000000..8cfe998
--- /dev/null
+++ b/ML/Pytorch/huggingface/finetuning_t5_small_cnndaily.ipynb
@@ -0,0 +1,3585 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "bd8e3b95",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from jupyterthemes.stylefx import set_nb_theme\n",
+ "set_nb_theme('chesterish')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "8c2a24cb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\n",
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "f45eb6b0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/mrbean/.conda/envs/whisper_lightning/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n",
+ "2023-02-21 15:40:52.888700: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+ "2023-02-21 15:40:53.473104: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
+ "2023-02-21 15:40:53.473149: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
+ "2023-02-21 15:40:53.473154: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import torch\n",
+ "\n",
+ "import datasets \n",
+ "\n",
+ "from datasets import load_dataset, load_metric\n",
+ "\n",
+ "from transformers import (\n",
+ " AutoModel,\n",
+ " AutoModelForMaskedLM,\n",
+ " AutoModelForSeq2SeqLM,\n",
+ " AutoModelForTokenClassification,\n",
+ " AutoTokenizer,\n",
+ " DataCollatorForSeq2Seq,\n",
+ " pipeline,\n",
+ " Seq2SeqTrainingArguments,\n",
+ " Seq2SeqTrainer,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "7fc4eb40",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/mrbean/.conda/envs/whisper_lightning/lib/python3.10/site-packages/transformers/models/t5/tokenization_t5_fast.py:155: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n",
+ "For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n",
+ "- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.\n",
+ "- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n",
+ "- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load the pre-trained model and tokenizer\n",
+ "model_name = \"t5-small\"\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+ "model = AutoModelForSeq2SeqLM.from_pretrained(model_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "363045f5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Found cached dataset cnn_dailymail (/home/mrbean/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)\n",
+ "Found cached dataset cnn_dailymail (/home/mrbean/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)\n",
+ "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1122/1122 [02:06<00:00, 8.88ba/s]\n",
+ "Loading cached processed dataset at /home/mrbean/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de/cache-2d3b7edd75fb1188.arrow\n"
+ ]
+ }
+ ],
+ "source": [
+ "def preprocess_function(batch):\n",
+ " inputs = tokenizer(batch[\"article\"], padding=\"max_length\", truncation=True, max_length=512)\n",
+ " outputs = tokenizer(batch[\"highlights\"], padding=\"max_length\", truncation=True, max_length=128)\n",
+ " batch[\"input_ids\"] = inputs.input_ids\n",
+ " batch[\"attention_mask\"] = inputs.attention_mask\n",
+ " batch[\"labels\"] = outputs.input_ids.copy()\n",
+ " return batch\n",
+ "\n",
+ "# Load the dataset\n",
+ "train_data = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train\")\n",
+ "val_data = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"validation[:10%]\")\n",
+ "\n",
+ "train_ds = train_data.map(\n",
+ " preprocess_function, \n",
+ " batched=True, \n",
+ " batch_size=256, \n",
+ " remove_columns=[\"article\", \"highlights\", \"id\"]\n",
+ ")\n",
+ "\n",
+ "val_ds = val_data.map(\n",
+ " preprocess_function, \n",
+ " batched=True, \n",
+ " batch_size=256, \n",
+ " remove_columns=[\"article\", \"highlights\", \"id\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "6faa8c86",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_478601/1088570042.py:23: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n",
+ " metric = load_metric(\"rouge\")\n",
+ "max_steps is given, it will override any value given in num_train_epochs\n",
+ "Using cuda_amp half precision backend\n",
+ "The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: id, article, highlights. If id, article, highlights are not expected by `T5ForConditionalGeneration.forward`, you can safely ignore this message.\n",
+ "/home/mrbean/.conda/envs/whisper_lightning/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+ " warnings.warn(\n",
+ "***** Running training *****\n",
+ " Num examples = 0\n",
+ " Num Epochs = 1\n",
+ " Instantaneous batch size per device = 16\n",
+ " Total train batch size (w. parallel, distributed & accumulation) = 16\n",
+ " Gradient Accumulation steps = 1\n",
+ " Total optimization steps = 5000\n",
+ " Number of trainable parameters = 60506624\n"
+ ]
+ },
+ {
+ "ename": "IndexError",
+ "evalue": "Invalid key: 90427 is out of bounds for size 0",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[6], line 47\u001b[0m\n\u001b[1;32m 36\u001b[0m trainer \u001b[38;5;241m=\u001b[39m Seq2SeqTrainer(\n\u001b[1;32m 37\u001b[0m model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m 38\u001b[0m args\u001b[38;5;241m=\u001b[39mtraining_args,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 43\u001b[0m compute_metrics\u001b[38;5;241m=\u001b[39mcompute_metrics,\n\u001b[1;32m 44\u001b[0m )\n\u001b[1;32m 46\u001b[0m \u001b[38;5;66;03m# Start the training\u001b[39;00m\n\u001b[0;32m---> 47\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/transformers/trainer.py:1539\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1534\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_wrapped \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\n\u001b[1;32m 1536\u001b[0m inner_training_loop \u001b[38;5;241m=\u001b[39m find_executable_batch_size(\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inner_training_loop, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_train_batch_size, args\u001b[38;5;241m.\u001b[39mauto_find_batch_size\n\u001b[1;32m 1538\u001b[0m )\n\u001b[0;32m-> 1539\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1540\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1541\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1542\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1544\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/transformers/trainer.py:1761\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 1758\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_load_rng_state(resume_from_checkpoint)\n\u001b[1;32m 1760\u001b[0m step \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m\n\u001b[0;32m-> 1761\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m step, inputs \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(epoch_iterator):\n\u001b[1;32m 1762\u001b[0m \n\u001b[1;32m 1763\u001b[0m \u001b[38;5;66;03m# Skip past any already trained steps if resuming training\u001b[39;00m\n\u001b[1;32m 1764\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m steps_trained_in_current_epoch \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 1765\u001b[0m steps_trained_in_current_epoch \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/torch/utils/data/dataloader.py:628\u001b[0m, in \u001b[0;36m_BaseDataLoaderIter.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 625\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampler_iter \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 626\u001b[0m \u001b[38;5;66;03m# TODO(https://github.com/pytorch/pytorch/issues/76750)\u001b[39;00m\n\u001b[1;32m 627\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reset() \u001b[38;5;66;03m# type: ignore[call-arg]\u001b[39;00m\n\u001b[0;32m--> 628\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_next_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 630\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dataset_kind \u001b[38;5;241m==\u001b[39m _DatasetKind\u001b[38;5;241m.\u001b[39mIterable \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 632\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called:\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/torch/utils/data/dataloader.py:671\u001b[0m, in \u001b[0;36m_SingleProcessDataLoaderIter._next_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 669\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_next_data\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 670\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_next_index() \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[0;32m--> 671\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset_fetcher\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfetch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# may raise StopIteration\u001b[39;00m\n\u001b[1;32m 672\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory:\n\u001b[1;32m 673\u001b[0m data \u001b[38;5;241m=\u001b[39m _utils\u001b[38;5;241m.\u001b[39mpin_memory\u001b[38;5;241m.\u001b[39mpin_memory(data, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pin_memory_device)\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:58\u001b[0m, in \u001b[0;36m_MapDatasetFetcher.fetch\u001b[0;34m(self, possibly_batched_index)\u001b[0m\n\u001b[1;32m 56\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39m__getitems__(possibly_batched_index)\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 58\u001b[0m data \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[idx] \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m possibly_batched_index]\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 60\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[possibly_batched_index]\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:58\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 56\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39m__getitems__(possibly_batched_index)\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 58\u001b[0m data \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdataset\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m possibly_batched_index]\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 60\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset[possibly_batched_index]\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/datasets/arrow_dataset.py:2601\u001b[0m, in \u001b[0;36mDataset.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2599\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, key): \u001b[38;5;66;03m# noqa: F811\u001b[39;00m\n\u001b[1;32m 2600\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools).\"\"\"\u001b[39;00m\n\u001b[0;32m-> 2601\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_getitem\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2602\u001b[0m \u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2603\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/datasets/arrow_dataset.py:2585\u001b[0m, in \u001b[0;36mDataset._getitem\u001b[0;34m(self, key, **kwargs)\u001b[0m\n\u001b[1;32m 2583\u001b[0m format_kwargs \u001b[38;5;241m=\u001b[39m format_kwargs \u001b[38;5;28;01mif\u001b[39;00m format_kwargs \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m {}\n\u001b[1;32m 2584\u001b[0m formatter \u001b[38;5;241m=\u001b[39m get_formatter(format_type, features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mformat_kwargs)\n\u001b[0;32m-> 2585\u001b[0m pa_subtable \u001b[38;5;241m=\u001b[39m \u001b[43mquery_table\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindices\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_indices\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_indices\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 2586\u001b[0m formatted_output \u001b[38;5;241m=\u001b[39m format_table(\n\u001b[1;32m 2587\u001b[0m pa_subtable, key, formatter\u001b[38;5;241m=\u001b[39mformatter, format_columns\u001b[38;5;241m=\u001b[39mformat_columns, output_all_columns\u001b[38;5;241m=\u001b[39moutput_all_columns\n\u001b[1;32m 2588\u001b[0m )\n\u001b[1;32m 2589\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m formatted_output\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/datasets/formatting/formatting.py:588\u001b[0m, in \u001b[0;36mquery_table\u001b[0;34m(table, key, indices)\u001b[0m\n\u001b[1;32m 586\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 587\u001b[0m size \u001b[38;5;241m=\u001b[39m indices\u001b[38;5;241m.\u001b[39mnum_rows \u001b[38;5;28;01mif\u001b[39;00m indices \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m table\u001b[38;5;241m.\u001b[39mnum_rows\n\u001b[0;32m--> 588\u001b[0m \u001b[43m_check_valid_index_key\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 589\u001b[0m \u001b[38;5;66;03m# Query the main table\u001b[39;00m\n\u001b[1;32m 590\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m indices \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
+ "File \u001b[0;32m~/.conda/envs/whisper_lightning/lib/python3.10/site-packages/datasets/formatting/formatting.py:531\u001b[0m, in \u001b[0;36m_check_valid_index_key\u001b[0;34m(key, size)\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mint\u001b[39m):\n\u001b[1;32m 530\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (key \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m key \u001b[38;5;241m+\u001b[39m size \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m0\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (key \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m size):\n\u001b[0;32m--> 531\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIndexError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid key: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m is out of bounds for size \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msize\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 533\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mslice\u001b[39m):\n",
+ "\u001b[0;31mIndexError\u001b[0m: Invalid key: 90427 is out of bounds for size 0"
+ ]
+ }
+ ],
+ "source": [
+ "class MyLightningModule(pl.LightningModule):\n",
+ " def __init__(self, model_name, learning_rate, weight_decay, batch_size, num_training_steps):\n",
+ " super().__init__()\n",
+ " self.model_name = model_name\n",
+ " self.learning_rate = learning_rate\n",
+ " self.weight_decay = weight_decay\n",
+ " self.batch_size = batch_size\n",
+ " self.num_training_steps = num_training_steps\n",
+ " \n",
+ " # Load the pre-trained model and tokenizer\n",
+ " self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)\n",
+ " self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)\n",
+ "\n",
+ " def forward(self, input_ids, attention_mask, labels=None):\n",
+ " output = self.model(\n",
+ " input_ids=input_ids,\n",
+ " attention_mask=attention_mask,\n",
+ " labels=labels,\n",
+ " )\n",
+ " return output.loss, output.logits\n",
+ " \n",
+ " def training_step(self, batch, batch_idx):\n",
+ " input_ids = batch[\"input_ids\"]\n",
+ " attention_mask = batch[\"attention_mask\"]\n",
+ " labels = batch[\"labels\"]\n",
+ " \n",
+ " loss\n",
+ "\n",
+ "# Define the data collator\n",
+ "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n",
+ "\n",
+ "# Initialize the trainer arguments\n",
+ "training_args = Seq2SeqTrainingArguments(\n",
+ " output_dir=\"./results\",\n",
+ " learning_rate=1e-5,\n",
+ " per_device_train_batch_size=16,\n",
+ " per_device_eval_batch_size=16,\n",
+ " max_steps=5000,\n",
+ " weight_decay=1e-4,\n",
+ " push_to_hub=False,\n",
+ " evaluation_strategy = \"steps\",\n",
+ " eval_steps = 50,\n",
+ " generation_max_length=128,\n",
+ " predict_with_generate=True,\n",
+ " logging_steps=100,\n",
+ " gradient_accumulation_steps=1,\n",
+ " fp16=True,\n",
+ ")\n",
+ "\n",
+ "# Load the ROUGE metric\n",
+ "metric = load_metric(\"rouge\")\n",
+ "\n",
+ "# Define the evaluation function\n",
+ "def compute_metrics(pred):\n",
+ " labels = pred.label_ids\n",
+ " preds = pred.predictions\n",
+ " decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
+ " decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
+ " scores = metric.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=[\"rouge1\"])[\"rouge1\"].mid\n",
+ " return {\"rouge1_precision\": scores.precision, \"rouge1_recall\": scores.recall, \"rouge1_fmeasure\": scores.fmeasure}\n",
+ "\n",
+ "\n",
+ "# Initialize the trainer\n",
+ "trainer = Seq2SeqTrainer(\n",
+ " model=model,\n",
+ " args=training_args,\n",
+ " train_dataset=train_data,\n",
+ " eval_dataset=val_data,\n",
+ " data_collator=data_collator,\n",
+ " tokenizer=tokenizer,\n",
+ " compute_metrics=compute_metrics,\n",
+ ")\n",
+ "\n",
+ "# Start the training\n",
+ "trainer.train()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1b0f9a76",
+ "metadata": {},
+ "source": [
+ "# Steps:\n",
+ "1. Rewrite code to be more general\n",
+ "\n",
+ "a) Data loading should be from disk rather than their load_dataset, and should be on the fly\n",
+ "\n",
+ "b) Rewrite to Lightning code, Trainer etc using Lightning, compute metric fine that we use huggingface"
+ ]
+ },
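+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3b9a7c10",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch for step 1a above (an assumption, not this notebook's final code): read the raw articles\n",
+ "# from a file on disk and tokenize them on the fly in a plain PyTorch Dataset, so nothing is\n",
+ "# pre-tokenized or cached. The file name and JSON layout below are made up for illustration.\n",
+ "import json\n",
+ "\n",
+ "from torch.utils.data import DataLoader, Dataset\n",
+ "\n",
+ "\n",
+ "class OnTheFlySummarizationDataset(Dataset):\n",
+ "    def __init__(self, path, tokenizer, max_source_len=512, max_target_len=128):\n",
+ "        # keep only the raw text in memory; tokenization happens lazily in __getitem__\n",
+ "        with open(path) as f:\n",
+ "            self.examples = json.load(f)  # assumed layout: list of {\"article\": ..., \"highlights\": ...}\n",
+ "        self.tokenizer = tokenizer\n",
+ "        self.max_source_len = max_source_len\n",
+ "        self.max_target_len = max_target_len\n",
+ "\n",
+ "    def __len__(self):\n",
+ "        return len(self.examples)\n",
+ "\n",
+ "    def __getitem__(self, idx):\n",
+ "        ex = self.examples[idx]\n",
+ "        enc = self.tokenizer(ex[\"article\"], truncation=True, max_length=self.max_source_len)\n",
+ "        with self.tokenizer.as_target_tokenizer():\n",
+ "            lab = self.tokenizer(ex[\"highlights\"], truncation=True, max_length=self.max_target_len)\n",
+ "        return {\n",
+ "            \"input_ids\": enc[\"input_ids\"],\n",
+ "            \"attention_mask\": enc[\"attention_mask\"],\n",
+ "            \"labels\": lab[\"input_ids\"],\n",
+ "        }\n",
+ "\n",
+ "\n",
+ "# usage sketch: pad dynamically per batch with the seq2seq collator instead of max_length padding\n",
+ "# train_ds_disk = OnTheFlySummarizationDataset(\"cnn_dailymail_train.json\", tokenizer)\n",
+ "# loader = DataLoader(train_ds_disk, batch_size=16, collate_fn=DataCollatorForSeq2Seq(tokenizer, model=model))"
+ ]
+ },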
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ff03c8bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aafc4b27",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/ML/Pytorch/huggingface/learning.ipynb b/ML/Pytorch/huggingface/learning.ipynb
new file mode 100644
index 0000000..1a9ac79
--- /dev/null
+++ b/ML/Pytorch/huggingface/learning.ipynb
@@ -0,0 +1,580 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fc8e5ea0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "print(torch.cuda.is_available())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d8a1e039",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import pipeline\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6ad73024",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "classifier = pipeline(\"zero-shot-classification\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "04f7e02c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "classifier(\n",
+ " \"This is a course about the Transformers library\",\n",
+ " candidate_labels=[\"machine learning\", \"gym\", \"food\"],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6fb246c2",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "from transformers import pipeline\n",
+ "generator = pipeline(task=\"text-generation\", model=\"bigscience/bloom-1b7\", device=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c4e174f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoModelForTokenClassification, AutoModel, AutoTokenizer\n",
+ "import torch\n",
+ "\n",
+ "# Define input text and pre-trained model checkpoint\n",
+ "text = \"My name is wolfgang and I live in berlin\"\n",
+ "checkpoint = \"Jean-Baptiste/roberta-large-ner-english\"\n",
+ "\n",
+ "# Instantiate tokenizer and encode input text\n",
+ "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
+ "inputs = tokenizer(text, padding=True, truncation=True, return_tensors=\"pt\")\n",
+ "\n",
+ "# Instantiate model and generate output\n",
+ "model = AutoModel.from_pretrained(checkpoint)\n",
+ "outputs = model(**inputs)\n",
+ "print(outputs[0].shape)\n",
+ "\n",
+ "# Instantiate token classification model and generate predictions\n",
+ "model = AutoModelForTokenClassification.from_pretrained(checkpoint)\n",
+ "outputs = model(**inputs)\n",
+ "predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)\n",
+ "print(predictions)\n",
+ "print(model.config.id2label)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8212bbaa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')\n",
+ "model = AutoModelForMaskedLM.from_pretrained(\"xlm-roberta-large\")\n",
+ "\n",
+ "# prepare input\n",
+ "text = \"Replace me by any text you'd like.\"\n",
+ "encoded_input = tokenizer(text, return_tensors='pt')\n",
+ "\n",
+ "# forward pass\n",
+ "output = model(**encoded_input)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "314cba41",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
+ "\n",
+ "# Load the pre-trained tokenizer and model\n",
+ "tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')\n",
+ "model = AutoModelForMaskedLM.from_pretrained(\"xlm-roberta-large\")\n",
+ "\n",
+ "# Define the input sentence with a masked token\n",
+ "text = \"I want to a new car tomorrow.\"\n",
+ "\n",
+ "# Tokenize the input sentence, replacing the masked token with a special [MASK] token\n",
+ "encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')\n",
+ "\n",
+ "print(output.logits.shape)\n",
+ "print(encoded_input['input_ids'][0].tolist().index(tokenizer.mask_token_id))\n",
+ "\n",
+ "# Extract the predicted probabilities for the masked token\n",
+ "predicted_probabilities = output.logits[0, encoded_input['input_ids'][0].tolist().index(tokenizer.mask_token_id)]\n",
+ "predicted_probabilities = torch.nn.functional.softmax(predicted_probabilities, dim=-1)\n",
+ "\n",
+ "# Get the top-k most probable predictions for the masked token\n",
+ "k = 5\n",
+ "top_k = torch.topk(predicted_probabilities, k)\n",
+ "for i in range(k):\n",
+ " token = tokenizer.convert_ids_to_tokens(top_k.indices[i].item())\n",
+ " score = top_k.values[i].item()\n",
+ " print(f\"Prediction {i+1}: '{token}' with probability {score:.5f}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6187e77e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
+ "\n",
+ "sequences = [\n",
+ " \"Using a Transformer network is simple\",\n",
+ " \"The quick brown fox jumps over the lazy dog\",\n",
+ " \"To be or not to be, that is the question\"\n",
+ "]\n",
+ "\n",
+ "# Tokenize the input sequences and convert them to padded and truncated integer token IDs\n",
+ "inputs = tokenizer(\n",
+ " sequences,\n",
+ " padding=True,\n",
+ " truncation=True,\n",
+ " return_tensors=\"pt\"\n",
+ ")\n",
+ "\n",
+ "# Print the resulting input IDs and attention masks\n",
+ "print(inputs['input_ids'])\n",
+ "print(inputs['attention_mask'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fc259c5a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "43466db6",
+ "metadata": {},
+ "source": [
+ "Huggingface:\n",
+ "\n",
+ "1. Understanding how to use the Pipeline (probably most useful) for various tasks, easy to use, and the different subtasks it can do like translation, QA, zero shot, sentiment analysis, token classification, etc. \n",
+ "2. Understood how pipeline works in more detail by using AutoModel for various tasks as well as AutoTokenizer\n",
+ "3. Load dataset\n",
+ "4. How to finetune\n",
+ "5. How to evaluate\n",
+ "6. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "97c474f2",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3ed5d8c2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification\n",
+ "\n",
+ "# Same as before\n",
+ "checkpoint = \"bert-base-uncased\"\n",
+ "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
+ "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
+ "sequences = [\n",
+ " \"I've been waiting for a HuggingFace course my whole life.\",\n",
+ " \"This course is amazing!\",\n",
+ "]\n",
+ "batch = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"pt\")\n",
+ "\n",
+ "# This is new\n",
+ "batch[\"labels\"] = torch.tensor([1, 1])\n",
+ "\n",
+ "optimizer = AdamW(model.parameters())\n",
+ "loss = model(**batch).loss\n",
+ "loss.backward()\n",
+ "optimizer.step()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c598624f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datasets import load_dataset\n",
+ "raw_datasets = load_dataset(\"glue\", \"mrpc\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cd296227",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "raw_train_dataset = raw_datasets[\"train\"]\n",
+ "raw_train_dataset[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e462947a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datasets import load_dataset\n",
+ "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
+ "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
+ "\n",
+ "checkpoint = \"bert-base-uncased\"\n",
+ "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
+ "\n",
+ "def tokenize_function(example):\n",
+ " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)\n",
+ "\n",
+ "\n",
+ "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
+ "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
+ "\n",
+ "\n",
+ "from transformers import TrainingArguments\n",
+ "training_args = TrainingArguments(\"test-trainer\")\n",
+ "\n",
+ "from transformers import AutoModelForSequenceClassification\n",
+ "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
+ "\n",
+ "import numpy as np\n",
+ "import evaluate\n",
+ "\n",
+ "def compute_metrics(eval_preds):\n",
+ " metric = evaluate.load(\"glue\", \"mrpc\")\n",
+ " logits, labels = eval_preds\n",
+ " predictions = np.argmax(logits, axis=-1)\n",
+ " return metric.compute(predictions=predictions, references=labels)\n",
+ "\n",
+ "training_args = TrainingArguments(\"test-trainer\", evaluation_strategy=\"epoch\")\n",
+ "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
+ "\n",
+ "trainer = Trainer(\n",
+ " model,\n",
+ " training_args,\n",
+ " train_dataset=tokenized_datasets[\"train\"],\n",
+ " eval_dataset=tokenized_datasets[\"validation\"],\n",
+ " data_collator=data_collator,\n",
+ " tokenizer=tokenizer,\n",
+ " compute_metrics=compute_metrics,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e2795dc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import TrainingArguments\n",
+ "training_args = TrainingArguments(\"test-trainer\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3af29cd5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoModelForSequenceClassification\n",
+ "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "817f644e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import evaluate"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "42819a6c",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "def compute_metrics(eval_preds):\n",
+ " metric = evaluate.load(\"glue\", \"mrpc\")\n",
+ " logits, labels = eval_preds\n",
+ " predictions = np.argmax(logits, axis=-1)\n",
+ " return metric.compute(predictions=predictions, references=labels)\n",
+ "\n",
+ "training_args = TrainingArguments(\"test-trainer\", evaluation_strategy=\"epoch\")\n",
+ "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
+ "\n",
+ "trainer = Trainer(\n",
+ " model,\n",
+ " training_args,\n",
+ " train_dataset=tokenized_datasets[\"train\"],\n",
+ " eval_dataset=tokenized_datasets[\"validation\"],\n",
+ " data_collator=data_collator,\n",
+ " tokenizer=tokenizer,\n",
+ " compute_metrics=compute_metrics,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eb5986b0",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer\n",
+ "from datasets import load_dataset\n",
+ "batch_size=32\n",
+ "\n",
+ "# Define the generator function to preprocess the data in batches\n",
+ "def preprocess_generator(examples):\n",
+ " for i in range(0, len(examples[\"article\"]), batch_size):\n",
+ " batch = examples[\"article\"][i:i+batch_size]\n",
+ " targets = examples[\"highlights\"][i:i+batch_size]\n",
+ " model_inputs = tokenizer(batch, max_length=512, padding=\"max_length\", truncation=True)\n",
+ " with tokenizer.as_target_tokenizer():\n",
+ " model_targets = tokenizer(targets, max_length=128, padding=\"max_length\", truncation=True)\n",
+ " model_inputs[\"labels\"] = model_targets[\"input_ids\"]\n",
+ " yield model_inputs\n",
+ "\n",
+ "def preprocess_function(examples):\n",
+ " articles = [ex for ex in examples[\"article\"]]\n",
+ " summaries = [ex for ex in examples[\"highlights\"]]\n",
+ "\n",
+ " model_inputs = tokenizer(articles, max_length=512, padding=\"max_length\", truncation=True)\n",
+ " with tokenizer.as_target_tokenizer():\n",
+ " model_targets = tokenizer(summaries, max_length=128, padding=\"max_length\", truncation=True)\n",
+ " \n",
+ " model_inputs[\"labels\"] = model_targets[\"input_ids\"]\n",
+ " return model_inputs\n",
+ " \n",
+ "# Load the dataset\n",
+ "raw_datasets = load_dataset(\"cnn_dailymail\", \"3.0.0\")\n",
+ "preprocessed_datasets = raw_datasets.map(preprocess_function, batched=True, num_proc=4)\n",
+ "\n",
+ "# Load the pre-trained model and tokenizer\n",
+ "model_name = \"t5-small\"\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+ "model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
+ "\n",
+ "# Define the data collator\n",
+ "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n",
+ "\n",
+ "# Initialize the trainer arguments\n",
+ "training_args = Seq2SeqTrainingArguments(\n",
+ " output_dir=\"./results\",\n",
+ " evaluation_strategy = \"epoch\",\n",
+ " learning_rate=2e-5,\n",
+ " per_device_train_batch_size=batch_size,\n",
+ " max_steps=1000,\n",
+ " weight_decay=0.01,\n",
+ " push_to_hub=False,\n",
+ ")\n",
+ "\n",
+ "# Initialize the trainer\n",
+ "trainer = Seq2SeqTrainer(\n",
+ " model=model,\n",
+ " args=training_args,\n",
+ " train_dataset=train_ds,\n",
+ " data_collator=data_collator,\n",
+ " tokenizer=tokenizer,\n",
+ ")\n",
+ "\n",
+ "# Start the training\n",
+ "trainer.train()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7d62583e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datasets import load_metric"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d310a7b3",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "preprocessed_datasets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "99d422cc",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "# Load the pre-trained model and tokenizer\n",
+ "model_name = \"t5-small\"\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+ "model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
+ "\n",
+ "# Define the data collator\n",
+ "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n",
+ "\n",
+ "# Initialize the trainer arguments\n",
+ "training_args = Seq2SeqTrainingArguments(\n",
+ " output_dir=\"./results\",\n",
+ " learning_rate=2e-5,\n",
+ " per_device_train_batch_size=batch_size,\n",
+ " max_steps=5000,\n",
+ " weight_decay=0.01,\n",
+ " push_to_hub=False,\n",
+ " evaluation_strategy = \"steps\",\n",
+ " eval_steps = 50,\n",
+ ")\n",
+ "\n",
+ "# Load the ROUGE metric\n",
+ "metric = load_metric(\"rouge\")\n",
+ "\n",
+ "# Define the evaluation function\n",
+ "def compute_metrics(pred):\n",
+ " labels = pred.label_ids\n",
+ " preds = pred.predictions\n",
+ " \n",
+ " decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
+ " decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
+ " \n",
+ " scores = metric.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=[\"rouge1\"])[\"rouge1\"].mid\n",
+ " \n",
+ " return {\"rouge1_precision\": scores.precision, \"rouge1_recall\": scores.recall, \"rouge1_fmeasure\": scores.fmeasure}\n",
+ "\n",
+ "\n",
+ "# Initialize the trainer\n",
+ "trainer = Seq2SeqTrainer(\n",
+ " model=model,\n",
+ " args=training_args,\n",
+ " train_dataset=preprocessed_datasets[\"train\"],\n",
+ " eval_dataset=preprocessed_datasets[\"validation\"],\n",
+ " data_collator=data_collator,\n",
+ " tokenizer=tokenizer,\n",
+ " compute_metrics=compute_metrics,\n",
+ ")\n",
+ "\n",
+ "# Start the training\n",
+ "trainer.train()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a5e97b57",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install nltk\n",
+ "!pip install rouge_score"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "558c3e66",
+ "metadata": {},
+ "source": [
+ "# Goal:\n",
+ "\n",
+ "1. Implement full training from dataloading (dailycnn dataset), to model training, evaluation, etc, using HF. \n",
+ "* Right now: stuck on on the fly dataset loading, we don't want to cache because this would take a lot of disk space etc.\n",
+ "\n",
+ "2. After we get step 1) working, we want to go deeper on every step, so download the dataset and load it as a custom dataset rather than using huggingface simple API, in order to make it more general. Compare with loading the ds as a custom HF dataset or using pytorch class together with lightning. Speed difference? Convenience? Also we want to use the lightning Trainer so see how we can integrate that. And then compare HF to the lightning + hf model approach and see what we like the most."
+ ]
+ },
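+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7e1f4c2b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch for the on-the-fly / no-caching point in the goal above (an assumption, not a tested fix):\n",
+ "# datasets can stream cnn_dailymail instead of materializing a mapped copy in the cache, provided\n",
+ "# this builder supports streaming in the installed datasets version; map() then runs lazily.\n",
+ "from datasets import load_dataset\n",
+ "\n",
+ "streamed_train = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train\", streaming=True)\n",
+ "streamed_train = streamed_train.map(preprocess_function, batched=True, batch_size=batch_size)\n",
+ "\n",
+ "# peek at one preprocessed example; nothing is written to the datasets cache\n",
+ "first_example = next(iter(streamed_train))\n",
+ "print(list(first_example.keys()))"
+ ]
+ },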
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "624d49ca",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/ML/Pytorch/huggingface/learninghugg.py b/ML/Pytorch/huggingface/learninghugg.py
new file mode 100644
index 0000000..236cf3b
--- /dev/null
+++ b/ML/Pytorch/huggingface/learninghugg.py
@@ -0,0 +1,41 @@
+import numpy as np
+import evaluate
+
+from datasets import load_dataset
+from transformers import AutoTokenizer, DataCollatorWithPadding
+from transformers import Trainer
+
+raw_datasets = load_dataset("glue", "mrpc")
+checkpoint = "bert-base-uncased"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+
+def tokenize_function(example):
+ return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
+
+
+tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+
+from transformers import TrainingArguments
+training_args = TrainingArguments("test-trainer")
+
+from transformers import AutoModelForSequenceClassification
+model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+
+def compute_metrics(eval_preds):
+ metric = evaluate.load("glue", "mrpc")
+ logits, labels = eval_preds
+ predictions = np.argmax(logits, axis=-1)
+ return metric.compute(predictions=predictions, references=labels)
+
+training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
+model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+
+trainer = Trainer(
+ model,
+ training_args,
+ train_dataset=tokenized_datasets["train"],
+ eval_dataset=tokenized_datasets["validation"],
+ data_collator=data_collator,
+ tokenizer=tokenizer,
+ compute_metrics=compute_metrics,
+)
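+
+# The next step in the HF course would be to launch fine-tuning and then evaluate on the MRPC
+# validation split; left commented out so importing this file does not start a training run.
+# trainer.train()
+# trainer.evaluate()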
diff --git a/ML/Pytorch/huggingface/test.py b/ML/Pytorch/huggingface/test.py
new file mode 100644
index 0000000..dd1f9d8
--- /dev/null
+++ b/ML/Pytorch/huggingface/test.py
@@ -0,0 +1,2 @@
+l = ["cat", "dog"]
+sentence = "The quick brown fox jumps over the lazy dog"
diff --git a/ML/Pytorch/more_advanced/VAE/lightning_vae/.scale_batch_size_2121f91a-3045-4142-95c9-9f29857117c1.ckpt b/ML/Pytorch/more_advanced/VAE/lightning_vae/.scale_batch_size_2121f91a-3045-4142-95c9-9f29857117c1.ckpt
new file mode 100644
index 0000000..0f06c13
Binary files /dev/null and b/ML/Pytorch/more_advanced/VAE/lightning_vae/.scale_batch_size_2121f91a-3045-4142-95c9-9f29857117c1.ckpt differ
diff --git a/ML/Pytorch/more_advanced/VAE/lightning_vae/dataset.py b/ML/Pytorch/more_advanced/VAE/lightning_vae/dataset.py
new file mode 100644
index 0000000..f89040c
--- /dev/null
+++ b/ML/Pytorch/more_advanced/VAE/lightning_vae/dataset.py
@@ -0,0 +1,60 @@
+# Imports
+import torch
+import torchvision.datasets as datasets # Standard datasets
+import torchvision.transforms as transforms # Transformations we can perform on our dataset for augmentation
+from torch.utils.data import DataLoader
+import pytorch_lightning as pl
+
+
+class MNISTDataModule(pl.LightningDataModule):
+ def __init__(self, batch_size, num_workers):
+ super().__init__()
+ self.batch_size = batch_size
+ self.num_workers = num_workers
+
+ def setup(self, stage):
+        mnist_full = datasets.MNIST(
+ root="dataset/", train=True, transform=transforms.ToTensor(), download=True
+ )
+ self.mnist_test = datasets.MNIST(
+ root="dataset/", train=False, transform=transforms.ToTensor(), download=True
+ )
+ self.mnist_train, self.mnist_val = torch.utils.data.random_split(
+ mnist_full, [55000, 5000]
+ )
+
+ def train_dataloader(self):
+ return DataLoader(
+ self.mnist_train,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ persistent_workers=True,
+ shuffle=True,
+ )
+
+ def val_dataloader(self):
+ return DataLoader(
+ self.mnist_val,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ persistent_workers=True,
+ shuffle=False,
+ )
+
+ def test_dataloader(self):
+ return DataLoader(
+ self.mnist_test,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ persistent_workers=True,
+ shuffle=False,
+ )
+
+
+# check that it works
+if __name__ == "__main__":
+    dm = MNISTDataModule(batch_size=64, num_workers=2)
+ dm.setup("fit")
+ print(len(dm.mnist_train))
+ print(len(dm.mnist_val))
+ print(len(dm.mnist_test))
diff --git a/ML/Pytorch/more_advanced/VAE/lightning_vae/model.py b/ML/Pytorch/more_advanced/VAE/lightning_vae/model.py
new file mode 100644
index 0000000..3f9dfcd
--- /dev/null
+++ b/ML/Pytorch/more_advanced/VAE/lightning_vae/model.py
@@ -0,0 +1,92 @@
+import torch
+import torchvision
+from torch import nn
+import pytorch_lightning as pl
+
+
+class VAEpl(pl.LightningModule):
+ def __init__(self, lr, input_dim=784, h_dim=200, z_dim=20):
+ super().__init__()
+ self.lr = lr
+ self.loss_fn = nn.BCELoss(reduction="sum")
+ self.input_dim = input_dim
+
+ # encoder
+ self.img_2hid = nn.Linear(input_dim, h_dim)
+ self.hid_2mu = nn.Linear(h_dim, z_dim)
+ self.hid_2sigma = nn.Linear(h_dim, z_dim)
+
+ # decoder
+ self.z_2hid = nn.Linear(z_dim, h_dim)
+ self.hid_2img = nn.Linear(h_dim, input_dim)
+ self.relu = nn.ReLU()
+ self.sigmoid = nn.Sigmoid()
+
+ def encode(self, x):
+ h = self.relu(self.img_2hid(x))
+ mu, sigma = self.hid_2mu(h), self.hid_2sigma(h)
+ return mu, sigma
+
+ def decode(self, z):
+ h = self.relu(self.z_2hid(z))
+ return torch.sigmoid(self.hid_2img(h))
+
+ def forward(self, x):
+ mu, sigma = self.encode(x)
+ epsilon = torch.randn_like(sigma)
+ z_new = mu + sigma * epsilon
+ x_reconstructed = self.decode(z_new)
+ return x_reconstructed, mu, sigma
+
+ def training_step(self, batch, batch_idx):
+ x, _ = batch
+ x = x.view(-1, self.input_dim)
+ x_reconstructed, mu, sigma = self.forward(x)
+ reconstruction_loss = self.loss_fn(x_reconstructed, x)
+        # KL divergence between N(mu, sigma) and N(0, 1); note the textbook form
+        # carries a factor of 0.5, dropped here to match the original VAE train.py
+        kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
+ loss = reconstruction_loss + kl_div
+ self.log("train_loss", loss, sync_dist=True)
+
+        # log x and x_reconstructed to TensorBoard every 100 batches so we can
+        # watch the reconstruction quality improve during training
+ if batch_idx % 100 == 0:
+ # take out the first 8
+ x = x[:8]
+ x_reconstructed = x_reconstructed[:8]
+ grid = torchvision.utils.make_grid(x_reconstructed.view(-1, 1, 28, 28))
+ self.logger.experiment.add_image("reconstructed", grid, self.global_step)
+ grid = torchvision.utils.make_grid(x.view(-1, 1, 28, 28))
+ self.logger.experiment.add_image("original", grid, self.global_step)
+ return loss
+
+ def validation_step(self, batch, batch_idx):
+ x, _ = batch
+ x = x.view(-1, self.input_dim)
+ x_reconstructed, mu, sigma = self.forward(x)
+ reconstruction_loss = self.loss_fn(x_reconstructed, x)
+ kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
+ loss = reconstruction_loss + kl_div
+ self.log("val_loss", loss, sync_dist=True)
+ return loss
+
+ def test_step(self, batch, batch_idx):
+ x, _ = batch
+ x = x.view(-1, self.input_dim)
+ x_reconstructed, mu, sigma = self.forward(x)
+ reconstruction_loss = self.loss_fn(x_reconstructed, x)
+ kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
+ loss = reconstruction_loss + kl_div
+ self.log("test_loss", loss, sync_dist=True)
+ return loss
+
+ def configure_optimizers(self):
+ optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
+ return optimizer
+
+
+if __name__ == "__main__":
+ batch_size = 8
+ x = torch.randn(batch_size, 28 * 28 * 1)
+    vae_pl = VAEpl(lr=3e-4)
+ x_reconstructed, mu, sigma = vae_pl(x)
+ print(x_reconstructed.shape)
diff --git a/ML/Pytorch/more_advanced/VAE/lightning_vae/train.py b/ML/Pytorch/more_advanced/VAE/lightning_vae/train.py
new file mode 100644
index 0000000..de00c19
--- /dev/null
+++ b/ML/Pytorch/more_advanced/VAE/lightning_vae/train.py
@@ -0,0 +1,49 @@
+import torch
+import torchvision.datasets as datasets # Standard datasets
+from tqdm import tqdm
+from torch import nn, optim
+from torchvision import transforms
+from torchvision.utils import save_image
+from torch.utils.data import DataLoader
+from dataset import MNISTDataModule
+import pytorch_lightning as pl
+from model import VAEpl
+from pytorch_lightning.loggers import TensorBoardLogger
+from pytorch_lightning.strategies import DeepSpeedStrategy
+torch.set_float32_matmul_precision("medium")
+
+"""
+GOALS:
+* Understand the strategies (deepspeed, ddp, etc.) and how to use them
+* Set up a config for the scheduler etc. instead of configuring it in each sub-module
+* Metrics
+"""
+
+
+# hyperparameters and training setup
+lr = 3e-4
+batch_size = 128
+num_workers = 2
+model = VAEpl(lr)
+dm = MNISTDataModule(batch_size, num_workers)
+logger = TensorBoardLogger("my_checkpoint", name="scheduler_autolr_vae_pl_model")
+
+# Callbacks: learning rate monitor and model checkpoint. A plateau scheduler belongs
+# in configure_optimizers rather than in the callback list; a sketch follows after this file.
+callbacks = [
+    pl.callbacks.LearningRateMonitor(logging_interval="step"),
+    pl.callbacks.ModelCheckpoint(monitor="val_loss", save_top_k=1, mode="min", save_last=True),
+]
+
+if __name__ == "__main__":
+ trainer = pl.Trainer(
+ max_epochs=100,
+ accelerator="gpu",
+ devices=2,
+ logger=logger,
+ #precision=16,
+        strategy=DeepSpeedStrategy(
+            stage=0,
+        ),
+        callbacks=callbacks,
+    )
+
+ #trainer.tune(model, dm)
+ trainer.fit(model, dm)
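The GOALS block above mentions moving scheduler setup into the configuration. As a rough sketch (my addition, not part of the original VAEpl), configure_optimizers could return a ReduceLROnPlateau scheduler keyed to the logged val_loss, which the LearningRateMonitor callback above would then track:

# sketch of a drop-in replacement for VAEpl.configure_optimizers in model.py
# (torch is already imported there)
def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.1, patience=5
    )
    return {
        "optimizer": optimizer,
        "lr_scheduler": {"scheduler": scheduler, "monitor": "val_loss"},
    }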
diff --git a/ML/Pytorch/more_advanced/VAE/lightning_vae/utils.py b/ML/Pytorch/more_advanced/VAE/lightning_vae/utils.py
new file mode 100644
index 0000000..37d7856
--- /dev/null
+++ b/ML/Pytorch/more_advanced/VAE/lightning_vae/utils.py
@@ -0,0 +1,41 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# import save_image from torchvision.utils
+from torchvision.utils import save_image
+
+
+def inference(model, dataset, digit, num_examples=1):
+ """
+ Generates (num_examples) of a particular digit.
+ Specifically we extract an example of each digit,
+ then after we have the mu, sigma representation for
+ each digit we can sample from that.
+
+ After we sample we can run the decoder part of the VAE
+ and generate examples.
+ """
+ images = []
+ idx = 0
+ for x, y in dataset:
+ if y == idx:
+ images.append(x)
+ idx += 1
+ if idx == 10:
+ break
+
+ encodings_digit = []
+ for d in range(10):
+ with torch.no_grad():
+ mu, sigma = model.encode(images[d].view(1, 784))
+ encodings_digit.append((mu, sigma))
+
+ mu, sigma = encodings_digit[digit]
+ for example in range(num_examples):
+ epsilon = torch.randn_like(sigma)
+ z = mu + sigma * epsilon
+ out = model.decode(z)
+ out = out.view(-1, 1, 28, 28)
+ save_image(out, f"generated_{digit}_ex{example}.png")
+
+
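One possible way to exercise inference(), e.g. from a __main__ guard in utils.py. The checkpoint path is a placeholder, and lr must be passed explicitly because VAEpl never calls save_hyperparameters():

import torchvision.datasets as datasets
import torchvision.transforms as transforms

from model import VAEpl

model = VAEpl.load_from_checkpoint("path/to/checkpoint.ckpt", lr=3e-4)  # path is illustrative
test_set = datasets.MNIST(
    root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
for digit in range(10):
    inference(model, test_set, digit, num_examples=3)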
diff --git a/ML/Pytorch/more_advanced/VAE/train.py b/ML/Pytorch/more_advanced/VAE/train.py
index eb0949f..64993fe 100644
--- a/ML/Pytorch/more_advanced/VAE/train.py
+++ b/ML/Pytorch/more_advanced/VAE/train.py
@@ -23,27 +23,7 @@ model = VariationalAutoEncoder(INPUT_DIM, H_DIM, Z_DIM).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR_RATE)
loss_fn = nn.BCELoss(reduction="sum")
-# Start Training
-for epoch in range(NUM_EPOCHS):
- loop = tqdm(enumerate(train_loader))
- for i, (x, _) in loop:
- # Forward pass
- x = x.to(DEVICE).view(x.shape[0], INPUT_DIM)
- x_reconstructed, mu, sigma = model(x)
- # Compute loss
- reconstruction_loss = loss_fn(x_reconstructed, x)
- kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
-
- # Backprop
- loss = reconstruction_loss + kl_div
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()
- loop.set_postfix(loss=loss.item())
-
-
-model = model.to("cpu")
def inference(digit, num_examples=1):
"""
Generates (num_examples) of a particular digit.
@@ -79,8 +59,3 @@ def inference(digit, num_examples=1):
for idx in range(10):
inference(idx, num_examples=5)
-
-
-
-
-
diff --git a/ML/Pytorch/more_advanced/finetuning_whisper/dataset.py b/ML/Pytorch/more_advanced/finetuning_whisper/dataset.py
new file mode 100644
index 0000000..666087b
--- /dev/null
+++ b/ML/Pytorch/more_advanced/finetuning_whisper/dataset.py
@@ -0,0 +1,120 @@
+"""
+Create a PyTorch Custom dataset that loads file in data/other.tsv that contains
+the path to image audio and text transcription.
+"""
+import pytorch_lightning as pl
+from tqdm import tqdm
+import ffmpeg
+import os
+import torch
+import numpy as np
+from torch.utils.data import Dataset, DataLoader
+import pandas as pd
+from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
+import sys
+
+class CommonVoice(Dataset):
+ def __init__(self, data_dir, whisper_model="tiny"):
+ self.sampling_rate = 16_000
+ self.data_dir = data_dir
+ self.data = pd.read_csv(
+ os.path.join(data_dir, "other.tsv"),
+ sep="\t",
+ )
+ self.feature_extractor = WhisperFeatureExtractor.from_pretrained(
+ f"openai/whisper-{whisper_model}"
+ )
+ self.tokenizer = WhisperTokenizer.from_pretrained(
+ f"openai/whisper-{whisper_model}", language="sv", task="transcribe"
+ )
+
+ def __len__(self):
+ return len(self.data)
+
+ def __getitem__(self, idx):
+        audio_file_path = os.path.join(
+            self.data_dir, "clips", self.data.iloc[idx]["path"]
+        )
+ sentence = self.data.iloc[idx]["sentence"]
+ text = self.tokenizer(sentence).input_ids
+
+        # decode the clip with ffmpeg to 16 kHz mono 16-bit PCM
+        out, _ = (
+            ffmpeg.input(audio_file_path, threads=0)
+            .output(
+                "-", format="s16le", acodec="pcm_s16le", ac=1, ar=self.sampling_rate
+            )
+            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+        )
+        # convert the raw PCM bytes to float32 in [-1, 1]
+        out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+
+ # run feature extractor
+ audio_features = self.feature_extractor(
+ out, sampling_rate=self.sampling_rate, return_tensors="pt"
+ )
+
+ return audio_features, text
+
+
+# Create a collator that will pad the audio features and text labels
+class DataCollatorSpeechSeq2SeqWithPadding:
+ def __init__(self, feature_extractor, tokenizer):
+ self.feature_extractor = feature_extractor
+ self.tokenizer = tokenizer
+
+ def __call__(self, batch):
+ text_features = [{"input_ids": x[1]} for x in batch]
+ batch_text = self.tokenizer.pad(
+ text_features, return_tensors="pt",
+ )
+ audio_features = [{"input_features": x[0]["input_features"]} for x in batch]
+
+ batch_audio = self.feature_extractor.pad(
+ audio_features, return_tensors="pt",
+ )
+ batch_text["input_ids"] = batch_text["input_ids"].masked_fill(
+ batch_text["attention_mask"].ne(1), -100
+ )
+
+ batch_audio["input_features"] = batch_audio["input_features"].squeeze(1)
+
+ labels = batch_text["input_ids"].clone()
+ if (labels[:, 0] == self.tokenizer.encode("")[0]).all().cpu().item():
+ labels = labels[:, 1:]
+
+ batch_text["labels"] = labels
+ return batch_audio, batch_text
+
+
+# Put into a lightning datamodule
+class WhisperDataset(pl.LightningDataModule):
+ def __init__(self, data_dir, batch_size=32, num_workers=0, whisper_model="tiny"):
+ super().__init__()
+ self.data_dir = data_dir
+ self.batch_size = batch_size
+ self.num_workers = num_workers
+ self.whisper_model = whisper_model
+ self.sampling_rate = 16_000
+
+ def setup(self, stage=None):
+ self.dataset = CommonVoice(self.data_dir, self.whisper_model)
+ self.data_collator = DataCollatorSpeechSeq2SeqWithPadding(
+ self.dataset.feature_extractor, self.dataset.tokenizer
+ )
+
+ def train_dataloader(self):
+ return DataLoader(
+ self.dataset,
+ batch_size=self.batch_size,
+ shuffle=True,
+ num_workers=self.num_workers,
+ collate_fn=self.data_collator,
+ )
+
+
+# Test that the Lightning datamodule works as intended
+if __name__ == "__main__":
+    dm = WhisperDataset(data_dir="data/")
+    dm.setup()
+    for batch in tqdm(dm.train_dataloader()):
+        pass
diff --git a/ML/Pytorch/more_advanced/finetuning_whisper/model.py b/ML/Pytorch/more_advanced/finetuning_whisper/model.py
new file mode 100644
index 0000000..9e5fcac
--- /dev/null
+++ b/ML/Pytorch/more_advanced/finetuning_whisper/model.py
@@ -0,0 +1,34 @@
+import torch
+import torchvision
+from torch import nn
+import pytorch_lightning as pl
+from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
+from transformers import WhisperForConditionalGeneration
+
+
+class WhisperFinetuning(pl.LightningModule):
+ def __init__(self, lr, whisper_model="tiny"):
+ super().__init__()
+ self.lr = lr
+ self.model = WhisperForConditionalGeneration.from_pretrained(f"openai/whisper-{whisper_model}")
+ self.model.config.forced_decoder_ids = None
+ self.model.config.suppress_tokens = []
+
+ def training_step(self, batch, batch_idx):
+ encoder_input = batch[0]["input_features"]
+ decoder_labels = batch[1]["labels"]
+
+ out = self.model(
+ input_features=encoder_input,
+ labels=decoder_labels,
+ )
+ loss = out["loss"]
+ return loss
+
+ def configure_optimizers(self):
+ optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
+ return optimizer
+
+
+if __name__ == "__main__":
+ pass
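The Trainer-based script in whisper.py reports word error rate, while this Lightning module only returns the training loss. A possible extension is sketched below; it is entirely my own: the subclass name, the tokenizer held on the module, and the assumption that the DataModule gains a val_dataloader are not in the original code. It mirrors the compute_metrics logic from whisper.py.

import evaluate
from transformers import WhisperTokenizer


class WhisperFinetuningWithWER(WhisperFinetuning):  # hypothetical subclass
    def __init__(self, lr, whisper_model="tiny"):
        super().__init__(lr, whisper_model)
        self.tokenizer = WhisperTokenizer.from_pretrained(
            f"openai/whisper-{whisper_model}", language="sv", task="transcribe"
        )
        self.wer = evaluate.load("wer")

    def validation_step(self, batch, batch_idx):
        encoder_input = batch[0]["input_features"]
        labels = batch[1]["labels"]
        out = self.model(input_features=encoder_input, labels=labels)
        self.log("val_loss", out["loss"], prog_bar=True)

        # greedy-decode and compare against the references, as whisper.py does
        pred_ids = self.model.generate(encoder_input)
        labels = labels.masked_fill(labels == -100, self.tokenizer.pad_token_id)
        wer = self.wer.compute(
            predictions=self.tokenizer.batch_decode(pred_ids, skip_special_tokens=True),
            references=self.tokenizer.batch_decode(labels, skip_special_tokens=True),
        )
        self.log("val_wer", wer, prog_bar=True)
        return out["loss"]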
diff --git a/ML/Pytorch/more_advanced/finetuning_whisper/steps.txt b/ML/Pytorch/more_advanced/finetuning_whisper/steps.txt
new file mode 100644
index 0000000..f2a4ea8
--- /dev/null
+++ b/ML/Pytorch/more_advanced/finetuning_whisper/steps.txt
@@ -0,0 +1,9 @@
+Goal: rewrite the Hugging Face Whisper finetuning code to use PyTorch Lightning
+1. load the dataset using a Lightning datamodule
+* either integrate Hugging Face's data loading, or write it ourselves inside the datamodule
+2. load the model using a LightningModule
+3. train the model using the Lightning Trainer
+(4. see if we can use sharded training with the Lightning Trainer to finetune a larger Whisper model
+than would fit on a single GPU)
+
+End goal: Finetune the model on our own dataset for some cool application
diff --git a/ML/Pytorch/more_advanced/finetuning_whisper/test.py b/ML/Pytorch/more_advanced/finetuning_whisper/test.py
new file mode 100644
index 0000000..9684feb
--- /dev/null
+++ b/ML/Pytorch/more_advanced/finetuning_whisper/test.py
@@ -0,0 +1,7 @@
+from transformers import WhisperTokenizer
+tokenizer = WhisperTokenizer.from_pretrained(
+    "openai/whisper-tiny", task="transcribe"
+)
+encoded_string = tokenizer.encode("")[0]
+print(encoded_string) # should print 50258
+print(tokenizer.bos_token_id) # should print 50257
diff --git a/ML/Pytorch/more_advanced/finetuning_whisper/train.py b/ML/Pytorch/more_advanced/finetuning_whisper/train.py
new file mode 100644
index 0000000..1d85a54
--- /dev/null
+++ b/ML/Pytorch/more_advanced/finetuning_whisper/train.py
@@ -0,0 +1,31 @@
+import torch
+import torchvision.datasets as datasets # Standard datasets
+from tqdm import tqdm
+from torch import nn, optim
+from torchvision import transforms
+from torchvision.utils import save_image
+from torch.utils.data import DataLoader
+import pytorch_lightning as pl
+from model import WhisperFinetuning
+from dataset import WhisperDataset
+from pytorch_lightning.loggers import TensorBoardLogger
+from pytorch_lightning.strategies import DeepSpeedStrategy
+torch.set_float32_matmul_precision("medium")
+
+# hyperparameters and training setup
+lr = 1e-5
+batch_size = 32
+num_workers = 4
+model = WhisperFinetuning(lr)
+dm = WhisperDataset(data_dir="data/", batch_size=batch_size, num_workers=num_workers)
+
+if __name__ == "__main__":
+ trainer = pl.Trainer(
+ max_epochs=1000,
+ accelerator="gpu",
+ devices=[0],
+ precision=16,
+ )
+
+ trainer.fit(model, dm)
+
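steps.txt leaves open whether sharded training could make a larger Whisper checkpoint fit. One possible configuration, reusing model and dm from the script above together with the currently unused DeepSpeedStrategy import, would be (untested sketch):

trainer = pl.Trainer(
    max_epochs=1000,
    accelerator="gpu",
    devices=2,                            # shard across two GPUs instead of one
    precision=16,
    strategy=DeepSpeedStrategy(stage=2),  # ZeRO stage 2: partitions optimizer state and gradients
)
trainer.fit(model, dm)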
diff --git a/ML/Pytorch/more_advanced/finetuning_whisper/whisper.py b/ML/Pytorch/more_advanced/finetuning_whisper/whisper.py
new file mode 100644
index 0000000..ae98a83
--- /dev/null
+++ b/ML/Pytorch/more_advanced/finetuning_whisper/whisper.py
@@ -0,0 +1,181 @@
+import evaluate
+from transformers import Seq2SeqTrainer
+from transformers import WhisperForConditionalGeneration
+import torch
+from dataclasses import dataclass
+from typing import Any, Dict, List, Union
+from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
+from datasets import load_dataset, DatasetDict, Audio
+# set so we only can see first cuda device
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+
+common_voice = DatasetDict()
+common_voice["train"] = load_dataset(
+ "mozilla-foundation/common_voice_11_0",
+ "sv-SE",
+ split="train+validation",
+ use_auth_token=False,
+)
+common_voice["test"] = load_dataset(
+ "mozilla-foundation/common_voice_11_0",
+ "sv-SE",
+ split="test",
+ use_auth_token=False,
+)
+
+# common_voice = common_voice.remove_columns(
+# [
+# "accent",
+# "age",
+# "client_id",
+# "down_votes",
+# "gender",
+# "locale",
+# "path",
+# "segment",
+# "up_votes",
+# ]
+# )
+
+feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
+tokenizer = WhisperTokenizer.from_pretrained(
+ "openai/whisper-tiny", language="sv", task="transcribe"
+)
+
+input_str = common_voice["train"][0]["sentence"]
+labels = tokenizer(input_str).input_ids
+decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
+decoded_str = tokenizer.decode(labels, skip_special_tokens=True)
+
+print(f"Input: {input_str}")
+print(f"Decoded w/ special: {decoded_with_special}")
+print(f"Decoded w/out special: {decoded_str}")
+print(f"Are equal: {input_str == decoded_str}")
+
+processor = WhisperProcessor.from_pretrained(
+    "openai/whisper-tiny", language="sv", task="transcribe"
+)
+
+common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
+
+
+def prepare_dataset(example):
+ # load and resample audio data from 48 to 16kHz
+ audio = example["audio"]
+
+ # compute log-Mel input features from input audio array
+ example["input_features"] = feature_extractor(
+ audio["array"], sampling_rate=audio["sampling_rate"]
+ ).input_features[0]
+
+ # encode target text to label ids
+ example["labels"] = tokenizer(example["sentence"]).input_ids
+ return example
+
+
+common_voice = common_voice.map(prepare_dataset, num_proc=8)
+
+
+@dataclass
+class DataCollatorSpeechSeq2SeqWithPadding:
+ processor: Any
+
+ def __call__(
+ self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
+ ) -> Dict[str, torch.Tensor]:
+ # split inputs and labels since they have to be of different lengths
+ # and need different padding methods first treat the audio inputs by
+ # simply returning torch tensors
+ input_features = [
+ {"input_features": feature["input_features"]} for feature in features
+ ]
+ batch = self.processor.feature_extractor.pad(
+ input_features, return_tensors="pt"
+ )
+
+ # get the tokenized label sequences
+ label_features = [{"input_ids": feature["labels"]} for feature in features]
+ # pad the labels to max length
+ labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
+
+ # replace padding with -100 to ignore loss correctly
+ labels = labels_batch["input_ids"].masked_fill(
+ labels_batch.attention_mask.ne(1), -100
+ )
+
+ # if bos token is appended in previous tokenization step,
+ # cut bos token here as it's append later anyways
+ if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
+ labels = labels[:, 1:]
+
+ batch["labels"] = labels
+ return batch
+
+
+data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
+metric = evaluate.load("wer")
+
+
+def compute_metrics(pred):
+ pred_ids = pred.predictions
+ label_ids = pred.label_ids
+
+ # replace -100 with the pad_token_id
+ label_ids[label_ids == -100] = tokenizer.pad_token_id
+
+ # we do not want to group tokens when computing the metrics
+ pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
+ label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
+ wer = 100 * metric.compute(predictions=pred_str, references=label_str)
+
+ return {"wer": wer}
+
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
+model.config.forced_decoder_ids = None
+model.config.suppress_tokens = []
+
+from transformers import Seq2SeqTrainingArguments
+
+training_args = Seq2SeqTrainingArguments(
+ output_dir="./whisper-tiny-swedish", # change to a repo name of your choice
+ per_device_train_batch_size=32,
+ gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size
+ learning_rate=1e-5,
+ warmup_steps=500,
+ max_steps=4000,
+ gradient_checkpointing=False,
+ fp16=True,
+ evaluation_strategy="steps",
+ per_device_eval_batch_size=8,
+ predict_with_generate=True,
+ generation_max_length=225,
+ save_steps=1000,
+ eval_steps=1000,
+ logging_steps=25,
+ report_to=["tensorboard"],
+ load_best_model_at_end=True,
+ metric_for_best_model="wer",
+ greater_is_better=False,
+ push_to_hub=False,
+ dataloader_num_workers=0,
+)
+
+
+trainer = Seq2SeqTrainer(
+ args=training_args,
+ model=model,
+ train_dataset=common_voice["train"],
+ eval_dataset=common_voice["test"],
+ data_collator=data_collator,
+ compute_metrics=compute_metrics,
+ tokenizer=processor.feature_extractor,
+)
+
+trainer.train()
diff --git a/ML/Pytorch/pytorch_lightning/1. start code/simple_fc.py b/ML/Pytorch/pytorch_lightning/1. start code/simple_fc.py
new file mode 100644
index 0000000..84b8700
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/1. start code/simple_fc.py
@@ -0,0 +1,110 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from torch.utils.data import random_split
+
+
+class NN(nn.Module):
+ def __init__(self, input_size, num_classes):
+ super().__init__()
+ self.fc1 = nn.Linear(input_size, 50)
+ self.fc2 = nn.Linear(50, num_classes)
+
+ def forward(self, x):
+ x = F.relu(self.fc1(x))
+ x = self.fc2(x)
+ return x
+
+
+# Set device cuda for GPU if it's available otherwise run on the CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Hyperparameters
+input_size = 784
+num_classes = 10
+learning_rate = 0.001
+batch_size = 64
+num_epochs = 3
+
+# Load Data
+entire_dataset = datasets.MNIST(
+ root="dataset/", train=True, transform=transforms.ToTensor(), download=True
+)
+train_ds, val_ds = random_split(entire_dataset, [50000, 10000])
+test_ds = datasets.MNIST(
+ root="dataset/", train=False, transform=transforms.ToTensor(), download=True
+)
+train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
+val_loader = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False)
+test_loader = DataLoader(dataset=test_ds, batch_size=batch_size, shuffle=False)
+
+# Initialize network
+model = NN(input_size=input_size, num_classes=num_classes).to(device)
+
+# Loss and optimizer
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+# Train Network
+for epoch in range(num_epochs):
+ for batch_idx, (data, targets) in enumerate(tqdm(train_loader)):
+ # Get data to cuda if possible
+ data = data.to(device=device)
+ targets = targets.to(device=device)
+
+ # Get to correct shape
+ data = data.reshape(data.shape[0], -1)
+
+ # Forward
+ scores = model(data)
+ loss = criterion(scores, targets)
+
+ # Backward
+ optimizer.zero_grad()
+ loss.backward()
+
+ # Gradient descent or adam step
+ optimizer.step()
+
+
+# Check accuracy on training & test to see how good our model is
+def check_accuracy(loader, model):
+ num_correct = 0
+ num_samples = 0
+ model.eval()
+
+ # We don't need to keep track of gradients here so we wrap it in torch.no_grad()
+ with torch.no_grad():
+ # Loop through the data
+ for x, y in loader:
+
+ # Move data to device
+ x = x.to(device=device)
+ y = y.to(device=device)
+
+ # Get to correct shape
+ x = x.reshape(x.shape[0], -1)
+
+ # Forward pass
+ scores = model(x)
+ _, predictions = scores.max(1)
+
+ # Check how many we got correct
+ num_correct += (predictions == y).sum()
+
+ # Keep track of number of samples
+ num_samples += predictions.size(0)
+
+ model.train()
+ return num_correct / num_samples
+
+
+# Check accuracy on training & test to see how good our model is
+model.to(device)
+print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
+print(f"Accuracy on validation set: {check_accuracy(val_loader, model)*100:.2f}")
+print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")
diff --git a/ML/Pytorch/pytorch_lightning/10. Multi-GPU/callbacks.py b/ML/Pytorch/pytorch_lightning/10. Multi-GPU/callbacks.py
new file mode 100644
index 0000000..18eb930
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/10. Multi-GPU/callbacks.py
@@ -0,0 +1,12 @@
+from pytorch_lightning.callbacks import EarlyStopping, Callback
+
+class MyPrintingCallback(Callback):
+ def __init__(self):
+ super().__init__()
+
+ def on_train_start(self, trainer, pl_module):
+ print("Starting to train!")
+
+ def on_train_end(self, trainer, pl_module):
+ print("Training is done.")
+
diff --git a/ML/Pytorch/pytorch_lightning/10. Multi-GPU/config.py b/ML/Pytorch/pytorch_lightning/10. Multi-GPU/config.py
new file mode 100644
index 0000000..59ce2b0
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/10. Multi-GPU/config.py
@@ -0,0 +1,15 @@
+# Training hyperparameters
+INPUT_SIZE = 784
+NUM_CLASSES = 10
+LEARNING_RATE = 0.001
+BATCH_SIZE = 64
+NUM_EPOCHS = 3
+
+# Dataset
+DATA_DIR = "dataset/"
+NUM_WORKERS = 4
+
+# Compute related
+ACCELERATOR = "gpu"
+DEVICES = [0, 1]
+PRECISION = 16
diff --git a/ML/Pytorch/pytorch_lightning/10. Multi-GPU/dataset.py b/ML/Pytorch/pytorch_lightning/10. Multi-GPU/dataset.py
new file mode 100644
index 0000000..14a7d86
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/10. Multi-GPU/dataset.py
@@ -0,0 +1,64 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from torch.utils.data import random_split
+import pytorch_lightning as pl
+from torchvision.transforms import RandomHorizontalFlip, RandomVerticalFlip
+
+
+class MnistDataModule(pl.LightningDataModule):
+ def __init__(self, data_dir, batch_size, num_workers):
+ super().__init__()
+ self.data_dir = data_dir
+ self.batch_size = batch_size
+ self.num_workers = num_workers
+
+ def prepare_data(self):
+ datasets.MNIST(self.data_dir, train=True, download=True)
+ datasets.MNIST(self.data_dir, train=False, download=True)
+
+ def setup(self, stage):
+ entire_dataset = datasets.MNIST(
+ root=self.data_dir,
+ train=True,
+ transform=transforms.Compose([
+ transforms.RandomVerticalFlip(),
+ transforms.RandomHorizontalFlip(),
+ transforms.ToTensor(),
+ ]),
+ download=False,
+ )
+ self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
+ self.test_ds = datasets.MNIST(
+ root=self.data_dir,
+ train=False,
+ transform=transforms.ToTensor(),
+ download=False,
+ )
+
+ def train_dataloader(self):
+ return DataLoader(
+ self.train_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=True,
+ )
+
+ def val_dataloader(self):
+ return DataLoader(
+ self.val_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=False,
+ )
+
+ def test_dataloader(self):
+ return DataLoader(
+ self.test_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=False,
+ )
diff --git a/ML/Pytorch/pytorch_lightning/10. Multi-GPU/model.py b/ML/Pytorch/pytorch_lightning/10. Multi-GPU/model.py
new file mode 100644
index 0000000..f06da7c
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/10. Multi-GPU/model.py
@@ -0,0 +1,89 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+import pytorch_lightning as pl
+import torchmetrics
+from torchmetrics import Metric
+import torchvision
+
+
+class NN(pl.LightningModule):
+ def __init__(self, input_size, learning_rate, num_classes):
+ super().__init__()
+ self.lr = learning_rate
+ self.fc1 = nn.Linear(input_size, 1_000_000)
+ self.fc2 = nn.Linear(1_000_000, num_classes)
+ self.loss_fn = nn.CrossEntropyLoss()
+ self.accuracy = torchmetrics.Accuracy(
+ task="multiclass", num_classes=num_classes
+ )
+ self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
+
+ def forward(self, x):
+ x = F.relu(self.fc1(x))
+ x = self.fc2(x)
+ return x
+
+ def training_step(self, batch, batch_idx):
+ x, y = batch
+ loss, scores, y = self._common_step(batch, batch_idx)
+
+ self.log_dict(
+ {
+ "train_loss": loss,
+ },
+ on_step=False,
+ on_epoch=True,
+ prog_bar=True,
+ )
+
+ if batch_idx % 100 == 0:
+ x = x[:8]
+ grid = torchvision.utils.make_grid(x.view(-1, 1, 28, 28))
+ self.logger.experiment.add_image("mnist_images", grid, self.global_step)
+
+ return {"loss": loss, "scores": scores, "y": y}
+
+ def training_epoch_end(self, outputs):
+ scores = torch.cat([x["scores"] for x in outputs])
+ y = torch.cat([x["y"] for x in outputs])
+ self.log_dict(
+ {
+ "train_acc": self.accuracy(scores, y),
+ "train_f1": self.f1_score(scores, y),
+ },
+ on_step=False,
+ on_epoch=True,
+ prog_bar=True,
+ )
+
+ def validation_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log("val_loss", loss)
+ return loss
+
+ def test_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log("test_loss", loss)
+ return loss
+
+ def _common_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ loss = self.loss_fn(scores, y)
+ return loss, scores, y
+
+ def predict_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ preds = torch.argmax(scores, dim=1)
+ return preds
+
+ def configure_optimizers(self):
+ return optim.Adam(self.parameters(), lr=self.lr)
diff --git a/ML/Pytorch/pytorch_lightning/10. Multi-GPU/train.py b/ML/Pytorch/pytorch_lightning/10. Multi-GPU/train.py
new file mode 100644
index 0000000..92c8e5b
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/10. Multi-GPU/train.py
@@ -0,0 +1,43 @@
+import torch
+import pytorch_lightning as pl
+from model import NN
+from dataset import MnistDataModule
+import config
+from callbacks import MyPrintingCallback, EarlyStopping
+from pytorch_lightning.loggers import TensorBoardLogger
+from pytorch_lightning.profilers import PyTorchProfiler
+from pytorch_lightning.strategies import DeepSpeedStrategy
+
+torch.set_float32_matmul_precision("medium") # to make lightning happy
+
+if __name__ == "__main__":
+ logger = TensorBoardLogger("tb_logs", name="mnist_model_v1")
+ strategy = DeepSpeedStrategy()
+ profiler = PyTorchProfiler(
+ on_trace_ready=torch.profiler.tensorboard_trace_handler("tb_logs/profiler0"),
+ schedule=torch.profiler.schedule(skip_first=10, wait=1, warmup=1, active=20),
+ )
+ model = NN(
+ input_size=config.INPUT_SIZE,
+ learning_rate=config.LEARNING_RATE,
+ num_classes=config.NUM_CLASSES,
+ )
+ dm = MnistDataModule(
+ data_dir=config.DATA_DIR,
+ batch_size=config.BATCH_SIZE,
+ num_workers=config.NUM_WORKERS,
+ )
+ trainer = pl.Trainer(
+ strategy=strategy,
+ profiler=profiler,
+ logger=logger,
+ accelerator=config.ACCELERATOR,
+ devices=config.DEVICES,
+ min_epochs=1,
+ max_epochs=config.NUM_EPOCHS,
+ precision=config.PRECISION,
+ callbacks=[MyPrintingCallback(), EarlyStopping(monitor="val_loss")],
+ )
+ trainer.fit(model, dm)
+ trainer.validate(model, dm)
+ trainer.test(model, dm)
diff --git a/ML/Pytorch/pytorch_lightning/2. LightningModule/simple_fc.py b/ML/Pytorch/pytorch_lightning/2. LightningModule/simple_fc.py
new file mode 100644
index 0000000..aa769a2
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/2. LightningModule/simple_fc.py
@@ -0,0 +1,154 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from torch.utils.data import random_split
+import pytorch_lightning as pl
+
+# plain nn.Module version kept for comparison; note that the LightningModule of the
+# same name defined below shadows it
+class NN(nn.Module):
+ def __init__(self, input_size, num_classes):
+ super().__init__()
+ self.fc1 = nn.Linear(input_size, 50)
+ self.fc2 = nn.Linear(50, num_classes)
+
+ def forward(self, x):
+ x = F.relu(self.fc1(x))
+ x = self.fc2(x)
+ return x
+
+class NN(pl.LightningModule):
+ def __init__(self, input_size, num_classes):
+ super().__init__()
+ self.fc1 = nn.Linear(input_size, 50)
+ self.fc2 = nn.Linear(50, num_classes)
+ self.loss_fn = nn.CrossEntropyLoss()
+
+ def forward(self, x):
+ x = F.relu(self.fc1(x))
+ x = self.fc2(x)
+ return x
+
+ def training_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log('train_loss', loss)
+ return loss
+
+ def validation_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log('val_loss', loss)
+ return loss
+
+ def test_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log('test_loss', loss)
+ return loss
+
+ def _common_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ loss = self.loss_fn(scores, y)
+ return loss, scores, y
+
+ def predict_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ preds = torch.argmax(scores, dim=1)
+ return preds
+
+ def configure_optimizers(self):
+ return optim.Adam(self.parameters(), lr=0.001)
+
+# Set device cuda for GPU if it's available otherwise run on the CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Hyperparameters
+
+input_size = 784
+num_classes = 10
+learning_rate = 0.001
+batch_size = 64
+num_epochs = 3
+
+# Load Data
+entire_dataset = datasets.MNIST(
+ root="dataset/", train=True, transform=transforms.ToTensor(), download=True
+)
+train_ds, val_ds = random_split(entire_dataset, [50000, 10000])
+test_ds = datasets.MNIST(
+ root="dataset/", train=False, transform=transforms.ToTensor(), download=True
+)
+train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
+val_loader = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False)
+test_loader = DataLoader(dataset=test_ds, batch_size=batch_size, shuffle=False)
+
+# Initialize network
+model = NN(input_size=input_size, num_classes=num_classes).to(device)
+
+# Loss and optimizer
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+# Train Network
+for epoch in range(num_epochs):
+ for batch_idx, (data, targets) in enumerate(tqdm(train_loader)):
+ # Get data to cuda if possible
+ data = data.to(device=device)
+ targets = targets.to(device=device)
+
+ # Get to correct shape
+ data = data.reshape(data.shape[0], -1)
+
+ # Forward
+ scores = model(data)
+ loss = criterion(scores, targets)
+
+ # Backward
+ optimizer.zero_grad()
+ loss.backward()
+
+ # Gradient descent or adam step
+ optimizer.step()
+
+
+# Check accuracy on training & test to see how good our model is
+def check_accuracy(loader, model):
+ num_correct = 0
+ num_samples = 0
+ model.eval()
+
+ # We don't need to keep track of gradients here so we wrap it in torch.no_grad()
+ with torch.no_grad():
+ # Loop through the data
+ for x, y in loader:
+
+ # Move data to device
+ x = x.to(device=device)
+ y = y.to(device=device)
+
+ # Get to correct shape
+ x = x.reshape(x.shape[0], -1)
+
+ # Forward pass
+ scores = model(x)
+ _, predictions = scores.max(1)
+
+ # Check how many we got correct
+ num_correct += (predictions == y).sum()
+
+ # Keep track of number of samples
+ num_samples += predictions.size(0)
+
+ model.train()
+ return num_correct / num_samples
+
+
+# Check accuracy on training & test to see how good our model is
+model.to(device)
+print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
+print(f"Accuracy on validation set: {check_accuracy(val_loader, model)*100:.2f}")
+print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")
diff --git a/ML/Pytorch/pytorch_lightning/3. Lightning Trainer/simple_fc.py b/ML/Pytorch/pytorch_lightning/3. Lightning Trainer/simple_fc.py
new file mode 100644
index 0000000..6bdcef3
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/3. Lightning Trainer/simple_fc.py
@@ -0,0 +1,126 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from torch.utils.data import random_split
+import pytorch_lightning as pl
+
+
+class NN(pl.LightningModule):
+ def __init__(self, input_size, num_classes):
+ super().__init__()
+ self.fc1 = nn.Linear(input_size, 50)
+ self.fc2 = nn.Linear(50, num_classes)
+ self.loss_fn = nn.CrossEntropyLoss()
+
+ def forward(self, x):
+ x = F.relu(self.fc1(x))
+ x = self.fc2(x)
+ return x
+
+ def training_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log('train_loss', loss)
+ return loss
+
+ def validation_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log('val_loss', loss)
+ return loss
+
+ def test_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log('test_loss', loss)
+ return loss
+
+ def _common_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ loss = self.loss_fn(scores, y)
+ return loss, scores, y
+
+ def predict_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ preds = torch.argmax(scores, dim=1)
+ return preds
+
+ def configure_optimizers(self):
+ return optim.Adam(self.parameters(), lr=0.001)
+
+# Set device cuda for GPU if it's available otherwise run on the CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Hyperparameters
+input_size = 784
+num_classes = 10
+learning_rate = 0.001
+batch_size = 64
+num_epochs = 3
+
+# Load Data
+entire_dataset = datasets.MNIST(
+ root="dataset/", train=True, transform=transforms.ToTensor(), download=True
+)
+train_ds, val_ds = random_split(entire_dataset, [50000, 10000])
+test_ds = datasets.MNIST(
+ root="dataset/", train=False, transform=transforms.ToTensor(), download=True
+)
+train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
+val_loader = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False)
+test_loader = DataLoader(dataset=test_ds, batch_size=batch_size, shuffle=False)
+
+# Initialize network
+model = NN(input_size=input_size, num_classes=num_classes).to(device)
+
+# Loss and optimizer
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+trainer = pl.Trainer(accelerator="gpu", devices=1, min_epochs=1, max_epochs=3, precision=16)
+trainer.fit(model, train_loader, val_loader)
+trainer.validate(model, val_loader)
+trainer.test(model, test_loader)
+
+# Check accuracy on training & test to see how good our model is
+def check_accuracy(loader, model):
+ num_correct = 0
+ num_samples = 0
+ model.eval()
+
+ # We don't need to keep track of gradients here so we wrap it in torch.no_grad()
+ with torch.no_grad():
+ # Loop through the data
+ for x, y in loader:
+
+ # Move data to device
+ x = x.to(device=device)
+ y = y.to(device=device)
+
+ # Get to correct shape
+ x = x.reshape(x.shape[0], -1)
+
+ # Forward pass
+ scores = model(x)
+ _, predictions = scores.max(1)
+
+ # Check how many we got correct
+ num_correct += (predictions == y).sum()
+
+ # Keep track of number of samples
+ num_samples += predictions.size(0)
+
+ model.train()
+ return num_correct / num_samples
+
+
+# Check accuracy on training & test to see how good our model is
+model.to(device)
+print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
+print(f"Accuracy on validation set: {check_accuracy(val_loader, model)*100:.2f}")
+print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")
diff --git a/ML/Pytorch/pytorch_lightning/4. Metrics/simple_fc.py b/ML/Pytorch/pytorch_lightning/4. Metrics/simple_fc.py
new file mode 100644
index 0000000..0d5bb46
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/4. Metrics/simple_fc.py
@@ -0,0 +1,150 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from torch.utils.data import random_split
+import pytorch_lightning as pl
+import torchmetrics
+from torchmetrics import Metric
+
+
+class MyAccuracy(Metric):
+ def __init__(self):
+ super().__init__()
+ self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum")
+ self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum")
+
+ def update(self, preds, target):
+ preds = torch.argmax(preds, dim=1)
+ assert preds.shape == target.shape
+ self.correct += torch.sum(preds == target)
+ self.total += target.numel()
+
+ def compute(self):
+ return self.correct.float() / self.total.float()
+
+
+class NN(pl.LightningModule):
+ def __init__(self, input_size, num_classes):
+ super().__init__()
+ self.fc1 = nn.Linear(input_size, 50)
+ self.fc2 = nn.Linear(50, num_classes)
+ self.loss_fn = nn.CrossEntropyLoss()
+ self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
+ self.my_accuracy = MyAccuracy()
+ self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
+
+ def forward(self, x):
+ x = F.relu(self.fc1(x))
+ x = self.fc2(x)
+ return x
+
+ def training_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ accuracy = self.my_accuracy(scores, y)
+ f1_score = self.f1_score(scores, y)
+ self.log_dict({'train_loss': loss, 'train_accuracy': accuracy, 'train_f1_score': f1_score},
+ on_step=False, on_epoch=True, prog_bar=True)
+ return {'loss': loss, "scores": scores, "y": y}
+
+ def validation_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log('val_loss', loss)
+ return loss
+
+ def test_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log('test_loss', loss)
+ return loss
+
+ def _common_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ loss = self.loss_fn(scores, y)
+ return loss, scores, y
+
+ def predict_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ preds = torch.argmax(scores, dim=1)
+ return preds
+
+ def configure_optimizers(self):
+ return optim.Adam(self.parameters(), lr=0.001)
+
+# Set device cuda for GPU if it's available otherwise run on the CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Hyperparameters
+input_size = 784
+num_classes = 10
+learning_rate = 0.001
+batch_size = 64
+num_epochs = 3
+
+# Load Data
+entire_dataset = datasets.MNIST(
+ root="dataset/", train=True, transform=transforms.ToTensor(), download=True
+)
+train_ds, val_ds = random_split(entire_dataset, [50000, 10000])
+test_ds = datasets.MNIST(
+ root="dataset/", train=False, transform=transforms.ToTensor(), download=True
+)
+train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
+val_loader = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False)
+test_loader = DataLoader(dataset=test_ds, batch_size=batch_size, shuffle=False)
+
+# Initialize network
+model = NN(input_size=input_size, num_classes=num_classes).to(device)
+
+# Loss and optimizer
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+trainer = pl.Trainer(accelerator="gpu", devices=1, min_epochs=1, max_epochs=3, precision=16)
+trainer.fit(model, train_loader, val_loader)
+trainer.validate(model, val_loader)
+trainer.test(model, test_loader)
+
+# Check accuracy on training & test to see how good our model is
+def check_accuracy(loader, model):
+ num_correct = 0
+ num_samples = 0
+ model.eval()
+
+ # We don't need to keep track of gradients here so we wrap it in torch.no_grad()
+ with torch.no_grad():
+ # Loop through the data
+ for x, y in loader:
+
+ # Move data to device
+ x = x.to(device=device)
+ y = y.to(device=device)
+
+ # Get to correct shape
+ x = x.reshape(x.shape[0], -1)
+
+ # Forward pass
+ scores = model(x)
+ _, predictions = scores.max(1)
+
+ # Check how many we got correct
+ num_correct += (predictions == y).sum()
+
+ # Keep track of number of samples
+ num_samples += predictions.size(0)
+
+ model.train()
+ return num_correct / num_samples
+
+
+# Check accuracy on training & test to see how good our model is
+model.to(device)
+print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
+print(f"Accuracy on validation set: {check_accuracy(val_loader, model)*100:.2f}")
+print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")
diff --git a/ML/Pytorch/pytorch_lightning/5. DataModule/simple_fc.py b/ML/Pytorch/pytorch_lightning/5. DataModule/simple_fc.py
new file mode 100644
index 0000000..255fa5d
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/5. DataModule/simple_fc.py
@@ -0,0 +1,146 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from torch.utils.data import random_split
+import pytorch_lightning as pl
+import torchmetrics
+from torchmetrics import Metric
+
+
+class MyAccuracy(Metric):
+ def __init__(self):
+ super().__init__()
+ self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum")
+ self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum")
+
+ def update(self, preds, target):
+ preds = torch.argmax(preds, dim=1)
+ assert preds.shape == target.shape
+ self.correct += torch.sum(preds == target)
+ self.total += target.numel()
+
+ def compute(self):
+ return self.correct.float() / self.total.float()
+
+
+class NN(pl.LightningModule):
+ def __init__(self, input_size, num_classes):
+ super().__init__()
+ self.fc1 = nn.Linear(input_size, 50)
+ self.fc2 = nn.Linear(50, num_classes)
+ self.loss_fn = nn.CrossEntropyLoss()
+ self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
+ self.my_accuracy = MyAccuracy()
+ self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
+
+ def forward(self, x):
+ x = F.relu(self.fc1(x))
+ x = self.fc2(x)
+ return x
+
+ def training_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ accuracy = self.my_accuracy(scores, y)
+ f1_score = self.f1_score(scores, y)
+ self.log_dict({'train_loss': loss, 'train_accuracy': accuracy, 'train_f1_score': f1_score},
+ on_step=False, on_epoch=True, prog_bar=True)
+ return {'loss': loss, "scores": scores, "y": y}
+
+ def validation_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log('val_loss', loss)
+ return loss
+
+ def test_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log('test_loss', loss)
+ return loss
+
+ def _common_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ loss = self.loss_fn(scores, y)
+ return loss, scores, y
+
+ def predict_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ preds = torch.argmax(scores, dim=1)
+ return preds
+
+ def configure_optimizers(self):
+ return optim.Adam(self.parameters(), lr=0.001)
+
+
+class MnistDataModule(pl.LightningDataModule):
+ def __init__(self, data_dir, batch_size, num_workers):
+ super().__init__()
+ self.data_dir = data_dir
+ self.batch_size = batch_size
+ self.num_workers = num_workers
+
+ def prepare_data(self):
+ datasets.MNIST(self.data_dir, train=True, download=True)
+ datasets.MNIST(self.data_dir, train=False, download=True)
+
+ def setup(self, stage):
+ entire_dataset = datasets.MNIST(
+ root=self.data_dir,
+ train=True,
+ transform=transforms.ToTensor(),
+ download=False,
+ )
+ self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
+ self.test_ds = datasets.MNIST(
+ root=self.data_dir,
+ train=False,
+ transform=transforms.ToTensor(),
+ download=False,
+ )
+
+ def train_dataloader(self):
+ return DataLoader(
+ self.train_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=True,
+ )
+
+ def val_dataloader(self):
+ return DataLoader(
+ self.val_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=False,
+ )
+
+ def test_dataloader(self):
+ return DataLoader(
+ self.test_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=False,
+ )
+
+# Set device cuda for GPU if it's available otherwise run on the CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Hyperparameters
+input_size = 784
+num_classes = 10
+learning_rate = 0.001
+batch_size = 64
+num_epochs = 3
+
+model = NN(input_size=input_size, num_classes=num_classes)
+dm = MnistDataModule(data_dir="dataset/", batch_size=batch_size, num_workers=4)
+trainer = pl.Trainer(accelerator="gpu", devices=1, min_epochs=1, max_epochs=3, precision=16)
+trainer.fit(model, dm)
+trainer.validate(model, dm)
+trainer.test(model, dm)
diff --git a/ML/Pytorch/pytorch_lightning/6. Restructuring/config.py b/ML/Pytorch/pytorch_lightning/6. Restructuring/config.py
new file mode 100644
index 0000000..de0978c
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/6. Restructuring/config.py
@@ -0,0 +1,15 @@
+# Training hyperparameters
+INPUT_SIZE = 784
+NUM_CLASSES = 10
+LEARNING_RATE = 0.001
+BATCH_SIZE = 64
+NUM_EPOCHS = 3
+
+# Dataset
+DATA_DIR = "dataset/"
+NUM_WORKERS = 4
+
+# Compute related
+ACCELERATOR = "gpu"
+DEVICES = [0]
+PRECISION = 16
diff --git a/ML/Pytorch/pytorch_lightning/6. Restructuring/dataset.py b/ML/Pytorch/pytorch_lightning/6. Restructuring/dataset.py
new file mode 100644
index 0000000..82e64ce
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/6. Restructuring/dataset.py
@@ -0,0 +1,59 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from torch.utils.data import random_split
+import pytorch_lightning as pl
+
+
+class MnistDataModule(pl.LightningDataModule):
+ def __init__(self, data_dir, batch_size, num_workers):
+ super().__init__()
+ self.data_dir = data_dir
+ self.batch_size = batch_size
+ self.num_workers = num_workers
+
+ def prepare_data(self):
+ datasets.MNIST(self.data_dir, train=True, download=True)
+ datasets.MNIST(self.data_dir, train=False, download=True)
+
+ def setup(self, stage):
+ entire_dataset = datasets.MNIST(
+ root=self.data_dir,
+ train=True,
+ transform=transforms.ToTensor(),
+ download=False,
+ )
+ self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
+ self.test_ds = datasets.MNIST(
+ root=self.data_dir,
+ train=False,
+ transform=transforms.ToTensor(),
+ download=False,
+ )
+
+ def train_dataloader(self):
+ return DataLoader(
+ self.train_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=True,
+ )
+
+ def val_dataloader(self):
+ return DataLoader(
+ self.val_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=False,
+ )
+
+ def test_dataloader(self):
+ return DataLoader(
+ self.test_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=False,
+ )
diff --git a/ML/Pytorch/pytorch_lightning/6. Restructuring/model.py b/ML/Pytorch/pytorch_lightning/6. Restructuring/model.py
new file mode 100644
index 0000000..8d299db
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/6. Restructuring/model.py
@@ -0,0 +1,71 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+import pytorch_lightning as pl
+import torchmetrics
+from torchmetrics import Metric
+
+
+class NN(pl.LightningModule):
+ def __init__(self, input_size, learning_rate, num_classes):
+ super().__init__()
+ self.lr = learning_rate
+ self.fc1 = nn.Linear(input_size, 50)
+ self.fc2 = nn.Linear(50, num_classes)
+ self.loss_fn = nn.CrossEntropyLoss()
+ self.accuracy = torchmetrics.Accuracy(
+ task="multiclass", num_classes=num_classes
+ )
+ self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
+
+ def forward(self, x):
+ x = F.relu(self.fc1(x))
+ x = self.fc2(x)
+ return x
+
+ def training_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ accuracy = self.accuracy(scores, y)
+ f1_score = self.f1_score(scores, y)
+ self.log_dict(
+ {
+ "train_loss": loss,
+ "train_accuracy": accuracy,
+ "train_f1_score": f1_score,
+ },
+ on_step=False,
+ on_epoch=True,
+ prog_bar=True,
+ )
+ return {"loss": loss, "scores": scores, "y": y}
+
+ def validation_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log("val_loss", loss)
+ return loss
+
+ def test_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log("test_loss", loss)
+ return loss
+
+ def _common_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ loss = self.loss_fn(scores, y)
+ return loss, scores, y
+
+ def predict_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ preds = torch.argmax(scores, dim=1)
+ return preds
+
+ def configure_optimizers(self):
+ return optim.Adam(self.parameters(), lr=self.lr)
diff --git a/ML/Pytorch/pytorch_lightning/6. Restructuring/train.py b/ML/Pytorch/pytorch_lightning/6. Restructuring/train.py
new file mode 100644
index 0000000..b691fdf
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/6. Restructuring/train.py
@@ -0,0 +1,27 @@
+import torch
+import pytorch_lightning as pl
+from model import NN
+from dataset import MnistDataModule
+import config
+
+if __name__ == "__main__":
+ model = NN(
+ input_size=config.INPUT_SIZE,
+ learning_rate=config.LEARNING_RATE,
+ num_classes=config.NUM_CLASSES,
+ )
+ dm = MnistDataModule(
+ data_dir=config.DATA_DIR,
+ batch_size=config.BATCH_SIZE,
+ num_workers=config.NUM_WORKERS,
+ )
+ trainer = pl.Trainer(
+ accelerator=config.ACCELERATOR,
+ devices=config.DEVICES,
+ min_epochs=1,
+ max_epochs=3,
+ precision=config.PRECISION,
+ )
+ trainer.fit(model, dm)
+ trainer.validate(model, dm)
+ trainer.test(model, dm)
diff --git a/ML/Pytorch/pytorch_lightning/7. Callbacks/callbacks.py b/ML/Pytorch/pytorch_lightning/7. Callbacks/callbacks.py
new file mode 100644
index 0000000..18eb930
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/7. Callbacks/callbacks.py
@@ -0,0 +1,12 @@
+from pytorch_lightning.callbacks import EarlyStopping, Callback
+
+class MyPrintingCallback(Callback):
+ def __init__(self):
+ super().__init__()
+
+ def on_train_start(self, trainer, pl_module):
+ print("Starting to train!")
+
+ def on_train_end(self, trainer, pl_module):
+ print("Training is done.")
+
diff --git a/ML/Pytorch/pytorch_lightning/7. Callbacks/config.py b/ML/Pytorch/pytorch_lightning/7. Callbacks/config.py
new file mode 100644
index 0000000..d3472b7
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/7. Callbacks/config.py
@@ -0,0 +1,15 @@
+# Training hyperparameters
+INPUT_SIZE = 784
+NUM_CLASSES = 10
+LEARNING_RATE = 0.001
+BATCH_SIZE = 64
+NUM_EPOCHS = 1000
+
+# Dataset
+DATA_DIR = "dataset/"
+NUM_WORKERS = 4
+
+# Compute related
+ACCELERATOR = "gpu"
+DEVICES = [0]
+PRECISION = 16
diff --git a/ML/Pytorch/pytorch_lightning/7. Callbacks/dataset.py b/ML/Pytorch/pytorch_lightning/7. Callbacks/dataset.py
new file mode 100644
index 0000000..82e64ce
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/7. Callbacks/dataset.py
@@ -0,0 +1,59 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from torch.utils.data import random_split
+import pytorch_lightning as pl
+
+
+class MnistDataModule(pl.LightningDataModule):
+ def __init__(self, data_dir, batch_size, num_workers):
+ super().__init__()
+ self.data_dir = data_dir
+ self.batch_size = batch_size
+ self.num_workers = num_workers
+
+ def prepare_data(self):
+ datasets.MNIST(self.data_dir, train=True, download=True)
+ datasets.MNIST(self.data_dir, train=False, download=True)
+
+ def setup(self, stage):
+ entire_dataset = datasets.MNIST(
+ root=self.data_dir,
+ train=True,
+ transform=transforms.ToTensor(),
+ download=False,
+ )
+ self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
+ self.test_ds = datasets.MNIST(
+ root=self.data_dir,
+ train=False,
+ transform=transforms.ToTensor(),
+ download=False,
+ )
+
+ def train_dataloader(self):
+ return DataLoader(
+ self.train_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=True,
+ )
+
+ def val_dataloader(self):
+ return DataLoader(
+ self.val_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=False,
+ )
+
+ def test_dataloader(self):
+ return DataLoader(
+ self.test_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=False,
+ )
diff --git a/ML/Pytorch/pytorch_lightning/7. Callbacks/model.py b/ML/Pytorch/pytorch_lightning/7. Callbacks/model.py
new file mode 100644
index 0000000..8d299db
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/7. Callbacks/model.py
@@ -0,0 +1,71 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+import pytorch_lightning as pl
+import torchmetrics
+from torchmetrics import Metric
+
+
+class NN(pl.LightningModule):
+ def __init__(self, input_size, learning_rate, num_classes):
+ super().__init__()
+ self.lr = learning_rate
+ self.fc1 = nn.Linear(input_size, 50)
+ self.fc2 = nn.Linear(50, num_classes)
+ self.loss_fn = nn.CrossEntropyLoss()
+ self.accuracy = torchmetrics.Accuracy(
+ task="multiclass", num_classes=num_classes
+ )
+ self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
+
+ def forward(self, x):
+ x = F.relu(self.fc1(x))
+ x = self.fc2(x)
+ return x
+
+ def training_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ accuracy = self.accuracy(scores, y)
+ f1_score = self.f1_score(scores, y)
+ self.log_dict(
+ {
+ "train_loss": loss,
+ "train_accuracy": accuracy,
+ "train_f1_score": f1_score,
+ },
+ on_step=False,
+ on_epoch=True,
+ prog_bar=True,
+ )
+ return {"loss": loss, "scores": scores, "y": y}
+
+ def validation_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log("val_loss", loss)
+ return loss
+
+ def test_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log("test_loss", loss)
+ return loss
+
+ def _common_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ loss = self.loss_fn(scores, y)
+ return loss, scores, y
+
+ def predict_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ preds = torch.argmax(scores, dim=1)
+ return preds
+
+ def configure_optimizers(self):
+ return optim.Adam(self.parameters(), lr=self.lr)
diff --git a/ML/Pytorch/pytorch_lightning/7. Callbacks/train.py b/ML/Pytorch/pytorch_lightning/7. Callbacks/train.py
new file mode 100644
index 0000000..4b0c576
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/7. Callbacks/train.py
@@ -0,0 +1,31 @@
+import torch
+import pytorch_lightning as pl
+from model import NN
+from dataset import MnistDataModule
+import config
+from callbacks import MyPrintingCallback, EarlyStopping
+
+torch.set_float32_matmul_precision("medium") # to make lightning happy
+
+if __name__ == "__main__":
+ model = NN(
+ input_size=config.INPUT_SIZE,
+ learning_rate=config.LEARNING_RATE,
+ num_classes=config.NUM_CLASSES,
+ )
+ dm = MnistDataModule(
+ data_dir=config.DATA_DIR,
+ batch_size=config.BATCH_SIZE,
+ num_workers=config.NUM_WORKERS,
+ )
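+    # EarlyStopping watches the "val_loss" value logged in validation_step and
+    # stops training once it stops improving (default patience: 3 validation
+    # checks); MyPrintingCallback just prints at train start and end.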
+ trainer = pl.Trainer(
+ accelerator=config.ACCELERATOR,
+ devices=config.DEVICES,
+ min_epochs=1,
+ max_epochs=config.NUM_EPOCHS,
+ precision=config.PRECISION,
+ callbacks=[MyPrintingCallback(), EarlyStopping(monitor="val_loss")],
+ )
+ trainer.fit(model, dm)
+ trainer.validate(model, dm)
+ trainer.test(model, dm)
diff --git a/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/callbacks.py b/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/callbacks.py
new file mode 100644
index 0000000..18eb930
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/callbacks.py
@@ -0,0 +1,12 @@
+from pytorch_lightning.callbacks import EarlyStopping, Callback
+
+class MyPrintingCallback(Callback):
+ def __init__(self):
+ super().__init__()
+
+ def on_train_start(self, trainer, pl_module):
+ print("Starting to train!")
+
+ def on_train_end(self, trainer, pl_module):
+ print("Training is done.")
+
diff --git a/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/config.py b/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/config.py
new file mode 100644
index 0000000..d3472b7
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/config.py
@@ -0,0 +1,15 @@
+# Training hyperparameters
+INPUT_SIZE = 784
+NUM_CLASSES = 10
+LEARNING_RATE = 0.001
+BATCH_SIZE = 64
+NUM_EPOCHS = 1000
+
+# Dataset
+DATA_DIR = "dataset/"
+NUM_WORKERS = 4
+
+# Compute related
+ACCELERATOR = "gpu"
+DEVICES = [0]
+PRECISION = 16
diff --git a/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/dataset.py b/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/dataset.py
new file mode 100644
index 0000000..14a7d86
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/dataset.py
@@ -0,0 +1,64 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from torch.utils.data import random_split
+import pytorch_lightning as pl
+from torchvision.transforms import RandomHorizontalFlip, RandomVerticalFlip
+
+
+class MnistDataModule(pl.LightningDataModule):
+ def __init__(self, data_dir, batch_size, num_workers):
+ super().__init__()
+ self.data_dir = data_dir
+ self.batch_size = batch_size
+ self.num_workers = num_workers
+
+ def prepare_data(self):
+ datasets.MNIST(self.data_dir, train=True, download=True)
+ datasets.MNIST(self.data_dir, train=False, download=True)
+
+ def setup(self, stage):
+ entire_dataset = datasets.MNIST(
+ root=self.data_dir,
+ train=True,
+ transform=transforms.Compose([
+ transforms.RandomVerticalFlip(),
+ transforms.RandomHorizontalFlip(),
+ transforms.ToTensor(),
+ ]),
+ download=False,
+ )
+ self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
+ self.test_ds = datasets.MNIST(
+ root=self.data_dir,
+ train=False,
+ transform=transforms.ToTensor(),
+ download=False,
+ )
+
+ def train_dataloader(self):
+ return DataLoader(
+ self.train_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=True,
+ )
+
+ def val_dataloader(self):
+ return DataLoader(
+ self.val_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=False,
+ )
+
+ def test_dataloader(self):
+ return DataLoader(
+ self.test_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=False,
+ )
diff --git a/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/model.py b/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/model.py
new file mode 100644
index 0000000..7c8b332
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/model.py
@@ -0,0 +1,79 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+import pytorch_lightning as pl
+import torchmetrics
+from torchmetrics import Metric
+import torchvision
+
+
+class NN(pl.LightningModule):
+ def __init__(self, input_size, learning_rate, num_classes):
+ super().__init__()
+ self.lr = learning_rate
+ self.fc1 = nn.Linear(input_size, 50)
+ self.fc2 = nn.Linear(50, num_classes)
+ self.loss_fn = nn.CrossEntropyLoss()
+ self.accuracy = torchmetrics.Accuracy(
+ task="multiclass", num_classes=num_classes
+ )
+ self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
+
+ def forward(self, x):
+ x = F.relu(self.fc1(x))
+ x = self.fc2(x)
+ return x
+
+ def training_step(self, batch, batch_idx):
+ x, y = batch
+ loss, scores, y = self._common_step(batch, batch_idx)
+ accuracy = self.accuracy(scores, y)
+ f1_score = self.f1_score(scores, y)
+ self.log_dict(
+ {
+ "train_loss": loss,
+ "train_accuracy": accuracy,
+ "train_f1_score": f1_score,
+ },
+ on_step=False,
+ on_epoch=True,
+ prog_bar=True,
+ )
+
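+        # Every 100 batches, log the first 8 input images of the batch as an
+        # image grid under "mnist_images" in TensorBoard.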
+ if batch_idx % 100 == 0:
+ x = x[:8]
+ grid = torchvision.utils.make_grid(x.view(-1, 1, 28, 28))
+ self.logger.experiment.add_image("mnist_images", grid, self.global_step)
+
+ return {"loss": loss, "scores": scores, "y": y}
+
+ def validation_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log("val_loss", loss)
+ return loss
+
+ def test_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log("test_loss", loss)
+ return loss
+
+ def _common_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ loss = self.loss_fn(scores, y)
+ return loss, scores, y
+
+ def predict_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ preds = torch.argmax(scores, dim=1)
+ return preds
+
+ def configure_optimizers(self):
+ return optim.Adam(self.parameters(), lr=self.lr)
diff --git a/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/train.py b/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/train.py
new file mode 100644
index 0000000..23f4bca
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/8. Logging Tensorboard/train.py
@@ -0,0 +1,34 @@
+import torch
+import pytorch_lightning as pl
+from model import NN
+from dataset import MnistDataModule
+import config
+from callbacks import MyPrintingCallback, EarlyStopping
+from pytorch_lightning.loggers import TensorBoardLogger
+
+torch.set_float32_matmul_precision("medium") # to make lightning happy
+
+if __name__ == "__main__":
+ logger = TensorBoardLogger("tb_logs", name="mnist_model_v0")
+ model = NN(
+ input_size=config.INPUT_SIZE,
+ learning_rate=config.LEARNING_RATE,
+ num_classes=config.NUM_CLASSES,
+ )
+ dm = MnistDataModule(
+ data_dir=config.DATA_DIR,
+ batch_size=config.BATCH_SIZE,
+ num_workers=config.NUM_WORKERS,
+ )
+ trainer = pl.Trainer(
+ logger=logger,
+ accelerator=config.ACCELERATOR,
+ devices=config.DEVICES,
+ min_epochs=1,
+ max_epochs=config.NUM_EPOCHS,
+ precision=config.PRECISION,
+ callbacks=[MyPrintingCallback(), EarlyStopping(monitor="val_loss")],
+ )
+ trainer.fit(model, dm)
+ trainer.validate(model, dm)
+ trainer.test(model, dm)
diff --git a/ML/Pytorch/pytorch_lightning/9. Profiler/callbacks.py b/ML/Pytorch/pytorch_lightning/9. Profiler/callbacks.py
new file mode 100644
index 0000000..18eb930
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/9. Profiler/callbacks.py
@@ -0,0 +1,12 @@
+from pytorch_lightning.callbacks import EarlyStopping, Callback
+
+class MyPrintingCallback(Callback):
+ def __init__(self):
+ super().__init__()
+
+ def on_train_start(self, trainer, pl_module):
+ print("Starting to train!")
+
+ def on_train_end(self, trainer, pl_module):
+ print("Training is done.")
+
diff --git a/ML/Pytorch/pytorch_lightning/9. Profiler/config.py b/ML/Pytorch/pytorch_lightning/9. Profiler/config.py
new file mode 100644
index 0000000..de0978c
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/9. Profiler/config.py
@@ -0,0 +1,15 @@
+# Training hyperparameters
+INPUT_SIZE = 784
+NUM_CLASSES = 10
+LEARNING_RATE = 0.001
+BATCH_SIZE = 64
+NUM_EPOCHS = 3
+
+# Dataset
+DATA_DIR = "dataset/"
+NUM_WORKERS = 4
+
+# Compute related
+ACCELERATOR = "gpu"
+DEVICES = [0]
+PRECISION = 16
diff --git a/ML/Pytorch/pytorch_lightning/9. Profiler/dataset.py b/ML/Pytorch/pytorch_lightning/9. Profiler/dataset.py
new file mode 100644
index 0000000..14a7d86
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/9. Profiler/dataset.py
@@ -0,0 +1,64 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from torch.utils.data import random_split
+import pytorch_lightning as pl
+from torchvision.transforms import RandomHorizontalFlip, RandomVerticalFlip
+
+
+class MnistDataModule(pl.LightningDataModule):
+ def __init__(self, data_dir, batch_size, num_workers):
+ super().__init__()
+ self.data_dir = data_dir
+ self.batch_size = batch_size
+ self.num_workers = num_workers
+
+ def prepare_data(self):
+ datasets.MNIST(self.data_dir, train=True, download=True)
+ datasets.MNIST(self.data_dir, train=False, download=True)
+
+ def setup(self, stage):
+ entire_dataset = datasets.MNIST(
+ root=self.data_dir,
+ train=True,
+ transform=transforms.Compose([
+ transforms.RandomVerticalFlip(),
+ transforms.RandomHorizontalFlip(),
+ transforms.ToTensor(),
+ ]),
+ download=False,
+ )
+ self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
+ self.test_ds = datasets.MNIST(
+ root=self.data_dir,
+ train=False,
+ transform=transforms.ToTensor(),
+ download=False,
+ )
+
+ def train_dataloader(self):
+ return DataLoader(
+ self.train_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=True,
+ )
+
+ def val_dataloader(self):
+ return DataLoader(
+ self.val_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=False,
+ )
+
+ def test_dataloader(self):
+ return DataLoader(
+ self.test_ds,
+ batch_size=self.batch_size,
+ num_workers=self.num_workers,
+ shuffle=False,
+ )
diff --git a/ML/Pytorch/pytorch_lightning/9. Profiler/model.py b/ML/Pytorch/pytorch_lightning/9. Profiler/model.py
new file mode 100644
index 0000000..05f4d0d
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/9. Profiler/model.py
@@ -0,0 +1,89 @@
+import torch
+import torch.nn.functional as F
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from torch import nn, optim
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+import pytorch_lightning as pl
+import torchmetrics
+from torchmetrics import Metric
+import torchvision
+
+
+class NN(pl.LightningModule):
+ def __init__(self, input_size, learning_rate, num_classes):
+ super().__init__()
+ self.lr = learning_rate
+ self.fc1 = nn.Linear(input_size, 50)
+ self.fc2 = nn.Linear(50, num_classes)
+ self.loss_fn = nn.CrossEntropyLoss()
+ self.accuracy = torchmetrics.Accuracy(
+ task="multiclass", num_classes=num_classes
+ )
+ self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
+
+ def forward(self, x):
+ x = F.relu(self.fc1(x))
+ x = self.fc2(x)
+ return x
+
+ def training_step(self, batch, batch_idx):
+ x, y = batch
+ loss, scores, y = self._common_step(batch, batch_idx)
+
+ self.log_dict(
+ {
+ "train_loss": loss,
+ },
+ on_step=False,
+ on_epoch=True,
+ prog_bar=True,
+ )
+
+ if batch_idx % 100 == 0:
+ x = x[:8]
+ grid = torchvision.utils.make_grid(x.view(-1, 1, 28, 28))
+ self.logger.experiment.add_image("mnist_images", grid, self.global_step)
+
+ return {"loss": loss, "scores": scores, "y": y}
+
+ def training_epoch_end(self, outputs):
+ scores = torch.cat([x["scores"] for x in outputs])
+ y = torch.cat([x["y"] for x in outputs])
+ self.log_dict(
+ {
+ "train_acc": self.accuracy(scores, y),
+ "train_f1": self.f1_score(scores, y),
+ },
+ on_step=False,
+ on_epoch=True,
+ prog_bar=True,
+ )
+
+ def validation_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log("val_loss", loss)
+ return loss
+
+ def test_step(self, batch, batch_idx):
+ loss, scores, y = self._common_step(batch, batch_idx)
+ self.log("test_loss", loss)
+ return loss
+
+ def _common_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ loss = self.loss_fn(scores, y)
+ return loss, scores, y
+
+ def predict_step(self, batch, batch_idx):
+ x, y = batch
+ x = x.reshape(x.size(0), -1)
+ scores = self.forward(x)
+ preds = torch.argmax(scores, dim=1)
+ return preds
+
+ def configure_optimizers(self):
+ return optim.Adam(self.parameters(), lr=self.lr)
diff --git a/ML/Pytorch/pytorch_lightning/9. Profiler/train.py b/ML/Pytorch/pytorch_lightning/9. Profiler/train.py
new file mode 100644
index 0000000..8c60695
--- /dev/null
+++ b/ML/Pytorch/pytorch_lightning/9. Profiler/train.py
@@ -0,0 +1,40 @@
+import torch
+import pytorch_lightning as pl
+from model import NN
+from dataset import MnistDataModule
+import config
+from callbacks import MyPrintingCallback, EarlyStopping
+from pytorch_lightning.loggers import TensorBoardLogger
+from pytorch_lightning.profilers import PyTorchProfiler
+
+torch.set_float32_matmul_precision("medium") # to make lightning happy
+
+if __name__ == "__main__":
+ logger = TensorBoardLogger("tb_logs", name="mnist_model_v1")
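+    # The profiler skips the first 10 steps, then repeats (wait=1, warmup=1,
+    # active=20) cycles; traces go to tb_logs/profiler0 and can be viewed with
+    # TensorBoard's profiler plugin.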
+ profiler = PyTorchProfiler(
+ on_trace_ready=torch.profiler.tensorboard_trace_handler("tb_logs/profiler0"),
+ schedule=torch.profiler.schedule(skip_first=10, wait=1, warmup=1, active=20),
+ )
+ model = NN(
+ input_size=config.INPUT_SIZE,
+ learning_rate=config.LEARNING_RATE,
+ num_classes=config.NUM_CLASSES,
+ )
+ dm = MnistDataModule(
+ data_dir=config.DATA_DIR,
+ batch_size=config.BATCH_SIZE,
+ num_workers=config.NUM_WORKERS,
+ )
+ trainer = pl.Trainer(
+ profiler=profiler,
+ logger=logger,
+ accelerator=config.ACCELERATOR,
+ devices=config.DEVICES,
+ min_epochs=1,
+ max_epochs=config.NUM_EPOCHS,
+ precision=config.PRECISION,
+ callbacks=[MyPrintingCallback(), EarlyStopping(monitor="val_loss")],
+ )
+ trainer.fit(model, dm)
+ trainer.validate(model, dm)
+ trainer.test(model, dm)
diff --git a/ML/Pytorch/recommender_systems/neural_collaborative_filtering/main.py b/ML/Pytorch/recommender_systems/neural_collaborative_filtering/main.py
new file mode 100644
index 0000000..650e95c
--- /dev/null
+++ b/ML/Pytorch/recommender_systems/neural_collaborative_filtering/main.py
@@ -0,0 +1,123 @@
+"""
+Implementation of Neural collaborative filtering (NCF)
+Next:
+ * Understand and use NDCG = Normalized Discounted Cumulative Gain
+ * Use SVD and compare results
+"""
+
+import torch
+import pytorch_lightning as pl
+import pandas as pd
+import torchmetrics
+from torch.utils.data import Dataset, DataLoader
+from torch import nn
+from sklearn.model_selection import train_test_split
+
+torch.set_float32_matmul_precision("medium") # to make lightning happy
+
+class MovieLens(Dataset):
+ def __init__(self, df_ratings):
+ self.df_ratings = df_ratings
+
+ def __len__(self):
+ return len(self.df_ratings)
+
+ def __getitem__(self, idx):
+ row = self.df_ratings.iloc[idx]
+ user_id = torch.tensor(row["user_id"], dtype=torch.long)
+ movie_id = torch.tensor(row["movie_id"], dtype=torch.long)
+ rating = torch.tensor(row["rating"], dtype=torch.float)
+ return user_id, movie_id, rating
+
+
+class LightningData(pl.LightningDataModule):
+ def __init__(self, batch_size):
+ super().__init__()
+ self.batch_size = batch_size
+
+ def prepare_data(self):
+ self.df_ratings = pd.read_csv(
+ "data/ratings.dat",
+ sep="::",
+ header=None,
+ names=["user_id", "movie_id", "rating", "timestamp"],
+ engine="python",
+ )
+
+ # split into train and test
+ self.df_ratings_train, self.df_ratings_val = train_test_split(
+ self.df_ratings, test_size=0.2, random_state=42
+ )
+
+ def setup(self, stage=None):
+ self.dataset_train = MovieLens(self.df_ratings_train)
+ self.dataset_val = MovieLens(self.df_ratings_val)
+
+ def train_dataloader(self):
+ return DataLoader(self.dataset_train, batch_size=self.batch_size, num_workers=6)
+
+ def val_dataloader(self):
+        return DataLoader(self.dataset_val, batch_size=self.batch_size, num_workers=2)
+
+class Net(nn.Module):
+ def __init__(self, n_users, n_movies, n_factors=50):
+ super().__init__()
+ self.user_factors = nn.Embedding(n_users, n_factors)
+ self.movie_factors = nn.Embedding(n_movies, n_factors)
+ self.lin = nn.Linear(n_factors * 2, 1)
+
+ def forward(self, user, movie):
+ user_embedding = self.user_factors(user)
+ movie_embedding = self.movie_factors(movie)
+ x = torch.cat([user_embedding, movie_embedding], dim=1)
+ return self.lin(x)
+
+
+class NetLightning(pl.LightningModule):
+ def __init__(self, n_users, n_movies, n_factors=50, lr=3e-4):
+ super().__init__()
+ self.num_users = n_users
+ self.num_movies = n_movies
+ self.net = Net(n_users, n_movies, n_factors)
+ self.loss_fn = nn.MSELoss()
+ self.MAE = torchmetrics.MeanAbsoluteError()
+ self.lr = lr
+
+ def forward(self, user, movie):
+ return self.net(user, movie)
+
+ def training_step(self, batch, batch_idx):
+ user, movie, rating = batch
+ out = self.forward(user, movie)
+ mae = self.MAE(out.squeeze(1), rating.float())
+ loss = self.loss_fn(out.squeeze(1), rating.float())
+ self.log_dict({"train_loss": loss, "train_mae": mae}, on_step=False, on_epoch=True, prog_bar=True)
+ return loss
+
+ def validation_step(self, batch, batch_idx):
+ user, movie, rating = batch
+ out = self.forward(user, movie)
+ mae = self.MAE(out.squeeze(1), rating.float())
+ loss = self.loss_fn(out.squeeze(1), rating.float())
+ self.log_dict({"val_loss": loss, "val_mae": mae}, on_step=False, on_epoch=True, prog_bar=True)
+
+    def predict_step(self, user_id, batch_idx=None):
+        # Score every movie for the given user by pairing that user id with all movie ids
+        movie_ids = torch.arange(0, self.num_movies, device=self.device)
+        user_ids = torch.full_like(movie_ids, int(user_id))
+        return self.forward(user_ids, movie_ids)
+
+ def configure_optimizers(self):
+ return torch.optim.Adam(self.parameters(), lr=self.lr)
+
+
+# Guard the script entry point, like the other train scripts, so multi-worker
+# DataLoaders can spawn safely.
+if __name__ == "__main__":
+    dm = LightningData(batch_size=512)
+    dm.prepare_data()
+    dm.setup()
+
+    num_movies = dm.df_ratings["movie_id"].max() + 1
+    num_users = dm.df_ratings["user_id"].max() + 1
+
+    model = NetLightning(num_users, num_movies)
+    trainer = pl.Trainer(accelerator="gpu", devices=1, max_epochs=3)
+    trainer.fit(model, dm)
+    trainer.validate(model, dm)
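+
+    # Example (assumed usage): after training, score every movie for one user,
+    # e.g. user id 1, with the predict_step defined above:
+    #   with torch.no_grad():
+    #       print(model.predict_step(torch.tensor(1)))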