mirror of
https://github.com/aladdinpersson/Machine-Learning-Collection.git
synced 2026-02-21 11:18:01 +00:00
318 lines
11 KiB
Plaintext
318 lines
11 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f54ecf0b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\"\"\"\n",
|
|
"# HuggingFace Tutorial Series\n",
|
|
"- 1. What is Huggingface?\n",
|
|
"- 2. Common tasks we can do with HuggingFace & explain the tasks briefly, like what is question answering etc\n",
|
|
"- 3. Using the HuggingFace Pipeline (High level feature)\n",
|
|
"- 4. How the pipeline works at a lower level\n",
|
|
"- 5. HuggingFace Datasets\n",
|
|
"- 6. HuggingFace Tokenizer\n",
|
|
"- 7. HuggingFace Evaluate\n",
|
|
"- 8. HuggingFace Trainer\n",
|
|
"- 9. Putting it together to finetune a news article summarizer\n",
|
|
"- 10. Making it more general and robust with Lightning and custom data loading\n",
|
|
"\"\"\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "ec1aae37",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
import warnings
warnings.simplefilter("ignore")  # silence noisy HF/Lightning deprecation warnings

import os
# Pin GPU ordering/visibility before torch initializes CUDA.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

import datasets
import pytorch_lightning as pl
# NOTE(review): load_metric is deprecated in newer `datasets` releases;
# prefer the `evaluate` package once adding a dependency is acceptable.
from datasets import load_dataset, load_metric
from transformers import (
    AutoModel,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# Trade a little matmul precision for Tensor Core speed on recent GPUs.
# (Duplicate `import torch` / `import pytorch_lightning as pl` lines removed.)
torch.set_float32_matmul_precision("medium")
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "5fd7cb0c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
# Small T5 checkpoint keeps the tutorial fast; this tokenizer is shared by
# the CSV dataset pipeline and by metric decoding further down.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "418cb03a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
class cnn_dailymail(Dataset):
    """CSV-backed CNN/DailyMail dataset for seq2seq summarization.

    Each row must provide an 'article' column (source text) and a
    'highlights' column (target summary); both are tokenized, truncated,
    and padded to a fixed ``max_length``.

    Args:
        csv_file: path to a CSV with 'article' and 'highlights' columns.
        tokenizer: HF-style tokenizer callable returning
            'input_ids' / 'attention_mask' tensors.
        max_length: truncation/padding length for both source and target.
    """

    # Value HF seq2seq models treat as "ignore this position" in the loss.
    IGNORE_INDEX = -100

    def __init__(self, csv_file, tokenizer, max_length=512):
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        # Shared tokenizer settings for source and target sequences.
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        article = self.data.loc[idx, 'article']
        highlights = self.data.loc[idx, 'highlights']

        inputs = self._encode(article)
        targets = self._encode(highlights)

        # Fix: mask padded label positions with -100 so cross-entropy ignores
        # them. The original passed raw padded ids as labels, so pad tokens
        # were counted in the loss and biased training toward padding.
        labels = targets['input_ids'].squeeze(0)
        labels = labels.masked_fill(
            targets['attention_mask'].squeeze(0) == 0, self.IGNORE_INDEX
        )

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels,
        }
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "aaa62755",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
class MyDataModule(pl.LightningDataModule):
    """LightningDataModule wiring the three cnn_dailymail CSV splits to DataLoaders.

    Args:
        train_csv / val_csv / test_csv: paths to CSVs with
            'article' and 'highlights' columns.
        tokenizer: HF tokenizer shared by every split.
        batch_size: per-dataloader batch size.
        max_length: tokenizer truncation/padding length.
    """

    def __init__(self, train_csv, val_csv, test_csv, tokenizer, batch_size=16, max_length=512):
        super().__init__()
        self.train_csv = train_csv
        self.val_csv = val_csv
        self.test_csv = test_csv
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_length = max_length

    def setup(self, stage=None):
        # Fix: include 'validate' — Trainer.validate() invokes
        # setup(stage='validate'), which the original skipped, leaving
        # self.val_dataset undefined.
        if stage in ('fit', 'validate', None):
            self.train_dataset = cnn_dailymail(self.train_csv, self.tokenizer, self.max_length)
            self.val_dataset = cnn_dailymail(self.val_csv, self.tokenizer, self.max_length)
        if stage in ('test', None):
            self.test_dataset = cnn_dailymail(self.test_csv, self.tokenizer, self.max_length)

    def train_dataloader(self):
        # Shuffle only the training split; workers parallelize on-the-fly
        # CSV row tokenization.
        return torch.utils.data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2)

    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "fbb699e1",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
class MyLightningModule(pl.LightningModule):
    """Seq2seq summarization LightningModule around a pretrained HF model.

    Fixes over the original:
    - owns its tokenizer (the original ``on_validation_epoch_end`` silently
      read the module-level global ``tokenizer``);
    - accumulates validation *token ids* in Python lists instead of
      ``torch.cat``-ing full vocab-sized logits every step (a ~vocab_size
      factor less memory, and no O(n^2) re-copying);
    - tolerates -100 ignore-index values in labels when decoding.
    """

    def __init__(self, model_name, learning_rate, weight_decay):
        super().__init__()
        self.model_name = model_name
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay

        # Pre-trained model; torch.compile speeds up training on PyTorch 2.x.
        self.model = torch.compile(AutoModelForSeq2SeqLM.from_pretrained(self.model_name))

        # Own tokenizer so validation decoding no longer depends on a global.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # NOTE(review): datasets.load_metric is deprecated upstream; migrate
        # to the `evaluate` package when adding a dependency is acceptable.
        self.metric = load_metric("rouge")

        # Per-epoch validation buffers (cleared in on_validation_epoch_end).
        self._val_pred_ids = []
        self._val_label_ids = []

    def forward(self, input_ids, attention_mask, labels=None):
        """Return (loss, logits); loss is None when labels is None."""
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        loss, logits = self(batch["input_ids"], batch["attention_mask"], batch["labels"])
        self.log('train_loss', loss, on_epoch=True, on_step=True, prog_bar=True)
        return {'loss': loss, 'logits': logits}

    def validation_step(self, batch, batch_idx):
        labels = batch["labels"]
        loss, logits = self(batch["input_ids"], batch["attention_mask"], labels)
        self.log('val_loss', loss, on_epoch=True, on_step=False)

        # Keep only argmax token ids, moved to CPU: equivalent ROUGE input at
        # a fraction of the memory of stacking raw logits on the GPU.
        self._val_pred_ids.append(logits.argmax(dim=-1).detach().cpu())
        self._val_label_ids.append(labels.detach().cpu())
        return {'loss': loss}

    def on_validation_epoch_end(self):
        if not self._val_pred_ids:
            return  # nothing accumulated (e.g. an empty validation loop)

        preds = torch.cat(self._val_pred_ids, dim=0)
        labels = torch.cat(self._val_label_ids, dim=0)

        # Map the cross-entropy ignore index back to the pad token so the
        # tokenizer can decode label sequences whose padding is masked as -100.
        labels = labels.masked_fill(labels == -100, self.tokenizer.pad_token_id)

        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Compute ROUGE-1 over the whole validation epoch.
        scores = self.metric.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            rouge_types=["rouge1"],
        )["rouge1"].mid

        self.log('rouge1_precision', scores.precision, prog_bar=True)
        self.log('rouge1_recall', scores.recall, prog_bar=True)
        self.log('rouge1_fmeasure', scores.fmeasure, prog_bar=True)

        # Reset buffers for the next validation epoch.
        self._val_pred_ids.clear()
        self._val_label_ids.clear()

    def configure_optimizers(self):
        """AdamW with the constructor-provided learning rate and weight decay."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "dd63c628",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
# File paths for the CSV splits exported from the CNN/DailyMail dataset.
train_csv = "train.csv"
val_csv = "validation.csv"
test_csv = "test.csv"

# Create the data module. Fix: the original also called dm.setup() manually,
# but Trainer.fit() calls setup() itself, so every dataset was built twice;
# the redundant call has been dropped.
dm = MyDataModule(train_csv, val_csv, test_csv, tokenizer, batch_size=16)

model = MyLightningModule(model_name="t5-small", learning_rate=1e-4, weight_decay=1e-5)
# NOTE(review): precision=16 is the legacy flag spelling; newer Lightning
# versions expect precision="16-mixed" — confirm against the installed version.
trainer = pl.Trainer(accelerator="gpu", devices=[0], max_epochs=1, precision=16)
trainer.fit(model, datamodule=dm)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b5d3d684",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# http://localhost:18888/notebooks/cnndaily_t5_lightning_customdataloading.ipynb"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a0494596",
|
|
"metadata": {},
|
|
"source": [
|
|
"### next steps:\n",
|
|
"* Articles longer than 512 tokens are currently truncated — check whether this hurts summary quality when articles are much longer.\n",
|
|
"\n",
|
|
"#### what we've done:\n",
|
|
"* Change the data loading so it's more general, meaning on the fly loading from disk\n",
|
|
"* add torch.compile\n",
|
|
"* 1. Clean up the code, make it into scripts instead of notebook -> Train for an epoch (add multi-gpu training?)\n",
|
|
"* add tensorboard visualization\n",
|
|
"* not use pretrained weights but from scratch to ensure that training setup works and actually improving\n",
|
|
"* 2. Create an inference step, send in news article -> get summary, check that it works\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "80a2efab",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "0f9b71ab",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|