mirror of
https://github.com/aladdinpersson/Machine-Learning-Collection.git
synced 2026-02-21 11:18:01 +00:00
318 lines
11 KiB
Plaintext
318 lines
11 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f54ecf0b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\"\"\"\n",
|
|
"# HuggingFace Tutorial Series\n",
|
|
"- 1. What is Huggingface?\n",
|
|
"- 2. Common tasks we can do with HuggingFace & explain the tasks briefly, like what is question answering etc\n",
|
|
"- 3. Using the HuggingFace Pipeline (High level feature)\n",
|
|
"- 4. How the pipeline works at a lower level\n",
|
|
"- 5. HuggingFace Datasets\n",
|
|
"- 6. HuggingFace Tokenizer\n",
|
|
"- 7. HuggingFace Evaluate\n",
|
|
"- 8. HuggingFace Trainer\n",
|
|
"- 9. Putting it together to finetune a news article summarizer\n",
|
|
"- 10. Making it more general and robust with Lightning and custom data loading\n",
|
|
"\"\"\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "ec1aae37",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
import warnings
warnings.simplefilter("ignore")  # silence noisy HF/Lightning deprecation warnings

import os
# Pin GPU ordering/visibility before torch initializes CUDA.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

import datasets
import pytorch_lightning as pl
# NOTE(review): load_metric is deprecated in newer `datasets` releases;
# prefer the `evaluate` package once adding a dependency is acceptable.
from datasets import load_dataset, load_metric
from transformers import (
    AutoModel,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# Trade a little matmul precision for Tensor Core speed on recent GPUs.
# (Duplicate `import torch` / `import pytorch_lightning as pl` lines removed.)
torch.set_float32_matmul_precision("medium")
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "5fd7cb0c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
# Small T5 checkpoint keeps the tutorial fast; this tokenizer is shared by
# the CSV dataset pipeline and by metric decoding further down.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "418cb03a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
class cnn_dailymail(Dataset):
    """CSV-backed CNN/DailyMail dataset for seq2seq summarization.

    Each row must provide an 'article' column (source text) and a
    'highlights' column (target summary); both are tokenized, truncated,
    and padded to a fixed ``max_length``.

    Args:
        csv_file: path to a CSV with 'article' and 'highlights' columns.
        tokenizer: HF-style tokenizer callable returning
            'input_ids' / 'attention_mask' tensors.
        max_length: truncation/padding length for both source and target.
    """

    # Value HF seq2seq models treat as "ignore this position" in the loss.
    IGNORE_INDEX = -100

    def __init__(self, csv_file, tokenizer, max_length=512):
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        # Shared tokenizer settings for source and target sequences.
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        article = self.data.loc[idx, 'article']
        highlights = self.data.loc[idx, 'highlights']

        inputs = self._encode(article)
        targets = self._encode(highlights)

        # Fix: mask padded label positions with -100 so cross-entropy ignores
        # them. The original passed raw padded ids as labels, so pad tokens
        # were counted in the loss and biased training toward padding.
        labels = targets['input_ids'].squeeze(0)
        labels = labels.masked_fill(
            targets['attention_mask'].squeeze(0) == 0, self.IGNORE_INDEX
        )

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels,
        }
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "aaa62755",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
class MyDataModule(pl.LightningDataModule):
    """LightningDataModule wiring the three cnn_dailymail CSV splits to DataLoaders.

    Args:
        train_csv / val_csv / test_csv: paths to CSVs with
            'article' and 'highlights' columns.
        tokenizer: HF tokenizer shared by every split.
        batch_size: per-dataloader batch size.
        max_length: tokenizer truncation/padding length.
    """

    def __init__(self, train_csv, val_csv, test_csv, tokenizer, batch_size=16, max_length=512):
        super().__init__()
        self.train_csv = train_csv
        self.val_csv = val_csv
        self.test_csv = test_csv
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_length = max_length

    def setup(self, stage=None):
        # Fix: include 'validate' — Trainer.validate() invokes
        # setup(stage='validate'), which the original skipped, leaving
        # self.val_dataset undefined.
        if stage in ('fit', 'validate', None):
            self.train_dataset = cnn_dailymail(self.train_csv, self.tokenizer, self.max_length)
            self.val_dataset = cnn_dailymail(self.val_csv, self.tokenizer, self.max_length)
        if stage in ('test', None):
            self.test_dataset = cnn_dailymail(self.test_csv, self.tokenizer, self.max_length)

    def train_dataloader(self):
        # Shuffle only the training split; workers parallelize on-the-fly
        # CSV row tokenization.
        return torch.utils.data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2)

    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "fbb699e1",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
class MyLightningModule(pl.LightningModule):
    """Seq2seq summarization LightningModule around a pretrained HF model.

    Fixes over the original:
    - owns its tokenizer (the original ``on_validation_epoch_end`` silently
      read the module-level global ``tokenizer``);
    - accumulates validation *token ids* in Python lists instead of
      ``torch.cat``-ing full vocab-sized logits every step (a ~vocab_size
      factor less memory, and no O(n^2) re-copying);
    - tolerates -100 ignore-index values in labels when decoding.
    """

    def __init__(self, model_name, learning_rate, weight_decay):
        super().__init__()
        self.model_name = model_name
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay

        # Pre-trained model; torch.compile speeds up training on PyTorch 2.x.
        self.model = torch.compile(AutoModelForSeq2SeqLM.from_pretrained(self.model_name))

        # Own tokenizer so validation decoding no longer depends on a global.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # NOTE(review): datasets.load_metric is deprecated upstream; migrate
        # to the `evaluate` package when adding a dependency is acceptable.
        self.metric = load_metric("rouge")

        # Per-epoch validation buffers (cleared in on_validation_epoch_end).
        self._val_pred_ids = []
        self._val_label_ids = []

    def forward(self, input_ids, attention_mask, labels=None):
        """Return (loss, logits); loss is None when labels is None."""
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        loss, logits = self(batch["input_ids"], batch["attention_mask"], batch["labels"])
        self.log('train_loss', loss, on_epoch=True, on_step=True, prog_bar=True)
        return {'loss': loss, 'logits': logits}

    def validation_step(self, batch, batch_idx):
        labels = batch["labels"]
        loss, logits = self(batch["input_ids"], batch["attention_mask"], labels)
        self.log('val_loss', loss, on_epoch=True, on_step=False)

        # Keep only argmax token ids, moved to CPU: equivalent ROUGE input at
        # a fraction of the memory of stacking raw logits on the GPU.
        self._val_pred_ids.append(logits.argmax(dim=-1).detach().cpu())
        self._val_label_ids.append(labels.detach().cpu())
        return {'loss': loss}

    def on_validation_epoch_end(self):
        if not self._val_pred_ids:
            return  # nothing accumulated (e.g. an empty validation loop)

        preds = torch.cat(self._val_pred_ids, dim=0)
        labels = torch.cat(self._val_label_ids, dim=0)

        # Map the cross-entropy ignore index back to the pad token so the
        # tokenizer can decode label sequences whose padding is masked as -100.
        labels = labels.masked_fill(labels == -100, self.tokenizer.pad_token_id)

        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Compute ROUGE-1 over the whole validation epoch.
        scores = self.metric.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            rouge_types=["rouge1"],
        )["rouge1"].mid

        self.log('rouge1_precision', scores.precision, prog_bar=True)
        self.log('rouge1_recall', scores.recall, prog_bar=True)
        self.log('rouge1_fmeasure', scores.fmeasure, prog_bar=True)

        # Reset buffers for the next validation epoch.
        self._val_pred_ids.clear()
        self._val_label_ids.clear()

    def configure_optimizers(self):
        """AdamW with the constructor-provided learning rate and weight decay."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "dd63c628",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
# File paths for the CSV splits exported from the CNN/DailyMail dataset.
train_csv = "train.csv"
val_csv = "validation.csv"
test_csv = "test.csv"

# Create the data module. Fix: the original also called dm.setup() manually,
# but Trainer.fit() calls setup() itself, so every dataset was built twice;
# the redundant call has been dropped.
dm = MyDataModule(train_csv, val_csv, test_csv, tokenizer, batch_size=16)

model = MyLightningModule(model_name="t5-small", learning_rate=1e-4, weight_decay=1e-5)
# NOTE(review): precision=16 is the legacy flag spelling; newer Lightning
# versions expect precision="16-mixed" — confirm against the installed version.
trainer = pl.Trainer(accelerator="gpu", devices=[0], max_epochs=1, precision=16)
trainer.fit(model, datamodule=dm)
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b5d3d684",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# http://localhost:18888/notebooks/cnndaily_t5_lightning_customdataloading.ipynb"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a0494596",
|
|
"metadata": {},
|
|
"source": [
|
|
"### next steps:\n",
|
|
"* Articles longer than 512 tokens are currently truncated — check whether this hurts summary quality when articles are much longer.\n",
|
|
"\n",
|
|
"#### what we've done:\n",
|
|
"* Change the data loading so it's more general, meaning on the fly loading from disk\n",
|
|
"* add torch.compile\n",
|
|
"* 1. Clean up the code, make it into scripts instead of notebook -> Train for an epoch (add multi-gpu training?)\n",
|
|
"* add tensorboard visualization\n",
|
|
"* not use pretrained weights but from scratch to ensure that training setup works and actually improving\n",
|
|
"* 2. Create an inference step, send in news article -> get summary, check that it works\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "80a2efab",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "0f9b71ab",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|