add lightning code, finetuning whisper, recommender system neural collaborative filtering

This commit is contained in:
Aladdin Persson
2023-02-21 16:25:42 +01:00
parent c646ef65e2
commit 94f6c024fe
51 changed files with 17977 additions and 25 deletions

View File

@@ -0,0 +1,190 @@
"""
Simple pytorch lightning example
"""
# Imports
import torch
import torch.nn.functional as F # Parameterless functions, like (some) activation functions
import torchvision.datasets as datasets # Standard datasets
import torchvision.transforms as transforms # Transformations we can perform on our dataset for augmentation
from torch import optim # For optimizers like SGD, Adam, etc.
from torch import nn # All neural network modules
from torch.utils.data import (
DataLoader,
) # Gives easier dataset managment by creating mini batches etc.
from tqdm import tqdm # For nice progress bar!
import pytorch_lightning as pl
import torchmetrics
from pytorch_lightning.callbacks import Callback, EarlyStopping
precision = "medium"
torch.set_float32_matmul_precision(precision)
criterion = nn.CrossEntropyLoss()
## use 20% of training data for validation
# train_set_size = int(len(train_dataset) * 0.8)
# valid_set_size = len(train_dataset) - train_set_size
#
## split the train set into two
# seed = torch.Generator().manual_seed(42)
# train_dataset, val_dataset = torch.utils.data.random_split(
# train_dataset, [train_set_size, valid_set_size], generator=seed
# )
class CNNLightning(pl.LightningModule):
def __init__(self, lr=3e-4, in_channels=1, num_classes=10):
super().__init__()
self.lr = lr
        self.train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
self.conv1 = nn.Conv2d(
in_channels=in_channels,
out_channels=8,
kernel_size=3,
stride=1,
padding=1,
)
self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv2 = nn.Conv2d(
in_channels=8,
out_channels=16,
kernel_size=3,
stride=1,
padding=1,
)
self.fc1 = nn.Linear(16 * 7 * 7, num_classes)
def training_step(self, batch, batch_idx):
x, y = batch
y_hat = self._common_step(x, batch_idx)
loss = criterion(y_hat, y)
accuracy = self.train_acc(y_hat, y)
self.log(
"train_acc_step",
self.train_acc,
on_step=True,
on_epoch=False,
prog_bar=True,
)
return loss
def training_epoch_end(self, outputs):
self.train_acc.reset()
def test_step(self, batch, batch_idx):
x, y = batch
y_hat = self._common_step(x, batch_idx)
loss = F.cross_entropy(y_hat, y)
accuracy = self.test_acc(y_hat, y)
self.log("test_loss", loss, on_step=True)
self.log("test_acc", accuracy, on_step=True)
def validation_step(self, batch, batch_idx):
x, y = batch
y_hat = self._common_step(x, batch_idx)
loss = F.cross_entropy(y_hat, y)
accuracy = self.test_acc(y_hat, y)
self.log("val_loss", loss, on_step=True)
self.log("val_acc", accuracy, on_step=True)
def predict_step(self, batch, batch_idx):
x, y = batch
        y_hat = self._common_step(x, batch_idx)
return y_hat
def _common_step(self, x, batch_idx):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.reshape(x.shape[0], -1)
y_hat = self.fc1(x)
return y_hat
def configure_optimizers(self):
optimizer = optim.Adam(self.parameters(), lr=self.lr)
return optimizer
class MNISTDataModule(pl.LightningDataModule):
def __init__(self, batch_size=512):
super().__init__()
self.batch_size = batch_size
def setup(self, stage):
        mnist_full = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
self.mnist_test = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
self.mnist_train, self.mnist_val = torch.utils.data.random_split(
mnist_full, [55000, 5000]
)
def train_dataloader(self):
return DataLoader(
self.mnist_train,
batch_size=self.batch_size,
num_workers=6,
shuffle=True,
)
def val_dataloader(self):
return DataLoader(
self.mnist_val, batch_size=self.batch_size, num_workers=2, shuffle=False
)
def test_dataloader(self):
return DataLoader(
self.mnist_test, batch_size=self.batch_size, num_workers=2, shuffle=False
)
class MyPrintingCallback(Callback):
def on_train_start(self, trainer, pl_module):
print("Training is starting")
def on_train_end(self, trainer, pl_module):
print("Training is ending")
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load Data
if __name__ == "__main__":
# Initialize network
model_lightning = CNNLightning()
trainer = pl.Trainer(
#fast_dev_run=True,
# overfit_batches=3,
max_epochs=5,
precision=16,
accelerator="gpu",
devices=[0,1],
callbacks=[EarlyStopping(monitor="val_loss", mode="min")],
auto_lr_find=True,
enable_model_summary=True,
profiler="simple",
strategy="deepspeed_stage_1",
# accumulate_grad_batches=2,
# auto_scale_batch_size="binsearch",
# log_every_n_steps=1,
)
dm = MNISTDataModule()
# trainer tune first to find best batch size and lr
trainer.tune(model_lightning, dm)
trainer.fit(
model=model_lightning,
datamodule=dm,
)
# test model on test loader from LightningDataModule
trainer.test(model=model_lightning, datamodule=dm)

File diff suppressed because it is too large

View File

@@ -0,0 +1,580 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "fc8e5ea0",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"print(torch.cuda.is_available())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d8a1e039",
"metadata": {},
"outputs": [],
"source": [
"from transformers import pipeline\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ad73024",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"classifier = pipeline(\"zero-shot-classification\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "04f7e02c",
"metadata": {},
"outputs": [],
"source": [
"classifier(\n",
" \"This is a course about the Transformers library\",\n",
" candidate_labels=[\"machine learning\", \"gym\", \"food\"],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fb246c2",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from transformers import pipeline\n",
"generator = pipeline(task=\"text-generation\", model=\"bigscience/bloom-1b7\", device=0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4e174f0",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForTokenClassification, AutoModel, AutoTokenizer\n",
"import torch\n",
"\n",
"# Define input text and pre-trained model checkpoint\n",
"text = \"My name is wolfgang and I live in berlin\"\n",
"checkpoint = \"Jean-Baptiste/roberta-large-ner-english\"\n",
"\n",
"# Instantiate tokenizer and encode input text\n",
"tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
"inputs = tokenizer(text, padding=True, truncation=True, return_tensors=\"pt\")\n",
"\n",
"# Instantiate model and generate output\n",
"model = AutoModel.from_pretrained(checkpoint)\n",
"outputs = model(**inputs)\n",
"print(outputs[0].shape)\n",
"\n",
"# Instantiate token classification model and generate predictions\n",
"model = AutoModelForTokenClassification.from_pretrained(checkpoint)\n",
"outputs = model(**inputs)\n",
"predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)\n",
"print(predictions)\n",
"print(model.config.id2label)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8212bbaa",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')\n",
"model = AutoModelForMaskedLM.from_pretrained(\"xlm-roberta-large\")\n",
"\n",
"# prepare input\n",
"text = \"Replace me by any text you'd like.\"\n",
"encoded_input = tokenizer(text, return_tensors='pt')\n",
"\n",
"# forward pass\n",
"output = model(**encoded_input)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "314cba41",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
"\n",
"# Load the pre-trained tokenizer and model\n",
"tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')\n",
"model = AutoModelForMaskedLM.from_pretrained(\"xlm-roberta-large\")\n",
"\n",
"# Define the input sentence with a masked token\n",
"text = \"I want to <mask> a new car tomorrow.\"\n",
"\n",
"# Tokenize the input sentence, replacing the masked token with a special [MASK] token\n",
"encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')\n",
"\n",
"print(output.logits.shape)\n",
"print(encoded_input['input_ids'][0].tolist().index(tokenizer.mask_token_id))\n",
"\n",
"# Extract the predicted probabilities for the masked token\n",
"predicted_probabilities = output.logits[0, encoded_input['input_ids'][0].tolist().index(tokenizer.mask_token_id)]\n",
"predicted_probabilities = torch.nn.functional.softmax(predicted_probabilities, dim=-1)\n",
"\n",
"# Get the top-k most probable predictions for the masked token\n",
"k = 5\n",
"top_k = torch.topk(predicted_probabilities, k)\n",
"for i in range(k):\n",
" token = tokenizer.convert_ids_to_tokens(top_k.indices[i].item())\n",
" score = top_k.values[i].item()\n",
" print(f\"Prediction {i+1}: '{token}' with probability {score:.5f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6187e77e",
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
"\n",
"sequences = [\n",
" \"Using a Transformer network is simple\",\n",
" \"The quick brown fox jumps over the lazy dog\",\n",
" \"To be or not to be, that is the question\"\n",
"]\n",
"\n",
"# Tokenize the input sequences and convert them to padded and truncated integer token IDs\n",
"inputs = tokenizer(\n",
" sequences,\n",
" padding=True,\n",
" truncation=True,\n",
" return_tensors=\"pt\"\n",
")\n",
"\n",
"# Print the resulting input IDs and attention masks\n",
"print(inputs['input_ids'])\n",
"print(inputs['attention_mask'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fc259c5a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "43466db6",
"metadata": {},
"source": [
"Huggingface:\n",
"\n",
"1. Understanding how to use the Pipeline (probably most useful) for various tasks, easy to use, and the different subtasks it can do like translation, QA, zero shot, sentiment analysis, token classification, etc. \n",
"2. Understood how pipeline works in more detail by using AutoModel for various tasks as well as AutoTokenizer\n",
"3. Load dataset\n",
"4. How to finetune\n",
"5. How to evaluate\n",
"6. "
]
},
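{
"cell_type": "code",
"execution_count": null,
"id": "added-pipeline-example",
"metadata": {},
"outputs": [],
"source": [
"# Added example cell (not in the original notebook) for point 1 in the summary above:\n",
"# the same pipeline API covers other subtasks as well. No model is pinned here, so\n",
"# transformers falls back to its default sentiment-analysis checkpoint (pass model=...\n",
"# to control which one is used).\n",
"from transformers import pipeline\n",
"\n",
"sentiment = pipeline(\"sentiment-analysis\")\n",
"print(sentiment([\"I've been waiting for a HuggingFace course my whole life.\", \"This is awful.\"]))"
]
},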
{
"cell_type": "code",
"execution_count": null,
"id": "97c474f2",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ed5d8c2",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification\n",
"\n",
"# Same as before\n",
"checkpoint = \"bert-base-uncased\"\n",
"tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
"model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
"sequences = [\n",
" \"I've been waiting for a HuggingFace course my whole life.\",\n",
" \"This course is amazing!\",\n",
"]\n",
"batch = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"pt\")\n",
"\n",
"# This is new\n",
"batch[\"labels\"] = torch.tensor([1, 1])\n",
"\n",
"optimizer = AdamW(model.parameters())\n",
"loss = model(**batch).loss\n",
"loss.backward()\n",
"optimizer.step()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c598624f",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"raw_datasets = load_dataset(\"glue\", \"mrpc\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd296227",
"metadata": {},
"outputs": [],
"source": [
"raw_train_dataset = raw_datasets[\"train\"]\n",
"raw_train_dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e462947a",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"from transformers import AutoTokenizer, DataCollatorWithPadding\n",
"raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
"\n",
"checkpoint = \"bert-base-uncased\"\n",
"tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
"\n",
"def tokenize_function(example):\n",
" return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)\n",
"\n",
"\n",
"tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
"data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
"\n",
"\n",
"from transformers import TrainingArguments\n",
"training_args = TrainingArguments(\"test-trainer\")\n",
"\n",
"from transformers import AutoModelForSequenceClassification\n",
"model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
"\n",
"import numpy as np\n",
"import evaluate\n",
"\n",
"def compute_metrics(eval_preds):\n",
" metric = evaluate.load(\"glue\", \"mrpc\")\n",
" logits, labels = eval_preds\n",
" predictions = np.argmax(logits, axis=-1)\n",
" return metric.compute(predictions=predictions, references=labels)\n",
"\n",
"training_args = TrainingArguments(\"test-trainer\", evaluation_strategy=\"epoch\")\n",
"model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
"\n",
"trainer = Trainer(\n",
" model,\n",
" training_args,\n",
" train_dataset=tokenized_datasets[\"train\"],\n",
" eval_dataset=tokenized_datasets[\"validation\"],\n",
" data_collator=data_collator,\n",
" tokenizer=tokenizer,\n",
" compute_metrics=compute_metrics,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e2795dc",
"metadata": {},
"outputs": [],
"source": [
"from transformers import TrainingArguments\n",
"training_args = TrainingArguments(\"test-trainer\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3af29cd5",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForSequenceClassification\n",
"model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "817f644e",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import evaluate"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42819a6c",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def compute_metrics(eval_preds):\n",
" metric = evaluate.load(\"glue\", \"mrpc\")\n",
" logits, labels = eval_preds\n",
" predictions = np.argmax(logits, axis=-1)\n",
" return metric.compute(predictions=predictions, references=labels)\n",
"\n",
"training_args = TrainingArguments(\"test-trainer\", evaluation_strategy=\"epoch\")\n",
"model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
"\n",
"trainer = Trainer(\n",
" model,\n",
" training_args,\n",
" train_dataset=tokenized_datasets[\"train\"],\n",
" eval_dataset=tokenized_datasets[\"validation\"],\n",
" data_collator=data_collator,\n",
" tokenizer=tokenizer,\n",
" compute_metrics=compute_metrics,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb5986b0",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer\n",
"from datasets import load_dataset\n",
"batch_size=32\n",
"\n",
"# Define the generator function to preprocess the data in batches\n",
"def preprocess_generator(examples):\n",
" for i in range(0, len(examples[\"article\"]), batch_size):\n",
" batch = examples[\"article\"][i:i+batch_size]\n",
" targets = examples[\"highlights\"][i:i+batch_size]\n",
" model_inputs = tokenizer(batch, max_length=512, padding=\"max_length\", truncation=True)\n",
" with tokenizer.as_target_tokenizer():\n",
" model_targets = tokenizer(targets, max_length=128, padding=\"max_length\", truncation=True)\n",
" model_inputs[\"labels\"] = model_targets[\"input_ids\"]\n",
" yield model_inputs\n",
"\n",
"def preprocess_function(examples):\n",
" articles = [ex for ex in examples[\"article\"]]\n",
" summaries = [ex for ex in examples[\"highlights\"]]\n",
"\n",
" model_inputs = tokenizer(articles, max_length=512, padding=\"max_length\", truncation=True)\n",
" with tokenizer.as_target_tokenizer():\n",
" model_targets = tokenizer(summaries, max_length=128, padding=\"max_length\", truncation=True)\n",
" \n",
" model_inputs[\"labels\"] = model_targets[\"input_ids\"]\n",
" return model_inputs\n",
" \n",
"# Load the dataset\n",
"raw_datasets = load_dataset(\"cnn_dailymail\", \"3.0.0\")\n",
"preprocessed_datasets = raw_datasets.map(preprocess_function, batched=True, num_proc=4)\n",
"\n",
"# Load the pre-trained model and tokenizer\n",
"model_name = \"t5-small\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
"\n",
"# Define the data collator\n",
"data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n",
"\n",
"# Initialize the trainer arguments\n",
"training_args = Seq2SeqTrainingArguments(\n",
" output_dir=\"./results\",\n",
" evaluation_strategy = \"epoch\",\n",
" learning_rate=2e-5,\n",
" per_device_train_batch_size=batch_size,\n",
" max_steps=1000,\n",
" weight_decay=0.01,\n",
" push_to_hub=False,\n",
")\n",
"\n",
"# Initialize the trainer\n",
"trainer = Seq2SeqTrainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=train_ds,\n",
" data_collator=data_collator,\n",
" tokenizer=tokenizer,\n",
")\n",
"\n",
"# Start the training\n",
"trainer.train()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d62583e",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_metric"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d310a7b3",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"preprocessed_datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "99d422cc",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Load the pre-trained model and tokenizer\n",
"model_name = \"t5-small\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
"\n",
"# Define the data collator\n",
"data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n",
"\n",
"# Initialize the trainer arguments\n",
"training_args = Seq2SeqTrainingArguments(\n",
" output_dir=\"./results\",\n",
" learning_rate=2e-5,\n",
" per_device_train_batch_size=batch_size,\n",
" max_steps=5000,\n",
" weight_decay=0.01,\n",
" push_to_hub=False,\n",
" evaluation_strategy = \"steps\",\n",
" eval_steps = 50,\n",
")\n",
"\n",
"# Load the ROUGE metric\n",
"metric = load_metric(\"rouge\")\n",
"\n",
"# Define the evaluation function\n",
"def compute_metrics(pred):\n",
" labels = pred.label_ids\n",
" preds = pred.predictions\n",
" \n",
" decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
" decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
" \n",
" scores = metric.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=[\"rouge1\"])[\"rouge1\"].mid\n",
" \n",
" return {\"rouge1_precision\": scores.precision, \"rouge1_recall\": scores.recall, \"rouge1_fmeasure\": scores.fmeasure}\n",
"\n",
"\n",
"# Initialize the trainer\n",
"trainer = Seq2SeqTrainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=preprocessed_datasets[\"train\"],\n",
" eval_dataset=preprocessed_datasets[\"validation\"],\n",
" data_collator=data_collator,\n",
" tokenizer=tokenizer,\n",
" compute_metrics=compute_metrics,\n",
")\n",
"\n",
"# Start the training\n",
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5e97b57",
"metadata": {},
"outputs": [],
"source": [
"!pip install nltk\n",
"!pip install rouge_score"
]
},
{
"cell_type": "markdown",
"id": "558c3e66",
"metadata": {},
"source": [
"# Goal:\n",
"\n",
"1. Implement full training from dataloading (dailycnn dataset), to model training, evaluation, etc, using HF. \n",
"* Right now: stuck on on the fly dataset loading, we don't want to cache because this would take a lot of disk space etc.\n",
"\n",
"2. After we get step 1) working, we want to go deeper on every step, so download the dataset and load it as a custom dataset rather than using huggingface simple API, in order to make it more general. Compare with loading the ds as a custom HF dataset or using pytorch class together with lightning. Speed difference? Convenience? Also we want to use the lightning Trainer so see how we can integrate that. And then compare HF to the lightning + hf model approach and see what we like the most."
]
},
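{
"cell_type": "code",
"execution_count": null,
"id": "added-streaming-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Added sketch (not from the original notebook) for the on-the-fly loading problem above:\n",
"# datasets can stream cnn_dailymail instead of caching it to disk, and .map then runs\n",
"# lazily as examples are pulled. Whether this is fast enough for training is left open.\n",
"from datasets import load_dataset\n",
"\n",
"streamed = load_dataset(\"cnn_dailymail\", \"3.0.0\", split=\"train\", streaming=True)\n",
"streamed = streamed.map(preprocess_function, batched=True)\n",
"\n",
"# Pull a single preprocessed example to check the pipeline end to end\n",
"first = next(iter(streamed))\n",
"print(first.keys())"
]
},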
{
"cell_type": "code",
"execution_count": null,
"id": "624d49ca",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,41 @@
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import Trainer
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer")
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
def compute_metrics(eval_preds):
metric = evaluate.load("glue", "mrpc")
logits, labels = eval_preds
predictions = np.argmax(logits, axis=-1)
return metric.compute(predictions=predictions, references=labels)
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
trainer = Trainer(
model,
training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
data_collator=data_collator,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
)
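
# Added sketch (the original file stops after constructing the Trainer): actually run
# finetuning and print the final GLUE/MRPC metrics. Whether training was meant to be
# launched from this script is an assumption.
if __name__ == "__main__":
    trainer.train()
    print(trainer.evaluate())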

View File

@@ -0,0 +1,2 @@
l = ["cat", "dog"]
sentence = "The quick brown fox jumps over the lazy dog"

View File

@@ -0,0 +1,60 @@
# Imports
import torch
import torchvision.datasets as datasets # Standard datasets
import torchvision.transforms as transforms # Transformations we can perform on our dataset for augmentation
from torch.utils.data import DataLoader
import pytorch_lightning as pl
class MNISTDataModule(pl.LightningDataModule):
def __init__(self, batch_size, num_workers):
super().__init__()
self.batch_size = batch_size
self.num_workers = num_workers
def setup(self, stage):
        mnist_full = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
self.mnist_test = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
self.mnist_train, self.mnist_val = torch.utils.data.random_split(
mnist_full, [55000, 5000]
)
def train_dataloader(self):
return DataLoader(
self.mnist_train,
batch_size=self.batch_size,
num_workers=self.num_workers,
persistent_workers=True,
shuffle=True,
)
def val_dataloader(self):
return DataLoader(
self.mnist_val,
batch_size=self.batch_size,
num_workers=self.num_workers,
persistent_workers=True,
shuffle=False,
)
def test_dataloader(self):
return DataLoader(
self.mnist_test,
batch_size=self.batch_size,
num_workers=self.num_workers,
persistent_workers=True,
shuffle=False,
)
# check that it works
if __name__ == "__main__":
    dm = MNISTDataModule(batch_size=64, num_workers=2)  # both arguments are required here
dm.setup("fit")
print(len(dm.mnist_train))
print(len(dm.mnist_val))
print(len(dm.mnist_test))

View File

@@ -0,0 +1,92 @@
import torch
import torchvision
from torch import nn
import pytorch_lightning as pl
class VAEpl(pl.LightningModule):
def __init__(self, lr, input_dim=784, h_dim=200, z_dim=20):
super().__init__()
self.lr = lr
self.loss_fn = nn.BCELoss(reduction="sum")
self.input_dim = input_dim
# encoder
self.img_2hid = nn.Linear(input_dim, h_dim)
self.hid_2mu = nn.Linear(h_dim, z_dim)
self.hid_2sigma = nn.Linear(h_dim, z_dim)
# decoder
self.z_2hid = nn.Linear(z_dim, h_dim)
self.hid_2img = nn.Linear(h_dim, input_dim)
self.relu = nn.ReLU()
self.sigmoid = nn.Sigmoid()
def encode(self, x):
h = self.relu(self.img_2hid(x))
mu, sigma = self.hid_2mu(h), self.hid_2sigma(h)
return mu, sigma
def decode(self, z):
h = self.relu(self.z_2hid(z))
return torch.sigmoid(self.hid_2img(h))
def forward(self, x):
mu, sigma = self.encode(x)
epsilon = torch.randn_like(sigma)
z_new = mu + sigma * epsilon
x_reconstructed = self.decode(z_new)
return x_reconstructed, mu, sigma
def training_step(self, batch, batch_idx):
x, _ = batch
x = x.view(-1, self.input_dim)
x_reconstructed, mu, sigma = self.forward(x)
reconstruction_loss = self.loss_fn(x_reconstructed, x)
kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
loss = reconstruction_loss + kl_div
self.log("train_loss", loss, sync_dist=True)
        # add logging of images to tensorboard, x_reconstructed and x, so that
        # it updates every 100 steps and we can see the progress pictures in tensorboard
if batch_idx % 100 == 0:
# take out the first 8
x = x[:8]
x_reconstructed = x_reconstructed[:8]
grid = torchvision.utils.make_grid(x_reconstructed.view(-1, 1, 28, 28))
self.logger.experiment.add_image("reconstructed", grid, self.global_step)
grid = torchvision.utils.make_grid(x.view(-1, 1, 28, 28))
self.logger.experiment.add_image("original", grid, self.global_step)
return loss
def validation_step(self, batch, batch_idx):
x, _ = batch
x = x.view(-1, self.input_dim)
x_reconstructed, mu, sigma = self.forward(x)
reconstruction_loss = self.loss_fn(x_reconstructed, x)
kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
loss = reconstruction_loss + kl_div
self.log("val_loss", loss, sync_dist=True)
return loss
def test_step(self, batch, batch_idx):
x, _ = batch
x = x.view(-1, self.input_dim)
x_reconstructed, mu, sigma = self.forward(x)
reconstruction_loss = self.loss_fn(x_reconstructed, x)
kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
loss = reconstruction_loss + kl_div
self.log("test_loss", loss, sync_dist=True)
return loss
def configure_optimizers(self):
optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
return optimizer
if __name__ == "__main__":
batch_size = 8
x = torch.randn(batch_size, 28 * 28 * 1)
    vae_pl = VAEpl(lr=3e-4)  # lr has no default, so pass one for this smoke test
x_reconstructed, mu, sigma = vae_pl(x)
print(x_reconstructed.shape)

View File

@@ -0,0 +1,49 @@
import torch
import torchvision.datasets as datasets # Standard datasets
from tqdm import tqdm
from torch import nn, optim
from torchvision import transforms
from torchvision.utils import save_image
from torch.utils.data import DataLoader
from dataset import MNISTDataModule
import pytorch_lightning as pl
from model import VAEpl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.strategies import DeepSpeedStrategy
torch.set_float32_matmul_precision("medium")
"""
GOALS:
* Understand the strategy (deepspeed, ddp, etc) and how to use it
* Setup a config, for scheduler etc instead of configuring it in each sub-module
* Metrics
"""
# things to add
lr = 3e-4
batch_size = 128
num_workers = 2
model = VAEpl(lr)
dm = MNISTDataModule(batch_size, num_workers)
logger = TensorBoardLogger("my_checkpoint", name="scheduler_autolr_vae_pl_model")
# add callback for learning rate monitor, model checkpoint, and scheduler on plateau
callbacks = [pl.callbacks.LearningRateMonitor(logging_interval="step"),
pl.callbacks.ModelCheckpoint(monitor="val_loss", save_top_k=1, mode="min", save_last=True),
]
if __name__ == "__main__":
trainer = pl.Trainer(
max_epochs=100,
accelerator="gpu",
devices=2,
logger=logger,
#precision=16,
strategy=DeepSpeedStrategy(
stage=0,
),
)
#trainer.tune(model, dm)
trainer.fit(model, dm)
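
# Hedged sketch for the "scheduler on plateau" item in the GOALS/callbacks notes above
# (an assumption about how it could be wired in, not the final setup): a LightningModule
# variant whose configure_optimizers also returns a ReduceLROnPlateau scheduler that
# monitors the logged "val_loss".
class VAEplWithScheduler(VAEpl):
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="min", factor=0.5, patience=3
        )
        # Lightning steps the scheduler whenever the monitored metric plateaus.
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "monitor": "val_loss"},
        }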

View File

@@ -0,0 +1,41 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
# import save_image from torchvision.utils
from torchvision.utils import save_image
def inference(model, dataset, digit, num_examples=1):
"""
Generates (num_examples) of a particular digit.
Specifically we extract an example of each digit,
then after we have the mu, sigma representation for
each digit we can sample from that.
After we sample we can run the decoder part of the VAE
and generate examples.
"""
images = []
idx = 0
for x, y in dataset:
if y == idx:
images.append(x)
idx += 1
if idx == 10:
break
encodings_digit = []
for d in range(10):
with torch.no_grad():
mu, sigma = model.encode(images[d].view(1, 784))
encodings_digit.append((mu, sigma))
mu, sigma = encodings_digit[digit]
for example in range(num_examples):
epsilon = torch.randn_like(sigma)
z = mu + sigma * epsilon
out = model.decode(z)
out = out.view(-1, 1, 28, 28)
save_image(out, f"generated_{digit}_ex{example}.png")

View File

@@ -23,27 +23,7 @@ model = VariationalAutoEncoder(INPUT_DIM, H_DIM, Z_DIM).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR_RATE)
loss_fn = nn.BCELoss(reduction="sum")
# Start Training
for epoch in range(NUM_EPOCHS):
loop = tqdm(enumerate(train_loader))
for i, (x, _) in loop:
# Forward pass
x = x.to(DEVICE).view(x.shape[0], INPUT_DIM)
x_reconstructed, mu, sigma = model(x)
# Compute loss
reconstruction_loss = loss_fn(x_reconstructed, x)
kl_div = -torch.sum(1 + torch.log(sigma.pow(2)) - mu.pow(2) - sigma.pow(2))
# Backprop
loss = reconstruction_loss + kl_div
optimizer.zero_grad()
loss.backward()
optimizer.step()
loop.set_postfix(loss=loss.item())
model = model.to("cpu")
def inference(digit, num_examples=1):
"""
Generates (num_examples) of a particular digit.
@@ -79,8 +59,3 @@ def inference(digit, num_examples=1):
for idx in range(10):
inference(idx, num_examples=5)

View File

@@ -0,0 +1,120 @@
"""
Create a PyTorch custom dataset that loads the file data/other.tsv, which contains
the path to each audio clip and its text transcription.
"""
import pytorch_lightning as pl
from tqdm import tqdm
import ffmpeg
import os
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
import sys
class CommonVoice(Dataset):
def __init__(self, data_dir, whisper_model="tiny"):
self.sampling_rate = 16_000
self.data_dir = data_dir
self.data = pd.read_csv(
os.path.join(data_dir, "other.tsv"),
sep="\t",
)
self.feature_extractor = WhisperFeatureExtractor.from_pretrained(
f"openai/whisper-{whisper_model}"
)
self.tokenizer = WhisperTokenizer.from_pretrained(
f"openai/whisper-{whisper_model}", language="sv", task="transcribe"
)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
audio_file_path = os.path.join(
self.data_dir + "clips/", self.data.iloc[idx]["path"]
)
sentence = self.data.iloc[idx]["sentence"]
text = self.tokenizer(sentence).input_ids
out, _ = (
ffmpeg.input(audio_file_path, threads=0)
.output(
"-", format="s16le", acodec="pcm_s16le", ac=1, ar=self.sampling_rate
)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
# run feature extractor
audio_features = self.feature_extractor(
out, sampling_rate=self.sampling_rate, return_tensors="pt"
)
return audio_features, text
# Create a collator that will pad the audio features and text labels
class DataCollatorSpeechSeq2SeqWithPadding:
def __init__(self, feature_extractor, tokenizer):
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
def __call__(self, batch):
text_features = [{"input_ids": x[1]} for x in batch]
batch_text = self.tokenizer.pad(
text_features, return_tensors="pt",
)
audio_features = [{"input_features": x[0]["input_features"]} for x in batch]
batch_audio = self.feature_extractor.pad(
audio_features, return_tensors="pt",
)
batch_text["input_ids"] = batch_text["input_ids"].masked_fill(
batch_text["attention_mask"].ne(1), -100
)
batch_audio["input_features"] = batch_audio["input_features"].squeeze(1)
labels = batch_text["input_ids"].clone()
if (labels[:, 0] == self.tokenizer.encode("")[0]).all().cpu().item():
labels = labels[:, 1:]
batch_text["labels"] = labels
return batch_audio, batch_text
# Put into a lightning datamodule
class WhisperDataset(pl.LightningDataModule):
def __init__(self, data_dir, batch_size=32, num_workers=0, whisper_model="tiny"):
super().__init__()
self.data_dir = data_dir
self.batch_size = batch_size
self.num_workers = num_workers
self.whisper_model = whisper_model
self.sampling_rate = 16_000
def setup(self, stage=None):
self.dataset = CommonVoice(self.data_dir, self.whisper_model)
self.data_collator = DataCollatorSpeechSeq2SeqWithPadding(
self.dataset.feature_extractor, self.dataset.tokenizer
)
def train_dataloader(self):
return DataLoader(
self.dataset,
batch_size=self.batch_size,
shuffle=True,
num_workers=self.num_workers,
collate_fn=self.data_collator,
)
# Test that the lightning datamodule works as intended
if __name__ == "__main__":
dm = WhisperDataset(data_dir="data/")
dm.setup()
from tqdm import tqdm
for batch in tqdm(dm.train_dataloader()):
pass

View File

@@ -0,0 +1,34 @@
import torch
import torchvision
from torch import nn
import pytorch_lightning as pl
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
from transformers import WhisperForConditionalGeneration
class WhisperFinetuning(pl.LightningModule):
def __init__(self, lr, whisper_model="tiny"):
super().__init__()
self.lr = lr
self.model = WhisperForConditionalGeneration.from_pretrained(f"openai/whisper-{whisper_model}")
self.model.config.forced_decoder_ids = None
self.model.config.suppress_tokens = []
def training_step(self, batch, batch_idx):
encoder_input = batch[0]["input_features"]
decoder_labels = batch[1]["labels"]
out = self.model(
input_features=encoder_input,
labels=decoder_labels,
)
loss = out["loss"]
return loss
def configure_optimizers(self):
optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
return optimizer
if __name__ == "__main__":
pass

View File

@@ -0,0 +1,9 @@
Goal: rewrite the HuggingFace Whisper finetuning code to use PyTorch Lightning
1. load the dataset using a Lightning DataModule
* either integrate HuggingFace data loading, or write it ourselves and wrap it in a LightningDataModule
2. load the model using a LightningModule
3. train the model using the Lightning Trainer
(4. see if we can use sharded training with the Lightning Trainer to finetune a larger Whisper model
that wouldn't fit on a single GPU; see the sketch below)
End goal: Finetune the model on our own dataset for some cool application
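
A minimal sketch of point 4, assuming the WhisperFinetuning module and WhisperDataset datamodule defined in this folder; the DeepSpeed stage/offload settings below are assumptions, not a tested recipe.

import pytorch_lightning as pl
from pytorch_lightning.strategies import DeepSpeedStrategy
from model import WhisperFinetuning
from dataset import WhisperDataset

model = WhisperFinetuning(lr=1e-5, whisper_model="small")
dm = WhisperDataset(data_dir="data/", batch_size=8, num_workers=4, whisper_model="small")

trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu",
    devices=2,
    precision=16,
    # ZeRO stage 2 shards optimizer states and gradients across the GPUs,
    # which is what would let a larger Whisper variant fit.
    strategy=DeepSpeedStrategy(stage=2, offload_optimizer=True),
)
trainer.fit(model, dm)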

View File

@@ -0,0 +1,7 @@
from transformers import WhisperTokenizer
tokenizer = WhisperTokenizer.from_pretrained(
f"openai/whisper-tiny", task="transcribe"
)
encoded_string = tokenizer.encode("")[0]
print(encoded_string) # should print 50258
print(tokenizer.bos_token_id) # should print 50257
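
# Added follow-up sketch (not in the original check): decoding the ids back to tokens
# makes it clearer what encode("") actually produces, i.e. the special prefix tokens
# (start-of-transcript, task, and so on) and, typically, a trailing end-of-text token.
# This is why the data collator strips the first label column when it already equals
# that prefix token.
print(tokenizer.convert_ids_to_tokens(tokenizer.encode("")))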

View File

@@ -0,0 +1,31 @@
import torch
import torchvision.datasets as datasets # Standard datasets
from tqdm import tqdm
from torch import nn, optim
from torchvision import transforms
from torchvision.utils import save_image
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from model import WhisperFinetuning
from dataset import WhisperDataset
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.strategies import DeepSpeedStrategy
torch.set_float32_matmul_precision("medium")
# things to add
lr = 1e-5
batch_size = 32
num_workers = 4
model = WhisperFinetuning(lr)
dm = WhisperDataset(data_dir="data/", batch_size=batch_size, num_workers=num_workers)
if __name__ == "__main__":
trainer = pl.Trainer(
max_epochs=1000,
accelerator="gpu",
devices=[0],
precision=16,
)
trainer.fit(model, dm)

View File

@@ -0,0 +1,181 @@
import evaluate
from transformers import Seq2SeqTrainer
from transformers import WhisperForConditionalGeneration
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
from datasets import load_dataset, DatasetDict, Audio
# set so we can only see the first cuda device
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
common_voice = DatasetDict()
common_voice["train"] = load_dataset(
"mozilla-foundation/common_voice_11_0",
"sv-SE",
split="train+validation",
use_auth_token=False,
)
common_voice["test"] = load_dataset(
"mozilla-foundation/common_voice_11_0",
"sv-SE",
split="test",
use_auth_token=False,
)
# common_voice = common_voice.remove_columns(
# [
# "accent",
# "age",
# "client_id",
# "down_votes",
# "gender",
# "locale",
# "path",
# "segment",
# "up_votes",
# ]
# )
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
tokenizer = WhisperTokenizer.from_pretrained(
"openai/whisper-tiny", language="sv", task="transcribe"
)
input_str = common_voice["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)
print(f"Input: {input_str}")
print(f"Decoded w/ special: {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal: {input_str == decoded_str}")
# use the same checkpoint size as the feature extractor, tokenizer, and model above
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-tiny", language="sv", task="transcribe"
)
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
def prepare_dataset(example):
# load and resample audio data from 48 to 16kHz
audio = example["audio"]
# compute log-Mel input features from input audio array
example["input_features"] = feature_extractor(
audio["array"], sampling_rate=audio["sampling_rate"]
).input_features[0]
# encode target text to label ids
example["labels"] = tokenizer(example["sentence"]).input_ids
return example
common_voice = common_voice.map(prepare_dataset, num_proc=8)
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
processor: Any
def __call__(
self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
) -> Dict[str, torch.Tensor]:
# split inputs and labels since they have to be of different lengths
# and need different padding methods first treat the audio inputs by
# simply returning torch tensors
input_features = [
{"input_features": feature["input_features"]} for feature in features
]
batch = self.processor.feature_extractor.pad(
input_features, return_tensors="pt"
)
# get the tokenized label sequences
label_features = [{"input_ids": feature["labels"]} for feature in features]
# pad the labels to max length
labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
# replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(
labels_batch.attention_mask.ne(1), -100
)
# if bos token is appended in previous tokenization step,
# cut bos token here as it's append later anyways
if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
labels = labels[:, 1:]
batch["labels"] = labels
return batch
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
metric = evaluate.load("wer")
def compute_metrics(pred):
pred_ids = pred.predictions
label_ids = pred.label_ids
# replace -100 with the pad_token_id
label_ids[label_ids == -100] = tokenizer.pad_token_id
# we do not want to group tokens when computing the metrics
pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
wer = 100 * metric.compute(predictions=pred_str, references=label_str)
return {"wer": wer}
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
from transformers import Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
output_dir="./whisper-tiny-swedish", # change to a repo name of your choice
per_device_train_batch_size=32,
gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size
learning_rate=1e-5,
warmup_steps=500,
max_steps=4000,
gradient_checkpointing=False,
fp16=True,
evaluation_strategy="steps",
per_device_eval_batch_size=8,
predict_with_generate=True,
generation_max_length=225,
save_steps=1000,
eval_steps=1000,
logging_steps=25,
report_to=["tensorboard"],
load_best_model_at_end=True,
metric_for_best_model="wer",
greater_is_better=False,
push_to_hub=False,
dataloader_num_workers=0,
)
trainer = Seq2SeqTrainer(
args=training_args,
model=model,
train_dataset=common_voice["train"],
eval_dataset=common_voice["test"],
data_collator=data_collator,
compute_metrics=compute_metrics,
tokenizer=processor.feature_extractor,
)
trainer.train()
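# Optional post-training sketch (not part of the original script): transcribe one
# held-out clip with the fine-tuned model to eyeball the output. Everything reused
# here (common_voice, feature_extractor, processor, model) is defined above;
# max_new_tokens mirrors the generation_max_length used during evaluation.
sample = common_voice["test"][0]["audio"]
input_features = feature_extractor(
    sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt"
).input_features.to(model.device)
with torch.no_grad():
    predicted_ids = model.generate(input_features, max_new_tokens=225)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])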

View File

@@ -0,0 +1,110 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.utils.data import random_split
class NN(nn.Module):
def __init__(self, input_size, num_classes):
super().__init__()
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, num_classes)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
# Set device cuda for GPU if it's available otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
input_size = 784
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 3
# Load Data
entire_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
train_ds, val_ds = random_split(entire_dataset, [50000, 10000])
test_ds = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_ds, batch_size=batch_size, shuffle=False)
# Initialize network
model = NN(input_size=input_size, num_classes=num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(tqdm(train_loader)):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# Get to correct shape
data = data.reshape(data.shape[0], -1)
# Forward
scores = model(data)
loss = criterion(scores, targets)
# Backward
optimizer.zero_grad()
loss.backward()
# Gradient descent or adam step
optimizer.step()
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
model.eval()
# We don't need to keep track of gradients here so we wrap it in torch.no_grad()
with torch.no_grad():
# Loop through the data
for x, y in loader:
# Move data to device
x = x.to(device=device)
y = y.to(device=device)
# Get to correct shape
x = x.reshape(x.shape[0], -1)
# Forward pass
scores = model(x)
_, predictions = scores.max(1)
# Check how many we got correct
num_correct += (predictions == y).sum()
# Keep track of number of samples
num_samples += predictions.size(0)
model.train()
return num_correct / num_samples
# Check accuracy on training & test to see how good our model is
model.to(device)
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
print(f"Accuracy on validation set: {check_accuracy(val_loader, model)*100:.2f}")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")

View File

@@ -0,0 +1,12 @@
from pytorch_lightning.callbacks import EarlyStopping, Callback
class MyPrintingCallback(Callback):
def __init__(self):
super().__init__()
def on_train_start(self, trainer, pl_module):
print("Starting to train!")
def on_train_end(self, trainer, pl_module):
print("Training is done.")

View File

@@ -0,0 +1,15 @@
# Training hyperparameters
INPUT_SIZE = 784
NUM_CLASSES = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_EPOCHS = 3
# Dataset
DATA_DIR = "dataset/"
NUM_WORKERS = 4
# Compute related
ACCELERATOR = "gpu"
DEVICES = [0, 1]
PRECISION = 16

View File

@@ -0,0 +1,64 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import pytorch_lightning as pl
from torchvision.transforms import RandomHorizontalFlip, RandomVerticalFlip
class MnistDataModule(pl.LightningDataModule):
def __init__(self, data_dir, batch_size, num_workers):
super().__init__()
self.data_dir = data_dir
self.batch_size = batch_size
self.num_workers = num_workers
def prepare_data(self):
datasets.MNIST(self.data_dir, train=True, download=True)
datasets.MNIST(self.data_dir, train=False, download=True)
def setup(self, stage):
entire_dataset = datasets.MNIST(
root=self.data_dir,
train=True,
transform=transforms.Compose([
transforms.RandomVerticalFlip(),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
]),
download=False,
)
self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
self.test_ds = datasets.MNIST(
root=self.data_dir,
train=False,
transform=transforms.ToTensor(),
download=False,
)
def train_dataloader(self):
return DataLoader(
self.train_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=True,
)
def val_dataloader(self):
return DataLoader(
self.val_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=False,
)
def test_dataloader(self):
return DataLoader(
self.test_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=False,
)

View File

@@ -0,0 +1,89 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import pytorch_lightning as pl
import torchmetrics
from torchmetrics import Metric
import torchvision
class NN(pl.LightningModule):
def __init__(self, input_size, learning_rate, num_classes):
super().__init__()
self.lr = learning_rate
        # Note: the 1M-unit hidden layer looks intentional here, presumably to give
        # the DeepSpeed/multi-GPU setup in train.py something sizeable to shard.
        self.fc1 = nn.Linear(input_size, 1_000_000)
        self.fc2 = nn.Linear(1_000_000, num_classes)
self.loss_fn = nn.CrossEntropyLoss()
self.accuracy = torchmetrics.Accuracy(
task="multiclass", num_classes=num_classes
)
self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
def training_step(self, batch, batch_idx):
x, y = batch
loss, scores, y = self._common_step(batch, batch_idx)
self.log_dict(
{
"train_loss": loss,
},
on_step=False,
on_epoch=True,
prog_bar=True,
)
if batch_idx % 100 == 0:
x = x[:8]
grid = torchvision.utils.make_grid(x.view(-1, 1, 28, 28))
self.logger.experiment.add_image("mnist_images", grid, self.global_step)
return {"loss": loss, "scores": scores, "y": y}
def training_epoch_end(self, outputs):
scores = torch.cat([x["scores"] for x in outputs])
y = torch.cat([x["y"] for x in outputs])
self.log_dict(
{
"train_acc": self.accuracy(scores, y),
"train_f1": self.f1_score(scores, y),
},
on_step=False,
on_epoch=True,
prog_bar=True,
)
def validation_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log("val_loss", loss)
return loss
def test_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log("test_loss", loss)
return loss
def _common_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
loss = self.loss_fn(scores, y)
return loss, scores, y
def predict_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
preds = torch.argmax(scores, dim=1)
return preds
def configure_optimizers(self):
return optim.Adam(self.parameters(), lr=self.lr)

View File

@@ -0,0 +1,43 @@
import torch
import pytorch_lightning as pl
from model import NN
from dataset import MnistDataModule
import config
from callbacks import MyPrintingCallback, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.profilers import PyTorchProfiler
from pytorch_lightning.strategies import DeepSpeedStrategy
torch.set_float32_matmul_precision("medium") # to make lightning happy
if __name__ == "__main__":
logger = TensorBoardLogger("tb_logs", name="mnist_model_v1")
strategy = DeepSpeedStrategy()
profiler = PyTorchProfiler(
on_trace_ready=torch.profiler.tensorboard_trace_handler("tb_logs/profiler0"),
schedule=torch.profiler.schedule(skip_first=10, wait=1, warmup=1, active=20),
)
model = NN(
input_size=config.INPUT_SIZE,
learning_rate=config.LEARNING_RATE,
num_classes=config.NUM_CLASSES,
)
dm = MnistDataModule(
data_dir=config.DATA_DIR,
batch_size=config.BATCH_SIZE,
num_workers=config.NUM_WORKERS,
)
trainer = pl.Trainer(
strategy=strategy,
profiler=profiler,
logger=logger,
accelerator=config.ACCELERATOR,
devices=config.DEVICES,
min_epochs=1,
max_epochs=config.NUM_EPOCHS,
precision=config.PRECISION,
callbacks=[MyPrintingCallback(), EarlyStopping(monitor="val_loss")],
)
trainer.fit(model, dm)
trainer.validate(model, dm)
trainer.test(model, dm)
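# Launch note (assumption about the local setup, not part of the original file):
# this script expects the deepspeed package to be installed and the two GPUs
# listed in config.DEVICES; Lightning spawns the worker processes itself, so a
# plain `python train.py` is enough.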

View File

@@ -0,0 +1,154 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.utils.data import random_split
import pytorch_lightning as pl
class NN(nn.Module):
def __init__(self, input_size, num_classes):
super().__init__()
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, num_classes)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
# Lightning version of the same network; this second definition shadows the plain
# nn.Module class above and is the one actually used below.
class NN(pl.LightningModule):
def __init__(self, input_size, num_classes):
super().__init__()
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, num_classes)
self.loss_fn = nn.CrossEntropyLoss()
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
def training_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log('train_loss', loss)
return loss
def validation_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log('val_loss', loss)
return loss
def test_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log('test_loss', loss)
return loss
def _common_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
loss = self.loss_fn(scores, y)
return loss, scores, y
def predict_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
preds = torch.argmax(scores, dim=1)
return preds
def configure_optimizers(self):
return optim.Adam(self.parameters(), lr=0.001)
# Set device cuda for GPU if it's available otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
input_size = 784
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 3
# Load Data
entire_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
train_ds, val_ds = random_split(entire_dataset, [50000, 10000])
test_ds = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_ds, batch_size=batch_size, shuffle=False)
# Initialize network
model = NN(input_size=input_size, num_classes=num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(tqdm(train_loader)):
# Get data to cuda if possible
data = data.to(device=device)
targets = targets.to(device=device)
# Get to correct shape
data = data.reshape(data.shape[0], -1)
# Forward
scores = model(data)
loss = criterion(scores, targets)
# Backward
optimizer.zero_grad()
loss.backward()
# Gradient descent or adam step
optimizer.step()
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
model.eval()
# We don't need to keep track of gradients here so we wrap it in torch.no_grad()
with torch.no_grad():
# Loop through the data
for x, y in loader:
# Move data to device
x = x.to(device=device)
y = y.to(device=device)
# Get to correct shape
x = x.reshape(x.shape[0], -1)
# Forward pass
scores = model(x)
_, predictions = scores.max(1)
# Check how many we got correct
num_correct += (predictions == y).sum()
# Keep track of number of samples
num_samples += predictions.size(0)
model.train()
return num_correct / num_samples
# Check accuracy on training & test to see how good our model is
model.to(device)
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
print(f"Accuracy on validation set: {check_accuracy(val_loader, model)*100:.2f}")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")

View File

@@ -0,0 +1,126 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.utils.data import random_split
import pytorch_lightning as pl
class NN(pl.LightningModule):
def __init__(self, input_size, num_classes):
super().__init__()
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, num_classes)
self.loss_fn = nn.CrossEntropyLoss()
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
def training_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log('train_loss', loss)
return loss
def validation_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log('val_loss', loss)
return loss
def test_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log('test_loss', loss)
return loss
def _common_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
loss = self.loss_fn(scores, y)
return loss, scores, y
def predict_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
preds = torch.argmax(scores, dim=1)
return preds
def configure_optimizers(self):
return optim.Adam(self.parameters(), lr=0.001)
# Set device cuda for GPU if it's available otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
input_size = 784
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 3
# Load Data
entire_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
train_ds, val_ds = random_split(entire_dataset, [50000, 10000])
test_ds = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_ds, batch_size=batch_size, shuffle=False)
# Initialize network
model = NN(input_size=input_size, num_classes=num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
trainer = pl.Trainer(accelerator="gpu", devices=1, min_epochs=1, max_epochs=3, precision=16)
trainer.fit(model, train_loader, val_loader)
trainer.validate(model, val_loader)
trainer.test(model, test_loader)
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
model.eval()
# We don't need to keep track of gradients here so we wrap it in torch.no_grad()
with torch.no_grad():
# Loop through the data
for x, y in loader:
# Move data to device
x = x.to(device=device)
y = y.to(device=device)
# Get to correct shape
x = x.reshape(x.shape[0], -1)
# Forward pass
scores = model(x)
_, predictions = scores.max(1)
# Check how many we got correct
num_correct += (predictions == y).sum()
# Keep track of number of samples
num_samples += predictions.size(0)
model.train()
return num_correct / num_samples
# Check accuracy on training & test to see how good our model is
model.to(device)
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
print(f"Accuracy on validation set: {check_accuracy(val_loader, model)*100:.2f}")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")

View File

@@ -0,0 +1,150 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.utils.data import random_split
import pytorch_lightning as pl
import torchmetrics
from torchmetrics import Metric
class MyAccuracy(Metric):
def __init__(self):
super().__init__()
self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum")
self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum")
def update(self, preds, target):
preds = torch.argmax(preds, dim=1)
assert preds.shape == target.shape
self.correct += torch.sum(preds == target)
self.total += target.numel()
def compute(self):
return self.correct.float() / self.total.float()
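# Tiny sanity check for the custom metric (made-up tensors, not MNIST batches):
# acc = MyAccuracy()
# acc.update(torch.tensor([[0.9, 0.1], [0.2, 0.8]]), torch.tensor([0, 0]))
# acc.compute()  # tensor(0.5000): argmax picks classes [0, 1], one of two matches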
class NN(pl.LightningModule):
def __init__(self, input_size, num_classes):
super().__init__()
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, num_classes)
self.loss_fn = nn.CrossEntropyLoss()
self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
self.my_accuracy = MyAccuracy()
self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
def training_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
accuracy = self.my_accuracy(scores, y)
f1_score = self.f1_score(scores, y)
self.log_dict({'train_loss': loss, 'train_accuracy': accuracy, 'train_f1_score': f1_score},
on_step=False, on_epoch=True, prog_bar=True)
return {'loss': loss, "scores": scores, "y": y}
def validation_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log('val_loss', loss)
return loss
def test_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log('test_loss', loss)
return loss
def _common_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
loss = self.loss_fn(scores, y)
return loss, scores, y
def predict_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
preds = torch.argmax(scores, dim=1)
return preds
def configure_optimizers(self):
return optim.Adam(self.parameters(), lr=0.001)
# Set device cuda for GPU if it's available otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
input_size = 784
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 3
# Load Data
entire_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
train_ds, val_ds = random_split(entire_dataset, [50000, 10000])
test_ds = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_ds, batch_size=batch_size, shuffle=False)
# Initialize network
model = NN(input_size=input_size, num_classes=num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
trainer = pl.Trainer(accelerator="gpu", devices=1, min_epochs=1, max_epochs=3, precision=16)
trainer.fit(model, train_loader, val_loader)
trainer.validate(model, val_loader)
trainer.test(model, test_loader)
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
model.eval()
# We don't need to keep track of gradients here so we wrap it in torch.no_grad()
with torch.no_grad():
# Loop through the data
for x, y in loader:
# Move data to device
x = x.to(device=device)
y = y.to(device=device)
# Get to correct shape
x = x.reshape(x.shape[0], -1)
# Forward pass
scores = model(x)
_, predictions = scores.max(1)
# Check how many we got correct
num_correct += (predictions == y).sum()
# Keep track of number of samples
num_samples += predictions.size(0)
model.train()
return num_correct / num_samples
# Check accuracy on training & test to see how good our model is
model.to(device)
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
print(f"Accuracy on validation set: {check_accuracy(val_loader, model)*100:.2f}")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")

View File

@@ -0,0 +1,146 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.utils.data import random_split
import pytorch_lightning as pl
import torchmetrics
from torchmetrics import Metric
class MyAccuracy(Metric):
def __init__(self):
super().__init__()
self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum")
self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum")
def update(self, preds, target):
preds = torch.argmax(preds, dim=1)
assert preds.shape == target.shape
self.correct += torch.sum(preds == target)
self.total += target.numel()
def compute(self):
return self.correct.float() / self.total.float()
class NN(pl.LightningModule):
def __init__(self, input_size, num_classes):
super().__init__()
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, num_classes)
self.loss_fn = nn.CrossEntropyLoss()
self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
self.my_accuracy = MyAccuracy()
self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
def training_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
accuracy = self.my_accuracy(scores, y)
f1_score = self.f1_score(scores, y)
self.log_dict({'train_loss': loss, 'train_accuracy': accuracy, 'train_f1_score': f1_score},
on_step=False, on_epoch=True, prog_bar=True)
return {'loss': loss, "scores": scores, "y": y}
def validation_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log('val_loss', loss)
return loss
def test_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log('test_loss', loss)
return loss
def _common_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
loss = self.loss_fn(scores, y)
return loss, scores, y
def predict_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
preds = torch.argmax(scores, dim=1)
return preds
def configure_optimizers(self):
return optim.Adam(self.parameters(), lr=0.001)
class MnistDataModule(pl.LightningDataModule):
def __init__(self, data_dir, batch_size, num_workers):
super().__init__()
self.data_dir = data_dir
self.batch_size = batch_size
self.num_workers = num_workers
def prepare_data(self):
datasets.MNIST(self.data_dir, train=True, download=True)
datasets.MNIST(self.data_dir, train=False, download=True)
def setup(self, stage):
entire_dataset = datasets.MNIST(
root=self.data_dir,
train=True,
transform=transforms.ToTensor(),
download=False,
)
self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
self.test_ds = datasets.MNIST(
root=self.data_dir,
train=False,
transform=transforms.ToTensor(),
download=False,
)
def train_dataloader(self):
return DataLoader(
self.train_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=True,
)
def val_dataloader(self):
return DataLoader(
self.val_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=False,
)
def test_dataloader(self):
return DataLoader(
self.test_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=False,
)
# Set device cuda for GPU if it's available otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
input_size = 784
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 3
model = NN(input_size=input_size, num_classes=num_classes)
dm = MnistDataModule(data_dir="dataset/", batch_size=batch_size, num_workers=4)
trainer = pl.Trainer(accelerator="gpu", devices=1, min_epochs=1, max_epochs=3, precision=16)
trainer.fit(model, dm)
trainer.validate(model, dm)
trainer.test(model, dm)

View File

@@ -0,0 +1,15 @@
# Training hyperparameters
INPUT_SIZE = 784
NUM_CLASSES = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_EPOCHS = 3
# Dataset
DATA_DIR = "dataset/"
NUM_WORKERS = 4
# Compute related
ACCELERATOR = "gpu"
DEVICES = [0]
PRECISION = 16

View File

@@ -0,0 +1,59 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import pytorch_lightning as pl
class MnistDataModule(pl.LightningDataModule):
def __init__(self, data_dir, batch_size, num_workers):
super().__init__()
self.data_dir = data_dir
self.batch_size = batch_size
self.num_workers = num_workers
def prepare_data(self):
datasets.MNIST(self.data_dir, train=True, download=True)
datasets.MNIST(self.data_dir, train=False, download=True)
def setup(self, stage):
entire_dataset = datasets.MNIST(
root=self.data_dir,
train=True,
transform=transforms.ToTensor(),
download=False,
)
self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
self.test_ds = datasets.MNIST(
root=self.data_dir,
train=False,
transform=transforms.ToTensor(),
download=False,
)
def train_dataloader(self):
return DataLoader(
self.train_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=True,
)
def val_dataloader(self):
return DataLoader(
self.val_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=False,
)
def test_dataloader(self):
return DataLoader(
self.test_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=False,
)

View File

@@ -0,0 +1,71 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import pytorch_lightning as pl
import torchmetrics
from torchmetrics import Metric
class NN(pl.LightningModule):
def __init__(self, input_size, learning_rate, num_classes):
super().__init__()
self.lr = learning_rate
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, num_classes)
self.loss_fn = nn.CrossEntropyLoss()
self.accuracy = torchmetrics.Accuracy(
task="multiclass", num_classes=num_classes
)
self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
def training_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
accuracy = self.accuracy(scores, y)
f1_score = self.f1_score(scores, y)
self.log_dict(
{
"train_loss": loss,
"train_accuracy": accuracy,
"train_f1_score": f1_score,
},
on_step=False,
on_epoch=True,
prog_bar=True,
)
return {"loss": loss, "scores": scores, "y": y}
def validation_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log("val_loss", loss)
return loss
def test_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log("test_loss", loss)
return loss
def _common_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
loss = self.loss_fn(scores, y)
return loss, scores, y
def predict_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
preds = torch.argmax(scores, dim=1)
return preds
def configure_optimizers(self):
return optim.Adam(self.parameters(), lr=self.lr)

View File

@@ -0,0 +1,27 @@
import torch
import pytorch_lightning as pl
from model import NN
from dataset import MnistDataModule
import config
if __name__ == "__main__":
model = NN(
input_size=config.INPUT_SIZE,
learning_rate=config.LEARNING_RATE,
num_classes=config.NUM_CLASSES,
)
dm = MnistDataModule(
data_dir=config.DATA_DIR,
batch_size=config.BATCH_SIZE,
num_workers=config.NUM_WORKERS,
)
trainer = pl.Trainer(
accelerator=config.ACCELERATOR,
devices=config.DEVICES,
min_epochs=1,
max_epochs=3,
precision=config.PRECISION,
)
trainer.fit(model, dm)
trainer.validate(model, dm)
trainer.test(model, dm)

View File

@@ -0,0 +1,12 @@
from pytorch_lightning.callbacks import EarlyStopping, Callback
class MyPrintingCallback(Callback):
def __init__(self):
super().__init__()
def on_train_start(self, trainer, pl_module):
print("Starting to train!")
def on_train_end(self, trainer, pl_module):
print("Training is done.")

View File

@@ -0,0 +1,15 @@
# Training hyperparameters
INPUT_SIZE = 784
NUM_CLASSES = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_EPOCHS = 1000
# Dataset
DATA_DIR = "dataset/"
NUM_WORKERS = 4
# Compute related
ACCELERATOR = "gpu"
DEVICES = [0]
PRECISION = 16

View File

@@ -0,0 +1,59 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import pytorch_lightning as pl
class MnistDataModule(pl.LightningDataModule):
def __init__(self, data_dir, batch_size, num_workers):
super().__init__()
self.data_dir = data_dir
self.batch_size = batch_size
self.num_workers = num_workers
def prepare_data(self):
datasets.MNIST(self.data_dir, train=True, download=True)
datasets.MNIST(self.data_dir, train=False, download=True)
def setup(self, stage):
entire_dataset = datasets.MNIST(
root=self.data_dir,
train=True,
transform=transforms.ToTensor(),
download=False,
)
self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
self.test_ds = datasets.MNIST(
root=self.data_dir,
train=False,
transform=transforms.ToTensor(),
download=False,
)
def train_dataloader(self):
return DataLoader(
self.train_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=True,
)
def val_dataloader(self):
return DataLoader(
self.val_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=False,
)
def test_dataloader(self):
return DataLoader(
self.test_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=False,
)

View File

@@ -0,0 +1,71 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import pytorch_lightning as pl
import torchmetrics
from torchmetrics import Metric
class NN(pl.LightningModule):
def __init__(self, input_size, learning_rate, num_classes):
super().__init__()
self.lr = learning_rate
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, num_classes)
self.loss_fn = nn.CrossEntropyLoss()
self.accuracy = torchmetrics.Accuracy(
task="multiclass", num_classes=num_classes
)
self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
def training_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
accuracy = self.accuracy(scores, y)
f1_score = self.f1_score(scores, y)
self.log_dict(
{
"train_loss": loss,
"train_accuracy": accuracy,
"train_f1_score": f1_score,
},
on_step=False,
on_epoch=True,
prog_bar=True,
)
return {"loss": loss, "scores": scores, "y": y}
def validation_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log("val_loss", loss)
return loss
def test_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log("test_loss", loss)
return loss
def _common_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
loss = self.loss_fn(scores, y)
return loss, scores, y
def predict_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
preds = torch.argmax(scores, dim=1)
return preds
def configure_optimizers(self):
return optim.Adam(self.parameters(), lr=self.lr)

View File

@@ -0,0 +1,31 @@
import torch
import pytorch_lightning as pl
from model import NN
from dataset import MnistDataModule
import config
from callbacks import MyPrintingCallback, EarlyStopping
torch.set_float32_matmul_precision("medium") # to make lightning happy
if __name__ == "__main__":
model = NN(
input_size=config.INPUT_SIZE,
learning_rate=config.LEARNING_RATE,
num_classes=config.NUM_CLASSES,
)
dm = MnistDataModule(
data_dir=config.DATA_DIR,
batch_size=config.BATCH_SIZE,
num_workers=config.NUM_WORKERS,
)
trainer = pl.Trainer(
accelerator=config.ACCELERATOR,
devices=config.DEVICES,
min_epochs=1,
max_epochs=config.NUM_EPOCHS,
precision=config.PRECISION,
callbacks=[MyPrintingCallback(), EarlyStopping(monitor="val_loss")],
)
trainer.fit(model, dm)
trainer.validate(model, dm)
trainer.test(model, dm)

View File

@@ -0,0 +1,12 @@
from pytorch_lightning.callbacks import EarlyStopping, Callback
class MyPrintingCallback(Callback):
def __init__(self):
super().__init__()
def on_train_start(self, trainer, pl_module):
print("Starting to train!")
def on_train_end(self, trainer, pl_module):
print("Training is done.")

View File

@@ -0,0 +1,15 @@
# Training hyperparameters
INPUT_SIZE = 784
NUM_CLASSES = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_EPOCHS = 1000
# Dataset
DATA_DIR = "dataset/"
NUM_WORKERS = 4
# Compute related
ACCELERATOR = "gpu"
DEVICES = [0]
PRECISION = 16

View File

@@ -0,0 +1,64 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import pytorch_lightning as pl
from torchvision.transforms import RandomHorizontalFlip, RandomVerticalFlip
class MnistDataModule(pl.LightningDataModule):
def __init__(self, data_dir, batch_size, num_workers):
super().__init__()
self.data_dir = data_dir
self.batch_size = batch_size
self.num_workers = num_workers
def prepare_data(self):
datasets.MNIST(self.data_dir, train=True, download=True)
datasets.MNIST(self.data_dir, train=False, download=True)
def setup(self, stage):
entire_dataset = datasets.MNIST(
root=self.data_dir,
train=True,
transform=transforms.Compose([
transforms.RandomVerticalFlip(),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
]),
download=False,
)
self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
self.test_ds = datasets.MNIST(
root=self.data_dir,
train=False,
transform=transforms.ToTensor(),
download=False,
)
def train_dataloader(self):
return DataLoader(
self.train_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=True,
)
def val_dataloader(self):
return DataLoader(
self.val_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=False,
)
def test_dataloader(self):
return DataLoader(
self.test_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=False,
)

View File

@@ -0,0 +1,79 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import pytorch_lightning as pl
import torchmetrics
from torchmetrics import Metric
import torchvision
class NN(pl.LightningModule):
def __init__(self, input_size, learning_rate, num_classes):
super().__init__()
self.lr = learning_rate
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, num_classes)
self.loss_fn = nn.CrossEntropyLoss()
self.accuracy = torchmetrics.Accuracy(
task="multiclass", num_classes=num_classes
)
self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
def training_step(self, batch, batch_idx):
x, y = batch
loss, scores, y = self._common_step(batch, batch_idx)
accuracy = self.accuracy(scores, y)
f1_score = self.f1_score(scores, y)
self.log_dict(
{
"train_loss": loss,
"train_accuracy": accuracy,
"train_f1_score": f1_score,
},
on_step=False,
on_epoch=True,
prog_bar=True,
)
if batch_idx % 100 == 0:
x = x[:8]
grid = torchvision.utils.make_grid(x.view(-1, 1, 28, 28))
self.logger.experiment.add_image("mnist_images", grid, self.global_step)
return {"loss": loss, "scores": scores, "y": y}
def validation_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log("val_loss", loss)
return loss
def test_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log("test_loss", loss)
return loss
def _common_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
loss = self.loss_fn(scores, y)
return loss, scores, y
def predict_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
preds = torch.argmax(scores, dim=1)
return preds
def configure_optimizers(self):
return optim.Adam(self.parameters(), lr=self.lr)

View File

@@ -0,0 +1,34 @@
import torch
import pytorch_lightning as pl
from model import NN
from dataset import MnistDataModule
import config
from callbacks import MyPrintingCallback, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
torch.set_float32_matmul_precision("medium") # to make lightning happy
if __name__ == "__main__":
logger = TensorBoardLogger("tb_logs", name="mnist_model_v0")
model = NN(
input_size=config.INPUT_SIZE,
learning_rate=config.LEARNING_RATE,
num_classes=config.NUM_CLASSES,
)
dm = MnistDataModule(
data_dir=config.DATA_DIR,
batch_size=config.BATCH_SIZE,
num_workers=config.NUM_WORKERS,
)
trainer = pl.Trainer(
logger=logger,
accelerator=config.ACCELERATOR,
devices=config.DEVICES,
min_epochs=1,
max_epochs=config.NUM_EPOCHS,
precision=config.PRECISION,
callbacks=[MyPrintingCallback(), EarlyStopping(monitor="val_loss")],
)
trainer.fit(model, dm)
trainer.validate(model, dm)
trainer.test(model, dm)

View File

@@ -0,0 +1,12 @@
from pytorch_lightning.callbacks import EarlyStopping, Callback
class MyPrintingCallback(Callback):
def __init__(self):
super().__init__()
def on_train_start(self, trainer, pl_module):
print("Starting to train!")
def on_train_end(self, trainer, pl_module):
print("Training is done.")

View File

@@ -0,0 +1,15 @@
# Training hyperparameters
INPUT_SIZE = 784
NUM_CLASSES = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_EPOCHS = 3
# Dataset
DATA_DIR = "dataset/"
NUM_WORKERS = 4
# Compute related
ACCELERATOR = "gpu"
DEVICES = [0]
PRECISION = 16

View File

@@ -0,0 +1,64 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import pytorch_lightning as pl
from torchvision.transforms import RandomHorizontalFlip, RandomVerticalFlip
class MnistDataModule(pl.LightningDataModule):
def __init__(self, data_dir, batch_size, num_workers):
super().__init__()
self.data_dir = data_dir
self.batch_size = batch_size
self.num_workers = num_workers
def prepare_data(self):
datasets.MNIST(self.data_dir, train=True, download=True)
datasets.MNIST(self.data_dir, train=False, download=True)
def setup(self, stage):
entire_dataset = datasets.MNIST(
root=self.data_dir,
train=True,
transform=transforms.Compose([
transforms.RandomVerticalFlip(),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
]),
download=False,
)
self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
self.test_ds = datasets.MNIST(
root=self.data_dir,
train=False,
transform=transforms.ToTensor(),
download=False,
)
def train_dataloader(self):
return DataLoader(
self.train_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=True,
)
def val_dataloader(self):
return DataLoader(
self.val_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=False,
)
def test_dataloader(self):
return DataLoader(
self.test_ds,
batch_size=self.batch_size,
num_workers=self.num_workers,
shuffle=False,
)

View File

@@ -0,0 +1,89 @@
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import pytorch_lightning as pl
import torchmetrics
from torchmetrics import Metric
import torchvision
class NN(pl.LightningModule):
def __init__(self, input_size, learning_rate, num_classes):
super().__init__()
self.lr = learning_rate
self.fc1 = nn.Linear(input_size, 50)
self.fc2 = nn.Linear(50, num_classes)
self.loss_fn = nn.CrossEntropyLoss()
self.accuracy = torchmetrics.Accuracy(
task="multiclass", num_classes=num_classes
)
self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
def training_step(self, batch, batch_idx):
x, y = batch
loss, scores, y = self._common_step(batch, batch_idx)
self.log_dict(
{
"train_loss": loss,
},
on_step=False,
on_epoch=True,
prog_bar=True,
)
if batch_idx % 100 == 0:
x = x[:8]
grid = torchvision.utils.make_grid(x.view(-1, 1, 28, 28))
self.logger.experiment.add_image("mnist_images", grid, self.global_step)
return {"loss": loss, "scores": scores, "y": y}
def training_epoch_end(self, outputs):
scores = torch.cat([x["scores"] for x in outputs])
y = torch.cat([x["y"] for x in outputs])
self.log_dict(
{
"train_acc": self.accuracy(scores, y),
"train_f1": self.f1_score(scores, y),
},
on_step=False,
on_epoch=True,
prog_bar=True,
)
def validation_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log("val_loss", loss)
return loss
def test_step(self, batch, batch_idx):
loss, scores, y = self._common_step(batch, batch_idx)
self.log("test_loss", loss)
return loss
def _common_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
loss = self.loss_fn(scores, y)
return loss, scores, y
def predict_step(self, batch, batch_idx):
x, y = batch
x = x.reshape(x.size(0), -1)
scores = self.forward(x)
preds = torch.argmax(scores, dim=1)
return preds
def configure_optimizers(self):
return optim.Adam(self.parameters(), lr=self.lr)

View File

@@ -0,0 +1,40 @@
import torch
import pytorch_lightning as pl
from model import NN
from dataset import MnistDataModule
import config
from callbacks import MyPrintingCallback, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.profilers import PyTorchProfiler
torch.set_float32_matmul_precision("medium") # to make lightning happy
if __name__ == "__main__":
logger = TensorBoardLogger("tb_logs", name="mnist_model_v1")
profiler = PyTorchProfiler(
on_trace_ready=torch.profiler.tensorboard_trace_handler("tb_logs/profiler0"),
schedule=torch.profiler.schedule(skip_first=10, wait=1, warmup=1, active=20),
)
model = NN(
input_size=config.INPUT_SIZE,
learning_rate=config.LEARNING_RATE,
num_classes=config.NUM_CLASSES,
)
dm = MnistDataModule(
data_dir=config.DATA_DIR,
batch_size=config.BATCH_SIZE,
num_workers=config.NUM_WORKERS,
)
trainer = pl.Trainer(
profiler=profiler,
logger=logger,
accelerator=config.ACCELERATOR,
devices=config.DEVICES,
min_epochs=1,
max_epochs=config.NUM_EPOCHS,
precision=config.PRECISION,
callbacks=[MyPrintingCallback(), EarlyStopping(monitor="val_loss")],
)
trainer.fit(model, dm)
trainer.validate(model, dm)
trainer.test(model, dm)
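# Viewing note (assumption about the local setup): the scalar logs and the profiler
# traces written under tb_logs/ can be inspected with `tensorboard --logdir tb_logs`
# (the trace view needs the torch-tb-profiler plugin installed).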

View File

@@ -0,0 +1,123 @@
"""
Implementation of Neural collaborative filtering (NCF)
Next:
* Understand and use NDCG = Normalized Discounted Cumulative Gain
* Use SVD and compare results
"""
import torch
import pytorch_lightning as pl
import pandas as pd
import torchmetrics
from torch.utils.data import Dataset, DataLoader
from torch import nn
from sklearn.model_selection import train_test_split
torch.set_float32_matmul_precision("medium") # to make lightning happy
class MovieLens(Dataset):
def __init__(self, df_ratings):
self.df_ratings = df_ratings
def __len__(self):
return len(self.df_ratings)
def __getitem__(self, idx):
row = self.df_ratings.iloc[idx]
user_id = torch.tensor(row["user_id"], dtype=torch.long)
movie_id = torch.tensor(row["movie_id"], dtype=torch.long)
rating = torch.tensor(row["rating"], dtype=torch.float)
return user_id, movie_id, rating
class LightningData(pl.LightningDataModule):
def __init__(self, batch_size):
super().__init__()
self.batch_size = batch_size
def prepare_data(self):
self.df_ratings = pd.read_csv(
"data/ratings.dat",
sep="::",
header=None,
names=["user_id", "movie_id", "rating", "timestamp"],
engine="python",
)
# split into train and test
self.df_ratings_train, self.df_ratings_val = train_test_split(
self.df_ratings, test_size=0.2, random_state=42
)
def setup(self, stage=None):
self.dataset_train = MovieLens(self.df_ratings_train)
self.dataset_val = MovieLens(self.df_ratings_val)
def train_dataloader(self):
return DataLoader(self.dataset_train, batch_size=self.batch_size, num_workers=6)
def val_dataloader(self):
        return DataLoader(self.dataset_val, batch_size=self.batch_size, num_workers=2)
class Net(nn.Module):
def __init__(self, n_users, n_movies, n_factors=50):
super().__init__()
self.user_factors = nn.Embedding(n_users, n_factors)
self.movie_factors = nn.Embedding(n_movies, n_factors)
self.lin = nn.Linear(n_factors * 2, 1)
def forward(self, user, movie):
user_embedding = self.user_factors(user)
movie_embedding = self.movie_factors(movie)
x = torch.cat([user_embedding, movie_embedding], dim=1)
return self.lin(x)
class NetLightning(pl.LightningModule):
def __init__(self, n_users, n_movies, n_factors=50, lr=3e-4):
super().__init__()
self.num_users = n_users
self.num_movies = n_movies
self.net = Net(n_users, n_movies, n_factors)
self.loss_fn = nn.MSELoss()
self.MAE = torchmetrics.MeanAbsoluteError()
self.lr = lr
def forward(self, user, movie):
return self.net(user, movie)
def training_step(self, batch, batch_idx):
user, movie, rating = batch
out = self.forward(user, movie)
mae = self.MAE(out.squeeze(1), rating.float())
loss = self.loss_fn(out.squeeze(1), rating.float())
self.log_dict({"train_loss": loss, "train_mae": mae}, on_step=False, on_epoch=True, prog_bar=True)
return loss
def validation_step(self, batch, batch_idx):
user, movie, rating = batch
out = self.forward(user, movie)
mae = self.MAE(out.squeeze(1), rating.float())
loss = self.loss_fn(out.squeeze(1), rating.float())
self.log_dict({"val_loss": loss, "val_mae": mae}, on_step=False, on_epoch=True, prog_bar=True)
    def predict_step(self, user_id):
        # score every movie for a single user: repeat the user id across all movie ids
        movie_ids = torch.arange(0, self.num_movies, device=self.device)
        user_ids = torch.full_like(movie_ids, int(user_id))
        return self.forward(user_ids, movie_ids)
def configure_optimizers(self):
return torch.optim.Adam(self.parameters(), lr=self.lr)
dm = LightningData(batch_size=512)
dm.prepare_data()
dm.setup()
num_movies = dm.df_ratings["movie_id"].max() + 1
num_users = dm.df_ratings["user_id"].max() + 1
model = NetLightning(num_users, num_movies)
trainer = pl.Trainer(accelerator="gpu", devices=1, max_epochs=3)
trainer.fit(model, dm)
trainer.validate(model, dm)
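# Sketch (not part of the original run): score every movie for one user with the
# trained model and read off a top-10 recommendation list. The user id is an
# arbitrary example; ids come straight from ratings.dat as in the dataset above.
model.eval()
with torch.no_grad():
    movie_ids = torch.arange(num_movies, device=model.device)
    user_ids = torch.full_like(movie_ids, 1)  # example user id
    scores = model(user_ids, movie_ids).squeeze(1)
    print(torch.topk(scores, k=10).indices)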