add lightning code, finetuning whisper, recommender system neural collaborative filtering

2026-04-10 12:33:44 +00:00 · 2023-02-21 16:25:42 +01:00
parent c646ef65e2
commit 94f6c024fe
51 changed files with 17977 additions and 25 deletions
--- a/ML/Pytorch/recommender_systems/neural_collaborative_filtering/main.py
+++ b/ML/Pytorch/recommender_systems/neural_collaborative_filtering/main.py
@@ -0,0 +1,123 @@
+""" 
+Implementation of Neural collaborative filtering (NCF)
+Next:
+    * Understand and use NDCG = Normalized Discounted Cumulative Gain
+    * Use SVD and compare results
+"""
+
+import torch
+import pytorch_lightning as pl
+import pandas as pd
+import torchmetrics
+from torch.utils.data import Dataset, DataLoader
+from torch import nn
+from sklearn.model_selection import train_test_split
+
+torch.set_float32_matmul_precision("medium") # to make lightning happy
+
+class MovieLens(Dataset):
+    def __init__(self, df_ratings):
+        self.df_ratings = df_ratings
+
+    def __len__(self):
+        return len(self.df_ratings)
+
+    def __getitem__(self, idx):
+        row = self.df_ratings.iloc[idx]
+        user_id = torch.tensor(row["user_id"], dtype=torch.long)
+        movie_id = torch.tensor(row["movie_id"], dtype=torch.long)
+        rating = torch.tensor(row["rating"], dtype=torch.float)
+        return user_id, movie_id, rating
+
+
+class LightningData(pl.LightningDataModule):
+    def __init__(self, batch_size):
+        super().__init__()
+        self.batch_size = batch_size
+
+    def prepare_data(self):
+        self.df_ratings = pd.read_csv(
+            "data/ratings.dat",
+            sep="::",
+            header=None,
+            names=["user_id", "movie_id", "rating", "timestamp"],
+            engine="python",
+        )
+
+
+        # split into train and test 
+        self.df_ratings_train, self.df_ratings_val = train_test_split(
+            self.df_ratings, test_size=0.2, random_state=42
+        )
+
+    def setup(self, stage=None):
+        self.dataset_train = MovieLens(self.df_ratings_train)
+        self.dataset_val = MovieLens(self.df_ratings_val)
+
+    def train_dataloader(self):
+        return DataLoader(self.dataset_train, batch_size=self.batch_size, num_workers=6)
+
+    def val_dataloader(self):
+        return DataLoader(self.dataset_train, batch_size=self.batch_size, num_workers=2)
+
+class Net(nn.Module):
+    def __init__(self, n_users, n_movies, n_factors=50):
+        super().__init__()
+        self.user_factors = nn.Embedding(n_users, n_factors)
+        self.movie_factors = nn.Embedding(n_movies, n_factors)
+        self.lin = nn.Linear(n_factors * 2, 1)
+
+    def forward(self, user, movie):
+        user_embedding = self.user_factors(user)
+        movie_embedding = self.movie_factors(movie)
+        x = torch.cat([user_embedding, movie_embedding], dim=1)
+        return self.lin(x)
+
+
+class NetLightning(pl.LightningModule):
+    def __init__(self, n_users, n_movies, n_factors=50, lr=3e-4):
+        super().__init__()
+        self.num_users = n_users
+        self.num_movies = n_movies
+        self.net = Net(n_users, n_movies, n_factors)
+        self.loss_fn = nn.MSELoss()
+        self.MAE = torchmetrics.MeanAbsoluteError()
+        self.lr = lr
+
+    def forward(self, user, movie):
+        return self.net(user, movie)
+
+    def training_step(self, batch, batch_idx):
+        user, movie, rating = batch
+        out = self.forward(user, movie)
+        mae = self.MAE(out.squeeze(1), rating.float())
+        loss = self.loss_fn(out.squeeze(1), rating.float())
+        self.log_dict({"train_loss": loss, "train_mae": mae}, on_step=False, on_epoch=True, prog_bar=True)
+        return loss
+    
+    def validation_step(self, batch, batch_idx):
+        user, movie, rating = batch
+        out = self.forward(user, movie)
+        mae = self.MAE(out.squeeze(1), rating.float())
+        loss = self.loss_fn(out.squeeze(1), rating.float())
+        self.log_dict({"val_loss": loss, "val_mae": mae}, on_step=False, on_epoch=True, prog_bar=True)
+
+    def predict_step(self, user_id):
+        out = self.forward(user_id, torch.arange(0, self.num_movies))
+        return out
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters(), lr=self.lr)
+
+
+dm = LightningData(batch_size=512)
+dm.prepare_data()
+dm.setup()
+
+num_movies = dm.df_ratings["movie_id"].max() + 1
+num_users = dm.df_ratings["user_id"].max() + 1
+
+model = NetLightning(num_users, num_movies)
+trainer = pl.Trainer(accelerator="gpu", devices=1, max_epochs=3)
+trainer.fit(model, dm)
+trainer.validate(model, dm)