updated basic tutorials, better comments, code revision, checked it works with the latest PyTorch version

This commit is contained in:
Aladdin Persson
2022-12-19 23:39:48 +01:00
parent 3f53d68c4f
commit cd607c395c
14 changed files with 162 additions and 88 deletions

View File

@@ -1,3 +1,21 @@
"""
This code deals with imbalanced datasets in PyTorch. A dataset is imbalanced when one or more classes have significantly fewer samples than the others, which can bias the model towards the common classes and lead to poor performance on the rare ones.
To deal with this, the code implements two methods: oversampling and class weighting.
Oversampling draws additional samples from the underrepresented classes, while class weighting assigns higher loss weights to samples from those classes, so that the model pays more attention to them.
The get_loader function takes a dataset root directory and a batch size, and returns a PyTorch data loader for iterating over the dataset in batches. It first applies some transformations to the images using the transforms module from torchvision, then computes a weight for each class from its number of samples, builds a WeightedRandomSampler that selects samples with probability proportional to those weights, and finally constructs the data loader from the dataset and the sampler.
The main function then uses the data loader to iterate over the dataset for 10 epochs, counting the number of samples seen from each class, and prints the counts.
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-08: Initial coding
* 2021-03-24: Added more detailed comments; also removed the part of
  check_accuracy that only worked on MNIST.
* 2022-12-19: Updated detailed comments, small code revision, checked code still works with latest PyTorch.
"""
import torch
import torchvision.datasets as datasets
import os
@@ -6,9 +24,10 @@ import torchvision.transforms as transforms
import torch.nn as nn
# Methods for dealing with imbalanced datasets:
# 1. Oversampling
# 1. Oversampling (probably preferable)
# 2. Class weighting
def get_loader(root_dir, batch_size):
my_transforms = transforms.Compose(
[
@@ -18,10 +37,14 @@ def get_loader(root_dir, batch_size):
)
dataset = datasets.ImageFolder(root=root_dir, transform=my_transforms)
subdirectories = dataset.classes
class_weights = []
for root, subdir, files in os.walk(root_dir):
if len(files) > 0:
class_weights.append(1/len(files))
# loop through each subdirectory and calculate the class weight
# that is 1 / len(files) in that subdirectory
for subdir in subdirectories:
files = os.listdir(os.path.join(root_dir, subdir))
class_weights.append(1 / len(files))
sample_weights = [0] * len(dataset)
@@ -29,8 +52,9 @@ def get_loader(root_dir, batch_size):
class_weight = class_weights[label]
sample_weights[idx] = class_weight
sampler = WeightedRandomSampler(sample_weights, num_samples=
len(sample_weights), replacement=True)
sampler = WeightedRandomSampler(
sample_weights, num_samples=len(sample_weights), replacement=True
)
loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
return loader
@@ -43,12 +67,12 @@ def main():
num_elkhounds = 0
for epoch in range(10):
for data, labels in loader:
num_retrievers += torch.sum(labels==0)
num_elkhounds += torch.sum(labels==1)
num_retrievers += torch.sum(labels == 0)
num_elkhounds += torch.sum(labels == 1)
print(num_retrievers.item())
print(num_elkhounds.item())
print(num_retrievers)
print(num_elkhounds)
if __name__ == "__main__":
main()
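For reference, a minimal sketch of the second method listed above (class weighting), which this file mentions but does not implement; the weights below are illustrative, not computed from a real dataset:

import torch
import torch.nn as nn

# Pass per-class weights to the loss instead of resampling; in practice
# you would use 1 / class_count per class, as in get_loader above
class_weights = torch.tensor([1.0, 50.0])  # pretend class 1 is 50x rarer
criterion = nn.CrossEntropyLoss(weight=class_weights)

logits = torch.randn(8, 2)          # stand-in model outputs for a batch of 8
labels = torch.randint(0, 2, (8,))  # stand-in targets
loss = criterion(logits, labels)    # errors on the rare class cost more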

View File

@@ -3,6 +3,7 @@ import albumentations as A
import numpy as np
from utils import plot_examples
from PIL import Image
from tqdm import tqdm
image = Image.open("images/elon.jpeg")
@@ -14,18 +15,20 @@ transform = A.Compose(
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.1),
A.RGBShift(r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.9),
A.OneOf([
A.Blur(blur_limit=3, p=0.5),
A.ColorJitter(p=0.5),
], p=1.0),
A.OneOf(
[
A.Blur(blur_limit=3, p=0.5),
A.ColorJitter(p=0.5),
],
p=1.0,
),
]
)
images_list = [image]
image = np.array(image)
for i in range(15):
for i in tqdm(range(15)):
augmentations = transform(image=image)
augmented_img = augmentations["image"]
images_list.append(augmented_img)
plot_examples(images_list)
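A note on the OneOf block above: with p=1.0 the block always triggers, and exactly one of its children is applied, chosen with probability proportional to the children's own p values (here 0.5 and 0.5, so an even coin flip between Blur and ColorJitter).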

View File

@@ -8,6 +8,7 @@ from albumentations.pytorch import ToTensorV2
from torch.utils.data import Dataset
import os
class ImageFolder(Dataset):
def __init__(self, root_dir, transform=None):
super(ImageFolder, self).__init__()
@@ -18,7 +19,7 @@ class ImageFolder(Dataset):
for index, name in enumerate(self.class_names):
files = os.listdir(os.path.join(root_dir, name))
self.data += list(zip(files, [index]*len(files)))
self.data += list(zip(files, [index] * len(files)))
def __len__(self):
return len(self.data)
@@ -43,10 +44,13 @@ transform = A.Compose(
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.1),
A.RGBShift(r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.9),
A.OneOf([
A.Blur(blur_limit=3, p=0.5),
A.ColorJitter(p=0.5),
], p=1.0),
A.OneOf(
[
A.Blur(blur_limit=3, p=0.5),
A.ColorJitter(p=0.5),
],
p=1.0,
),
A.Normalize(
mean=[0, 0, 0],
std=[1, 1, 1],
@@ -58,5 +62,5 @@ transform = A.Compose(
dataset = ImageFolder(root_dir="cat_dogs", transform=transform)
for x,y in dataset:
for x, y in dataset:
print(x.shape)

View File

@@ -8,7 +8,7 @@ import albumentations as A
def visualize(image):
plt.figure(figsize=(10, 10))
plt.axis('off')
plt.axis("off")
plt.imshow(image)
plt.show()
@@ -22,7 +22,7 @@ def plot_examples(images, bboxes=None):
if bboxes is not None:
img = visualize_bbox(images[i - 1], bboxes[i - 1], class_name="Elon")
else:
img = images[i-1]
img = images[i - 1]
fig.add_subplot(rows, columns, i)
plt.imshow(img)
plt.show()

View File

@@ -0,0 +1,3 @@
#!/bin/sh
wget https://www.kaggle.com/datasets/e1cd22253a9b23b073794872bf565648ddbe4f17e7fa9e74766ad3707141adeb/download?datasetVersionNumber=1

View File

@@ -1,3 +1,15 @@
"""
Introductory tutorial on how to deal with custom text datasets in PyTorch.
Note that there are better ways to do this when dealing with huge text datasets,
but this is a good way to understand how things work, and it serves as a starting
point, particularly for small/medium datasets.
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-09 Initial coding
* 2022-12-19 Updated comments, minor code revision, and checked code still works with latest PyTorch.
"""
import os # when loading file paths
import pandas as pd # for lookup in annotation file
import spacy # for tokenizer
@@ -15,8 +27,8 @@ import torchvision.transforms as transforms
# of same seq_len and setup dataloader)
# Note that loading the image is very easy compared to the text!
# Download with: python -m spacy download en
spacy_eng = spacy.load("en")
# Download with: python -m spacy download en_core_web_sm
spacy_eng = spacy.load("en_core_web_sm")
class Vocabulary:
@@ -130,7 +142,10 @@ def get_loader(
if __name__ == "__main__":
transform = transforms.Compose(
[transforms.Resize((224, 224)), transforms.ToTensor(),]
[
transforms.Resize((224, 224)),
transforms.ToTensor(),
]
)
loader, dataset = get_loader(
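Since the tokenizer change above is the main fix in this file, a small sketch of the en_core_web_sm pipeline in use (assuming the model has been downloaded with python -m spacy download en_core_web_sm):

import spacy

# Newer spaCy versions removed the bare "en" shortcut; the small English
# pipeline is now loaded by its full name
spacy_eng = spacy.load("en_core_web_sm")

def tokenize(text):
    # Lowercased token texts, ready for a vocabulary lookup
    return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

print(tokenize("A dog ran across the park."))
# -> ['a', 'dog', 'ran', 'across', 'the', 'park', '.']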

View File

@@ -1,3 +1,12 @@
"""
Example code for setting up a progress bar using tqdm that is efficient and nice looking.
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-05-09 Initial coding
* 2022-12-19 Updated with more detailed comments, and checked code works with latest PyTorch.
"""
import torch
import torch.nn as nn
from tqdm import tqdm
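The diff only shows this file's header and imports; as a rough, self-contained sketch of the tqdm pattern the file demonstrates (dummy data and model, just for illustration):

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Dummy dataset and model so the sketch runs on its own
dataset = TensorDataset(torch.randn(256, 10), torch.randint(0, 2, (256,)))
loader = DataLoader(dataset, batch_size=32)
model = nn.Linear(10, 2)
criterion = nn.CrossEntropyLoss()

# Wrapping the loader in tqdm gives the progress bar; set_postfix adds
# a live loss readout next to it
loop = tqdm(loader, leave=True)
for data, targets in loop:
    loss = criterion(model(data), targets)
    loop.set_postfix(loss=loss.item())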

View File

@@ -8,18 +8,20 @@ check accuracy and more.
Programmed by Aladdin Persson
* 2020-04-08: Initial coding
* 2021-03-24: More detailed comments and small revision of the code
* 2022-12-19: Small revision of code, checked that it works with latest PyTorch version
"""
# Imports
import torch
import torchvision # torch package for vision related things
import torch.nn.functional as F # Parameterless functions, like (some) activation functions
import torchvision.datasets as datasets # Standard datasets
import torchvision.transforms as transforms # Transformations we can perform on our dataset for augmentation
from torch import optim # For optimizers like SGD, Adam, etc.
from torch import nn # All neural network modules
from torch.utils.data import DataLoader # Gives easier dataset management by creating mini batches etc.
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management by creating mini batches etc.
from tqdm import tqdm # For nice progress bar!
# Simple CNN
@@ -29,17 +31,17 @@ class CNN(nn.Module):
self.conv1 = nn.Conv2d(
in_channels=in_channels,
out_channels=8,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
kernel_size=3,
stride=1,
padding=1,
)
self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv2 = nn.Conv2d(
in_channels=8,
out_channels=16,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
kernel_size=3,
stride=1,
padding=1,
)
self.fc1 = nn.Linear(16 * 7 * 7, num_classes)
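A quick note on where 16 * 7 * 7 comes from (assuming the standard 1x28x28 MNIST input): each conv keeps the spatial size (kernel 3, stride 1, padding 1) and each 2x2 max-pool halves it, so 28 -> 14 -> 7, leaving 16 channels of 7x7 feature maps, i.e. 16 * 7 * 7 = 784 features after flattening.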
@@ -59,13 +61,17 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
in_channels = 1
num_classes = 10
learning_rate = 0.001
learning_rate = 3e-4 # karpathy's constant
batch_size = 64
num_epochs = 3
# Load Data
train_dataset = datasets.MNIST(root="dataset/", train=True, transform=transforms.ToTensor(), download=True)
test_dataset = datasets.MNIST(root="dataset/", train=False, transform=transforms.ToTensor(), download=True)
train_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
test_dataset = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
@@ -110,10 +116,9 @@ def check_accuracy(loader, model):
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
model.train()
return num_correct/num_samples
return num_correct / num_samples
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")

View File

@@ -14,13 +14,14 @@ Programmed by Aladdin Persson
# Imports
import torch
import torchvision # torch package for vision related things
import torch.nn.functional as F # Parameterless functions, like (some) activation functions
import torchvision.datasets as datasets # Standard datasets
import torchvision.transforms as transforms # Transformations we can perform on our dataset for augmentation
from torch import optim # For optimizers like SGD, Adam, etc.
from torch import nn # All neural network modules
from torch.utils.data import DataLoader # Gives easier dataset management by creating mini batches etc.
from torch.utils.data import (
DataLoader,
) # Gives easier dataset management by creating mini batches etc.
from tqdm import tqdm # For nice progress bar!
# Here we create our simple neural network. For more details here we are subclassing and
@@ -37,8 +38,6 @@ class NN(nn.Module):
input_size: the size of the input, in this case 784 (28x28)
num_classes: the number of classes we want to predict, in this case 10 (0-9)
Returns:
None
"""
super(NN, self).__init__()
# Our first linear layer take input_size, in this case 784 nodes to 50
@@ -76,8 +75,12 @@ batch_size = 64
num_epochs = 3
# Load Data
train_dataset = datasets.MNIST(root="dataset/", train=True, transform=transforms.ToTensor(), download=True)
test_dataset = datasets.MNIST(root="dataset/", train=False, transform=transforms.ToTensor(), download=True)
train_dataset = datasets.MNIST(
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
test_dataset = datasets.MNIST(
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
@@ -153,8 +156,9 @@ def check_accuracy(loader, model):
num_samples += predictions.size(0)
model.train()
return num_correct/num_samples
return num_correct / num_samples
# Check accuracy on training & test to see how good our model is
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")

View File

@@ -1,3 +1,13 @@
"""
Code for calculating the mean and standard deviation of a dataset.
This is useful for normalizing the dataset to obtain mean 0, std 1.
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-05-09 Initial coding
* 2022-12-16 Updated comments, code revision, and checked code still works with latest PyTorch.
"""
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
@@ -5,20 +15,23 @@ import torchvision.datasets as datasets
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_set = datasets.CIFAR10(root="ds/", transform=transforms.ToTensor(), download=True)
train_set = datasets.CIFAR10(
root="dataset/", transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_set, batch_size=64, shuffle=True)
def get_mean_std(loader):
# var[X] = E[X**2] - E[X]**2
channels_sum, channels_sqrd_sum, num_batches = 0, 0, 0
for data, _ in tqdm(loader):
channels_sum += torch.mean(data, dim=[0, 2, 3])
channels_sqrd_sum += torch.mean(data ** 2, dim=[0, 2, 3])
channels_sqrd_sum += torch.mean(data**2, dim=[0, 2, 3])
num_batches += 1
mean = channels_sum / num_batches
std = (channels_sqrd_sum / num_batches - mean ** 2) ** 0.5
std = (channels_sqrd_sum / num_batches - mean**2) ** 0.5
return mean, std
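A usage sketch for the function above; the exact numbers depend on the dataset, and the CIFAR10 values shown are approximate:

mean, std = get_mean_std(train_loader)
print(mean)  # roughly tensor([0.4914, 0.4822, 0.4465]) for CIFAR10
print(std)   # roughly tensor([0.2470, 0.2435, 0.2616])

# Typical follow-up: bake the statistics into the transform pipeline
normalize = transforms.Normalize(mean=mean.tolist(), std=std.tolist())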

View File

@@ -13,10 +13,11 @@ numpy array and vice-versa.
Programmed by Aladdin Persson
* 2020-06-27: Initial coding
* 2022-12-19: Small revision of code, checked that it works with latest PyTorch version
"""
import torch
import numpy as np
# ================================================================= #
# Initializing Tensor #
@@ -77,8 +78,6 @@ print(
print(f"Converted float64 {tensor.double()}") # Converted to float64
# Array to Tensor conversion and vice-versa
import numpy as np
np_array = np.zeros((5, 5))
tensor = torch.from_numpy(np_array)
np_array_again = (
@@ -112,7 +111,7 @@ t += x # Also inplace: t = t + x is not inplace, bit confusing.
# -- Exponentiation (Element wise if vector or matrices) --
z = x.pow(2) # z = [1, 4, 9]
z = x ** 2 # z = [1, 4, 9]
z = x**2 # z = [1, 4, 9]
# -- Simple Comparison --
@@ -153,7 +152,7 @@ z = (
x1 - x2
) # Shape of z is 5x5: How? The 1x5 vector (x2) is subtracted for each row in the 5x5 (x1)
z = (
x1 ** x2
x1**x2
) # Shape of z is 5x5: How? Broadcasting! Element wise exponentiation for every row
# Other useful tensor operations

View File

@@ -5,11 +5,9 @@ and tries to have them all in a compact way, it might not be
super clear exactly which call does what; for that I recommend
watching the YouTube video.
Video explanation: https://youtu.be/RLqsxWaQdHE
Got any questions leave a comment on youtube :)
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-17 Initial coding
* 2022-12-19 Small revision of code, checked that it works with latest PyTorch version
"""
# Imports
@@ -54,7 +52,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
in_channels = 1
num_classes = 10
num_epochs = 1
num_epochs = 3
# Load Data
train_dataset = datasets.MNIST(
@@ -63,8 +61,8 @@ train_dataset = datasets.MNIST(
# To do hyperparameter search, include more batch_sizes you want to try
# and more learning rates!
batch_sizes = [256]
learning_rates = [0.001]
batch_sizes = [32, 256]
learning_rates = [1e-2, 1e-3, 1e-4, 1e-5]
classes = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
for batch_size in batch_sizes:
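The diff cuts off at this loop header; a short sketch (reusing the batch_sizes and learning_rates lists above, with a stand-in for the real training loop) of how each configuration typically gets its own TensorBoard writer:

from torch.utils.tensorboard import SummaryWriter

for batch_size in batch_sizes:
    for learning_rate in learning_rates:
        # One writer per configuration, so every run shows as its own curve
        writer = SummaryWriter(f"runs/MNIST/bs_{batch_size}_lr_{learning_rate}")
        for step in range(3):  # stand-in for the real training loop
            writer.add_scalar("Training loss", 1.0 / (step + 1), global_step=step)
        writer.close()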

View File

@@ -2,11 +2,9 @@
Shows a small example of how to use transformations (perhaps unnecessarily many)
on CIFAR10 dataset and training on a small CNN toy network.
Video explanation: https://youtu.be/Zvd276j9sZ8
Got any questions leave a comment I'm pretty good at responding on youtube
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
* 2020-04-09 Initial coding
* 2022-12-19 Updated comments, minor code revision, and checked code still works with latest PyTorch.
"""
# Imports
@@ -23,21 +21,21 @@ import torchvision.transforms as transforms # Transformations we can perform on
# Simple CNN
class CNN(nn.Module):
def __init__(self, in_channels, num_classes):
super(CNN, self).__init__()
super().__init__()
self.conv1 = nn.Conv2d(
in_channels=in_channels,
out_channels=8,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
kernel_size=3,
stride=1,
padding=1,
)
self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv2 = nn.Conv2d(
in_channels=8,
out_channels=16,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
kernel_size=3,
stride=1,
padding=1,
)
self.fc1 = nn.Linear(16 * 8 * 8, num_classes)
@@ -47,20 +45,17 @@ class CNN(nn.Module):
x = F.relu(self.conv2(x))
x = self.pool(x)
x = x.reshape(x.shape[0], -1)
x = self.fc1(x)
return x
return self.fc1(x)
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
learning_rate = 1e-4
learning_rate = 3e-4 # for future copilot: this is karpathy's constant
batch_size = 64
num_epochs = 5
# Initialize the model (note: the .classifier assignment below looks like a
# leftover from a transfer-learning example; CNN.forward never uses it)
model = CNN(in_channels=3, num_classes=10)
model.classifier = nn.Sequential(nn.Linear(512, 100), nn.ReLU(), nn.Linear(100, 10))
@@ -89,7 +84,6 @@ my_transforms = transforms.Compose(
]
)
train_dataset = datasets.CIFAR10(
root="dataset/", train=True, transform=my_transforms, download=True
)
@@ -120,11 +114,9 @@ for epoch in range(num_epochs):
# gradient descent or adam step
optimizer.step()
print(f"Cost at epoch {epoch} is {sum(losses)/len(losses):.5f}")
print(f"Loss average over epoch {epoch} is {sum(losses)/len(losses):.3f}")
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
if loader.dataset.train:
print("Checking accuracy on training data")

View File

@@ -1,7 +1,11 @@
import random, torch, os, numpy as np
import random
import torch
import os
import numpy as np
def seed_everything(seed=42):
os.environ['PYTHONHASHSEED'] = str(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
@@ -10,6 +14,7 @@ def seed_everything(seed=42):
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
seed_everything()
# Do training etc after running seed_everything
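One caveat worth adding (an annotation, not from the original file): DataLoader shuffling normally draws from the global RNG, so any extra random call elsewhere in the program shifts the batch order; passing a dedicated seeded generator pins the order down independently. A small sketch:

import torch
from torch.utils.data import DataLoader, TensorDataset

# The shuffle order is drawn from this generator, so seeding it makes
# the batch order reproducible regardless of other RNG usage
g = torch.Generator()
g.manual_seed(42)
dataset = TensorDataset(torch.arange(10))
loader = DataLoader(dataset, batch_size=2, shuffle=True, generator=g)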