DR kaggle

Aladdin Persson
2021-05-30 16:24:52 +02:00
parent 9675f0d6af
commit 8136ee169f
6 changed files with 565 additions and 0 deletions


@@ -0,0 +1,48 @@
import torch
import albumentations as A
from albumentations.pytorch import ToTensorV2
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 5e-4
BATCH_SIZE = 20
NUM_EPOCHS = 100
NUM_WORKERS = 6
CHECKPOINT_FILE = "b3.pth.tar"
PIN_MEMORY = True
SAVE_MODEL = True
LOAD_MODEL = True
# Data augmentation for images
train_transforms = A.Compose(
[
A.Resize(width=760, height=760),
A.RandomCrop(height=728, width=728),
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.5),
A.RandomRotate90(p=0.5),
A.Blur(p=0.3),
A.CLAHE(p=0.3),
A.ColorJitter(p=0.3),
A.CoarseDropout(max_holes=12, max_height=20, max_width=20, p=0.3),
        A.IAAAffine(shear=30, rotate=0, p=0.2, mode="constant"),  # deprecated in albumentations >= 1.0; A.Affine is the replacement
A.Normalize(
mean=[0.3199, 0.2240, 0.1609],
std=[0.3020, 0.2183, 0.1741],
max_pixel_value=255.0,
),
ToTensorV2(),
]
)
val_transforms = A.Compose(
[
A.Resize(height=728, width=728),
A.Normalize(
mean=[0.3199, 0.2240, 0.1609],
std=[0.3020, 0.2183, 0.1741],
max_pixel_value=255.0,
),
ToTensorV2(),
]
)
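
A minimal usage sketch (not part of the commit; the dummy array, and the config.py module name implied by the imports in the files below, are assumptions): albumentations pipelines are called with keyword arguments and return a dict, which is why the dataset below does transform(image=image)["image"].

import numpy as np
from config import train_transforms  # assuming the file above is saved as config.py

dummy = np.random.randint(0, 256, size=(760, 760, 3), dtype=np.uint8)  # HWC uint8 image
augmented = train_transforms(image=dummy)["image"]  # CHW float32 tensor after Normalize + ToTensorV2
print(augmented.shape)  # torch.Size([3, 728, 728]) because of the 728x728 RandomCrop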


@@ -0,0 +1,56 @@
import config
import os
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
class DRDataset(Dataset):
def __init__(self, images_folder, path_to_csv, train=True, transform=None):
super().__init__()
self.data = pd.read_csv(path_to_csv)
self.images_folder = images_folder
self.image_files = os.listdir(images_folder)
self.transform = transform
self.train = train
def __len__(self):
return self.data.shape[0] if self.train else len(self.image_files)
def __getitem__(self, index):
if self.train:
image_file, label = self.data.iloc[index]
else:
            # the test set has no labels, so return -1 as a dummy label; this lets the
            # same Dataset class be re-used for the test-set submission later on
image_file, label = self.image_files[index], -1
        image_file = image_file.replace(".jpeg", "")  # CSV entries have no extension, os.listdir entries do; normalize here
image = np.array(Image.open(os.path.join(self.images_folder, image_file+".jpeg")))
if self.transform:
image = self.transform(image=image)["image"]
return image, label, image_file
if __name__ == "__main__":
"""
Test if everything works ok
"""
dataset = DRDataset(
images_folder="../train/images_resized_650/",
path_to_csv="../train/trainLabels.csv",
transform=config.val_transforms,
)
loader = DataLoader(
dataset=dataset, batch_size=32, num_workers=2, shuffle=True, pin_memory=True
)
for x, label, file in tqdm(loader):
print(x.shape)
print(label.shape)
import sys
sys.exit()
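
The hard-coded mean/std in the config's Normalize transforms were presumably computed from the training images; a rough sketch of how such per-channel statistics can be estimated (not part of this commit; folder path and resize size are assumptions):

import albumentations as A
import torch
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader
from dataset import DRDataset

# scale pixels to [0, 1] but do not normalize, so the raw statistics can be measured
plain = A.Compose([A.Resize(150, 150), A.Normalize(mean=0.0, std=1.0, max_pixel_value=255.0), ToTensorV2()])
ds = DRDataset("../train/images_resized_150/", "../train/trainLabels.csv", transform=plain)
loader = DataLoader(ds, batch_size=64, num_workers=2)

channel_sum, channel_sq, n = torch.zeros(3), torch.zeros(3), 0
for x, _, _ in loader:
    channel_sum += x.mean(dim=(0, 2, 3)) * x.shape[0]
    channel_sq += (x ** 2).mean(dim=(0, 2, 3)) * x.shape[0]
    n += x.shape[0]
mean = channel_sum / n
std = (channel_sq / n - mean ** 2).sqrt()
print(mean, std)  # roughly the magnitudes hard-coded in the config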


@@ -0,0 +1,82 @@
"""
Tries to remove the unnecessary black borders around the images and
"trim" them so the content takes up the entirety of the image.
It's hacky & not very nice but it works :))
"""
import os
import numpy as np
from PIL import Image
import warnings
from multiprocessing import Pool
from tqdm import tqdm
import cv2
def trim(im):
"""
    Converts the image to grayscale using cv2, computes a binary mask of the
    pixels that are above a certain threshold, and takes the first row in which
    a certain percentage of the pixels are above the threshold as the first clip
    point. Same idea for the first column, last row and last column.
"""
percentage = 0.02
img = np.array(im)
    img_gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)  # PIL arrays are RGB, not BGR
im = img_gray > 0.1 * np.mean(img_gray[img_gray != 0])
row_sums = np.sum(im, axis=1)
col_sums = np.sum(im, axis=0)
rows = np.where(row_sums > img.shape[1] * percentage)[0]
cols = np.where(col_sums > img.shape[0] * percentage)[0]
min_row, min_col = np.min(rows), np.min(cols)
max_row, max_col = np.max(rows), np.max(cols)
im_crop = img[min_row : max_row + 1, min_col : max_col + 1]
return Image.fromarray(im_crop)
def resize_maintain_aspect(image, desired_size):
"""
Stole this from some stackoverflow post but can't remember which,
this will add padding to maintain the aspect ratio.
"""
    old_size = image.size  # PIL's size is in (width, height) format
ratio = float(desired_size) / max(old_size)
new_size = tuple([int(x * ratio) for x in old_size])
    im = image.resize(new_size, Image.LANCZOS)  # ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
new_im = Image.new("RGB", (desired_size, desired_size))
new_im.paste(im, ((desired_size - new_size[0]) // 2, (desired_size - new_size[1]) // 2))
return new_im
def save_single(args):
img_file, input_path_folder, output_path_folder, output_size = args
image_original = Image.open(os.path.join(input_path_folder, img_file))
image = trim(image_original)
image = resize_maintain_aspect(image, desired_size=output_size[0])
    image.save(os.path.join(output_path_folder, img_file))
def fast_image_resize(input_path_folder, output_path_folder, output_size=None):
"""
Uses multiprocessing to make it fast
"""
if not output_size:
warnings.warn("Need to specify output_size! For example: output_size=100")
exit()
if not os.path.exists(output_path_folder):
os.makedirs(output_path_folder)
jobs = [
(file, input_path_folder, output_path_folder, output_size)
for file in os.listdir(input_path_folder)
]
with Pool() as p:
list(tqdm(p.imap_unordered(save_single, jobs), total=len(jobs)))
if __name__ == "__main__":
fast_image_resize("../train/images/", "../train/images_resized_150/", output_size=(150, 150))
fast_image_resize("../test/images/", "../test/images_resized_150/", output_size=(150, 150))


@@ -0,0 +1,125 @@
import torch
from torch import nn, optim
import os
import config
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import cohen_kappa_score
from efficientnet_pytorch import EfficientNet
from dataset import DRDataset
from torchvision.utils import save_image
from utils import (
load_checkpoint,
save_checkpoint,
check_accuracy,
make_prediction,
get_csv_for_blend,
)
def train_one_epoch(loader, model, optimizer, loss_fn, scaler, device):
losses = []
loop = tqdm(loader)
for batch_idx, (data, targets, _) in enumerate(loop):
        # uncomment the line below to save examples and check that they look OK with the
        # data augmentation; tip: first set mean=[0,0,0], std=[1,1,1] so they look "normal"
#save_image(data, f"hi_{batch_idx}.png")
data = data.to(device=device)
targets = targets.to(device=device)
# forward
with torch.cuda.amp.autocast():
scores = model(data)
loss = loss_fn(scores, targets.unsqueeze(1).float())
losses.append(loss.item())
# backward
optimizer.zero_grad()
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
loop.set_postfix(loss=loss.item())
print(f"Loss average over epoch: {sum(losses)/len(losses)}")
def main():
train_ds = DRDataset(
images_folder="train/images_preprocessed_1000/",
path_to_csv="train/trainLabels.csv",
        transform=config.val_transforms,  # use config.train_transforms for actual training; val_transforms only while generating the blend CSVs (see utils.get_csv_for_blend)
)
val_ds = DRDataset(
images_folder="train/images_preprocessed_1000/",
path_to_csv="train/valLabels.csv",
transform=config.val_transforms,
)
test_ds = DRDataset(
images_folder="test/images_preprocessed_1000",
path_to_csv="train/trainLabels.csv",
transform=config.val_transforms,
train=False,
)
test_loader = DataLoader(
test_ds, batch_size=config.BATCH_SIZE, num_workers=6, shuffle=False
)
train_loader = DataLoader(
train_ds,
batch_size=config.BATCH_SIZE,
num_workers=config.NUM_WORKERS,
pin_memory=config.PIN_MEMORY,
        shuffle=False,  # must stay False while generating the blend CSVs; set to True for normal training
)
val_loader = DataLoader(
val_ds,
batch_size=config.BATCH_SIZE,
num_workers=2,
pin_memory=config.PIN_MEMORY,
shuffle=False,
)
loss_fn = nn.MSELoss()
model = EfficientNet.from_pretrained("efficientnet-b3")
    model._fc = nn.Linear(1536, 1)  # B3's pooled feature dim is 1536; single output for MSE regression
model = model.to(config.DEVICE)
optimizer = optim.Adam(model.parameters(), lr=config.LEARNING_RATE, weight_decay=config.WEIGHT_DECAY)
scaler = torch.cuda.amp.GradScaler()
if config.LOAD_MODEL and config.CHECKPOINT_FILE in os.listdir():
load_checkpoint(torch.load(config.CHECKPOINT_FILE), model, optimizer, config.LEARNING_RATE)
    # Run this block after training is done and you've achieved a good result
    # on the validation set; afterwards run train_blend.py to use the
    # information from both eyes concatenated
get_csv_for_blend(val_loader, model, "../train/val_blend.csv")
get_csv_for_blend(train_loader, model, "../train/train_blend.csv")
get_csv_for_blend(test_loader, model, "../train/test_blend.csv")
make_prediction(model, test_loader, "submission_.csv")
import sys
    sys.exit()  # remove the blend/prediction block above (and this exit) to run the training loop below
#make_prediction(model, test_loader)
for epoch in range(config.NUM_EPOCHS):
train_one_epoch(train_loader, model, optimizer, loss_fn, scaler, config.DEVICE)
        # evaluate on the validation set
preds, labels = check_accuracy(val_loader, model, config.DEVICE)
print(f"QuadraticWeightedKappa (Validation): {cohen_kappa_score(labels, preds, weights='quadratic')}")
        # evaluate on the training set (uncomment if needed)
#preds, labels = check_accuracy(train_loader, model, config.DEVICE)
#print(f"QuadraticWeightedKappa (Training): {cohen_kappa_score(labels, preds, weights='quadratic')}")
if config.SAVE_MODEL:
checkpoint = {
"state_dict": model.state_dict(),
"optimizer": optimizer.state_dict(),
}
save_checkpoint(checkpoint, filename=f"b3_{epoch}.pth.tar")
if __name__ == "__main__":
main()
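
The script reads train/valLabels.csv, which is not created anywhere in this commit. One way to produce it is a grouped split of the original trainLabels.csv so that both eyes of a patient stay in the same split (a sketch under that assumption; image names of the form "<id>_left"/"<id>_right" are assumed):

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

df = pd.read_csv("train/trainLabels.csv")  # columns: image, level
groups = df["image"].str.rsplit("_", n=1).str[0]  # patient id
splitter = GroupShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
train_idx, val_idx = next(splitter.split(df, groups=groups))
df.iloc[val_idx].to_csv("train/valLabels.csv", index=False)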


@@ -0,0 +1,126 @@
import torch
from tqdm import tqdm
import numpy as np
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset
from utils import save_checkpoint, load_checkpoint, check_accuracy
from sklearn.metrics import cohen_kappa_score
import config
import os
import pandas as pd
def make_prediction(model, loader, file):
preds = []
filenames = []
model.eval()
for x, y, files in tqdm(loader):
x = x.to(config.DEVICE)
with torch.no_grad():
predictions = model(x)
# Convert MSE floats to integer predictions
predictions[predictions < 0.5] = 0
predictions[(predictions >= 0.5) & (predictions < 1.5)] = 1
predictions[(predictions >= 1.5) & (predictions < 2.5)] = 2
predictions[(predictions >= 2.5) & (predictions < 3.5)] = 3
predictions[(predictions >= 3.5) & (predictions < 1000000000000)] = 4
predictions = predictions.long().view(-1)
y = y.view(-1)
preds.append(predictions.cpu().numpy())
filenames += map(list, zip(files[0], files[1]))
filenames = [item for sublist in filenames for item in sublist]
df = pd.DataFrame({"image": filenames, "level": np.concatenate(preds, axis=0)})
df.to_csv(file, index=False)
model.train()
print("Done with predictions")
class MyDataset(Dataset):
def __init__(self, csv_file):
self.csv = pd.read_csv(csv_file)
def __len__(self):
return self.csv.shape[0]
def __getitem__(self, index):
example = self.csv.iloc[index, :]
features = example.iloc[: example.shape[0] - 4].to_numpy().astype(np.float32)
labels = example.iloc[-4:-2].to_numpy().astype(np.int64)
filenames = example.iloc[-2:].values.tolist()
return features, labels, filenames
class MyModel(nn.Module):
def __init__(self):
super().__init__()
self.model = nn.Sequential(
nn.BatchNorm1d((1536 + 1) * 2),
nn.Linear((1536+1) * 2, 500),
nn.BatchNorm1d(500),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(500, 100),
nn.BatchNorm1d(100),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(100, 2),
)
def forward(self, x):
return self.model(x)
if __name__ == "__main__":
model = MyModel().to(config.DEVICE)
ds = MyDataset(csv_file="train/train_blend.csv")
loader = DataLoader(ds, batch_size=256, num_workers=3, pin_memory=True, shuffle=True)
ds_val = MyDataset(csv_file="train/val_blend.csv")
loader_val = DataLoader(
ds_val, batch_size=256, num_workers=3, pin_memory=True, shuffle=True
)
ds_test = MyDataset(csv_file="train/test_blend.csv")
loader_test = DataLoader(
ds_test, batch_size=256, num_workers=2, pin_memory=True, shuffle=False
)
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
loss_fn = nn.MSELoss()
if config.LOAD_MODEL and "linear.pth.tar" in os.listdir():
load_checkpoint(torch.load("linear.pth.tar"), model, optimizer, lr=1e-4)
model.train()
for _ in range(5):
losses = []
        for x, y, files in tqdm(loader_val):  # note: the blend model is fit on the validation-set blend CSV
x = x.to(config.DEVICE).float()
y = y.to(config.DEVICE).view(-1).float()
# forward
scores = model(x).view(-1)
loss = loss_fn(scores, y)
losses.append(loss.item())
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent or adam step
optimizer.step()
print(f"Loss: {sum(losses)/len(losses)}")
if config.SAVE_MODEL:
checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
save_checkpoint(checkpoint, filename="linear.pth.tar")
    preds, labels = check_accuracy(loader_val, model, config.DEVICE)
print(cohen_kappa_score(labels, preds, weights="quadratic"))
    preds, labels = check_accuracy(loader, model, config.DEVICE)
print(cohen_kappa_score(labels, preds, weights="quadratic"))
make_prediction(model, loader_test, "test_preds.csv")


@@ -0,0 +1,128 @@
import torch
import pandas as pd
import numpy as np
import config
from tqdm import tqdm
import warnings
import torch.nn.functional as F
def make_prediction(model, loader, output_csv="submission.csv"):
preds = []
filenames = []
model.eval()
for x, y, files in tqdm(loader):
x = x.to(config.DEVICE)
with torch.no_grad():
predictions = model(x)
# Convert MSE floats to integer predictions
predictions[predictions < 0.5] = 0
predictions[(predictions >= 0.5) & (predictions < 1.5)] = 1
predictions[(predictions >= 1.5) & (predictions < 2.5)] = 2
predictions[(predictions >= 2.5) & (predictions < 3.5)] = 3
predictions[(predictions >= 3.5) & (predictions < 10000000)] = 4
predictions = predictions.long().squeeze(1)
preds.append(predictions.cpu().numpy())
filenames += files
df = pd.DataFrame({"image": filenames, "level": np.concatenate(preds, axis=0)})
df.to_csv(output_csv, index=False)
model.train()
print("Done with predictions")
def check_accuracy(loader, model, device="cuda"):
model.eval()
all_preds, all_labels = [], []
num_correct = 0
num_samples = 0
for x, y, filename in tqdm(loader):
x = x.to(device=device)
y = y.to(device=device)
with torch.no_grad():
predictions = model(x)
# Convert MSE floats to integer predictions
predictions[predictions < 0.5] = 0
predictions[(predictions >= 0.5) & (predictions < 1.5)] = 1
predictions[(predictions >= 1.5) & (predictions < 2.5)] = 2
predictions[(predictions >= 2.5) & (predictions < 3.5)] = 3
predictions[(predictions >= 3.5) & (predictions < 100)] = 4
predictions = predictions.long().view(-1)
y = y.view(-1)
num_correct += (predictions == y).sum()
num_samples += predictions.shape[0]
# add to lists
all_preds.append(predictions.detach().cpu().numpy())
all_labels.append(y.detach().cpu().numpy())
print(
f"Got {num_correct} / {num_samples} with accuracy {float(num_correct) / float(num_samples) * 100:.2f}"
)
model.train()
return np.concatenate(all_preds, axis=0, dtype=np.int64), np.concatenate(
all_labels, axis=0, dtype=np.int64
)
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
def load_checkpoint(checkpoint, model, optimizer, lr):
print("=> Loading checkpoint")
model.load_state_dict(checkpoint["state_dict"])
#optimizer.load_state_dict(checkpoint["optimizer"])
    # If we don't do this then the optimizer keeps the old checkpoint's learning rate,
    # which leads to many hours of debugging :\
for param_group in optimizer.param_groups:
param_group["lr"] = lr
def get_csv_for_blend(loader, model, output_csv_file):
warnings.warn("Important to have shuffle=False (and to ensure batch size is even size) when running get_csv_for_blend also set val_transforms to train_loader!")
model.eval()
filename_first = []
filename_second = []
labels_first = []
labels_second = []
all_features = []
for idx, (images, y, image_files) in enumerate(tqdm(loader)):
images = images.to(config.DEVICE)
with torch.no_grad():
features = F.adaptive_avg_pool2d(
model.extract_features(images), output_size=1
)
features_logits = features.reshape(features.shape[0] // 2, 2, features.shape[1])
preds = model(images).reshape(images.shape[0] // 2, 2, 1)
new_features = (
torch.cat([features_logits, preds], dim=2)
.view(preds.shape[0], -1)
.cpu()
.numpy()
)
all_features.append(new_features)
filename_first += image_files[::2]
filename_second += image_files[1::2]
labels_first.append(y[::2].cpu().numpy())
labels_second.append(y[1::2].cpu().numpy())
all_features = np.concatenate(all_features, axis=0)
df = pd.DataFrame(
data=all_features, columns=[f"f_{idx}" for idx in range(all_features.shape[1])]
)
df["label_first"] = np.concatenate(labels_first, axis=0)
df["label_second"] = np.concatenate(labels_second, axis=0)
df["file_first"] = filename_first
df["file_second"] = filename_second
df.to_csv(output_csv_file, index=False)
model.train()
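
For reference: the blend CSVs written above contain, per patient, the 1536 pooled EfficientNet-B3 features plus the scalar prediction for each of the two eyes, followed by the two labels and the two filenames. That is exactly what MyDataset in train_blend.py slices apart, and why MyModel's input width is (1536 + 1) * 2. A quick sanity check of that layout (the CSV path is an assumption):

import pandas as pd

df = pd.read_csv("train/val_blend.csv")
# f_0 .. f_3073: 1536 features + 1 prediction per eye, for two eyes,
# then label_first, label_second, file_first, file_second
assert df.shape[1] == (1536 + 1) * 2 + 4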