diff --git a/ML/Kaggles/DiabeticRetinopathy/config.py b/ML/Kaggles/DiabeticRetinopathy/config.py
new file mode 100644
index 0000000..718f12d
--- /dev/null
+++ b/ML/Kaggles/DiabeticRetinopathy/config.py
@@ -0,0 +1,48 @@
+import torch
+import albumentations as A
+from albumentations.pytorch import ToTensorV2
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+LEARNING_RATE = 3e-5
+WEIGHT_DECAY = 5e-4
+BATCH_SIZE = 20
+NUM_EPOCHS = 100
+NUM_WORKERS = 6
+CHECKPOINT_FILE = "b3.pth.tar"
+PIN_MEMORY = True
+SAVE_MODEL = True
+LOAD_MODEL = True
+
+# Data augmentation for images
+train_transforms = A.Compose(
+    [
+        A.Resize(width=760, height=760),
+        A.RandomCrop(height=728, width=728),
+        A.HorizontalFlip(p=0.5),
+        A.VerticalFlip(p=0.5),
+        A.RandomRotate90(p=0.5),
+        A.Blur(p=0.3),
+        A.CLAHE(p=0.3),
+        A.ColorJitter(p=0.3),
+        A.CoarseDropout(max_holes=12, max_height=20, max_width=20, p=0.3),
+        A.IAAAffine(shear=30, rotate=0, p=0.2, mode="constant"),
+        A.Normalize(
+            mean=[0.3199, 0.2240, 0.1609],
+            std=[0.3020, 0.2183, 0.1741],
+            max_pixel_value=255.0,
+        ),
+        ToTensorV2(),
+    ]
+)
+
+val_transforms = A.Compose(
+    [
+        A.Resize(height=728, width=728),
+        A.Normalize(
+            mean=[0.3199, 0.2240, 0.1609],
+            std=[0.3020, 0.2183, 0.1741],
+            max_pixel_value=255.0,
+        ),
+        ToTensorV2(),
+    ]
+)
\ No newline at end of file
diff --git a/ML/Kaggles/DiabeticRetinopathy/dataset.py b/ML/Kaggles/DiabeticRetinopathy/dataset.py
new file mode 100644
index 0000000..05e7ddc
--- /dev/null
+++ b/ML/Kaggles/DiabeticRetinopathy/dataset.py
@@ -0,0 +1,56 @@
+import config
+import os
+import pandas as pd
+import numpy as np
+from torch.utils.data import Dataset, DataLoader
+from PIL import Image
+from tqdm import tqdm
+
+
+class DRDataset(Dataset):
+    def __init__(self, images_folder, path_to_csv, train=True, transform=None):
+        super().__init__()
+        self.data = pd.read_csv(path_to_csv)
+        self.images_folder = images_folder
+        self.image_files = os.listdir(images_folder)
+        self.transform = transform
+        self.train = train
+
+    def __len__(self):
+        return self.data.shape[0] if self.train else len(self.image_files)
+
+    def __getitem__(self, index):
+        if self.train:
+            image_file, label = self.data.iloc[index]
+        else:
+            # if test, simply return -1 for the label; this lets us re-use the
+            # same dataset class for the test set submission later on
+            image_file, label = self.image_files[index], -1
+            image_file = image_file.replace(".jpeg", "")
+
+        image = np.array(Image.open(os.path.join(self.images_folder, image_file + ".jpeg")))
+
+        if self.transform:
+            image = self.transform(image=image)["image"]
+
+        return image, label, image_file
+
+
+if __name__ == "__main__":
+    """
+    Test if everything works ok
+    """
+    dataset = DRDataset(
+        images_folder="../train/images_resized_650/",
+        path_to_csv="../train/trainLabels.csv",
+        transform=config.val_transforms,
+    )
+    loader = DataLoader(
+        dataset=dataset, batch_size=32, num_workers=2, shuffle=True, pin_memory=True
+    )
+
+    for x, label, file in tqdm(loader):
+        print(x.shape)
+        print(label.shape)
+        import sys
+        sys.exit()
\ No newline at end of file
diff --git a/ML/Kaggles/DiabeticRetinopathy/preprocess_images.py b/ML/Kaggles/DiabeticRetinopathy/preprocess_images.py
new file mode 100644
index 0000000..5ef6b71
--- /dev/null
+++ b/ML/Kaggles/DiabeticRetinopathy/preprocess_images.py
@@ -0,0 +1,82 @@
+"""
+Tries to remove unnecessary black borders around the images, and
+"trim" the images so they take up the entirety of the image.
+It's hacky & not very nice but it works :))
+"""
+
+import os
+import numpy as np
+from PIL import Image
+import warnings
+from multiprocessing import Pool
+from tqdm import tqdm
+import cv2
+
+
+def trim(im):
+    """
+    Converts the image to grayscale using cv2, then computes a binary matrix
+    of the pixels that are above a certain threshold. The first row where a
+    certain percentage of the pixels exceed the threshold becomes the first
+    clip point; same idea for the first column and for the max row and col.
+    """
+    percentage = 0.02
+
+    img = np.array(im)
+    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    im = img_gray > 0.1 * np.mean(img_gray[img_gray != 0])
+    row_sums = np.sum(im, axis=1)
+    col_sums = np.sum(im, axis=0)
+    rows = np.where(row_sums > img.shape[1] * percentage)[0]
+    cols = np.where(col_sums > img.shape[0] * percentage)[0]
+    min_row, min_col = np.min(rows), np.min(cols)
+    max_row, max_col = np.max(rows), np.max(cols)
+    im_crop = img[min_row : max_row + 1, min_col : max_col + 1]
+    return Image.fromarray(im_crop)
+
+
+def resize_maintain_aspect(image, desired_size):
+    """
+    Stole this from some stackoverflow post, but can't remember which;
+    this resizes and then adds padding to maintain the aspect ratio.
+    """
+    old_size = image.size  # old_size[0] is in (width, height) format
+    ratio = float(desired_size) / max(old_size)
+    new_size = tuple([int(x * ratio) for x in old_size])
+    im = image.resize(new_size, Image.ANTIALIAS)
+    new_im = Image.new("RGB", (desired_size, desired_size))
+    new_im.paste(im, ((desired_size - new_size[0]) // 2, (desired_size - new_size[1]) // 2))
+    return new_im
+
+
+def save_single(args):
+    img_file, input_path_folder, output_path_folder, output_size = args
+    image_original = Image.open(os.path.join(input_path_folder, img_file))
+    image = trim(image_original)
+    image = resize_maintain_aspect(image, desired_size=output_size[0])
+    image.save(os.path.join(output_path_folder, img_file))
+
+
+def fast_image_resize(input_path_folder, output_path_folder, output_size=None):
+    """
+    Uses multiprocessing to make it fast
+    """
+    if not output_size:
+        warnings.warn("Need to specify output_size! For example: output_size=(150, 150)")
+        exit()
+
+    if not os.path.exists(output_path_folder):
+        os.makedirs(output_path_folder)
+
+    jobs = [
+        (file, input_path_folder, output_path_folder, output_size)
+        for file in os.listdir(input_path_folder)
+    ]
+
+    with Pool() as p:
+        list(tqdm(p.imap_unordered(save_single, jobs), total=len(jobs)))
+
+
+if __name__ == "__main__":
+    fast_image_resize("../train/images/", "../train/images_resized_150/", output_size=(150, 150))
+    fast_image_resize("../test/images/", "../test/images_resized_150/", output_size=(150, 150))
\ No newline at end of file
diff --git a/ML/Kaggles/DiabeticRetinopathy/train.py b/ML/Kaggles/DiabeticRetinopathy/train.py
new file mode 100644
index 0000000..964afa5
--- /dev/null
+++ b/ML/Kaggles/DiabeticRetinopathy/train.py
@@ -0,0 +1,125 @@
+import torch
+from torch import nn, optim
+import os
+import config
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from sklearn.metrics import cohen_kappa_score
+from efficientnet_pytorch import EfficientNet
+from dataset import DRDataset
+from torchvision.utils import save_image
+from utils import (
+    load_checkpoint,
+    save_checkpoint,
+    check_accuracy,
+    make_prediction,
+    get_csv_for_blend,
+)
+
+
+def train_one_epoch(loader, model, optimizer, loss_fn, scaler, device):
+    losses = []
+    loop = tqdm(loader)
+    for batch_idx, (data, targets, _) in enumerate(loop):
+        # save examples and make sure they look ok with the data augmentation;
+        # a tip is to first set mean=[0,0,0], std=[1,1,1] so they look "normal"
+        #save_image(data, f"hi_{batch_idx}.png")
+
+        data = data.to(device=device)
+        targets = targets.to(device=device)
+
+        # forward
+        with torch.cuda.amp.autocast():
+            scores = model(data)
+            loss = loss_fn(scores, targets.unsqueeze(1).float())
+
+        losses.append(loss.item())
+
+        # backward
+        optimizer.zero_grad()
+        scaler.scale(loss).backward()
+        scaler.step(optimizer)
+        scaler.update()
+        loop.set_postfix(loss=loss.item())
+
+    print(f"Loss average over epoch: {sum(losses)/len(losses)}")
+
+
+def main():
+    train_ds = DRDataset(
+        images_folder="train/images_preprocessed_1000/",
+        path_to_csv="train/trainLabels.csv",
+        transform=config.val_transforms,
+    )
+    val_ds = DRDataset(
+        images_folder="train/images_preprocessed_1000/",
+        path_to_csv="train/valLabels.csv",
+        transform=config.val_transforms,
+    )
+    test_ds = DRDataset(
+        images_folder="test/images_preprocessed_1000",
+        path_to_csv="train/trainLabels.csv",
+        transform=config.val_transforms,
+        train=False,
+    )
+    test_loader = DataLoader(
+        test_ds, batch_size=config.BATCH_SIZE, num_workers=6, shuffle=False
+    )
+    train_loader = DataLoader(
+        train_ds,
+        batch_size=config.BATCH_SIZE,
+        num_workers=config.NUM_WORKERS,
+        pin_memory=config.PIN_MEMORY,
+        shuffle=False,
+    )
+    val_loader = DataLoader(
+        val_ds,
+        batch_size=config.BATCH_SIZE,
+        num_workers=2,
+        pin_memory=config.PIN_MEMORY,
+        shuffle=False,
+    )
+    loss_fn = nn.MSELoss()
+
+    model = EfficientNet.from_pretrained("efficientnet-b3")
+    model._fc = nn.Linear(1536, 1)
+    model = model.to(config.DEVICE)
+    optimizer = optim.Adam(model.parameters(), lr=config.LEARNING_RATE, weight_decay=config.WEIGHT_DECAY)
+    scaler = torch.cuda.amp.GradScaler()
+
+    if config.LOAD_MODEL and config.CHECKPOINT_FILE in os.listdir():
+        load_checkpoint(torch.load(config.CHECKPOINT_FILE), model, optimizer, config.LEARNING_RATE)
+
+    # Run this after training is done and you've achieved a good result on the
+    # validation set, then run the train_blend.py file to use the information
+    # about both eyes concatenated
+    get_csv_for_blend(val_loader, model, "../train/val_blend.csv")
+    get_csv_for_blend(train_loader, model, "../train/train_blend.csv")
+    get_csv_for_blend(test_loader, model, "../train/test_blend.csv")
+    make_prediction(model, test_loader, "submission_.csv")
+    import sys
+    sys.exit()
+    #make_prediction(model, test_loader)
+
+    for epoch in range(config.NUM_EPOCHS):
+        train_one_epoch(train_loader, model, optimizer, loss_fn, scaler, config.DEVICE)
+
+        # get on validation
+        preds, labels = check_accuracy(val_loader, model, config.DEVICE)
+        print(f"QuadraticWeightedKappa (Validation): {cohen_kappa_score(labels, preds, weights='quadratic')}")
+
+        # get on train
+        #preds, labels = check_accuracy(train_loader, model, config.DEVICE)
+        #print(f"QuadraticWeightedKappa (Training): {cohen_kappa_score(labels, preds, weights='quadratic')}")
+
+        if config.SAVE_MODEL:
+            checkpoint = {
+                "state_dict": model.state_dict(),
+                "optimizer": optimizer.state_dict(),
+            }
+            save_checkpoint(checkpoint, filename=f"b3_{epoch}.pth.tar")
+
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ML/Kaggles/DiabeticRetinopathy/train_blend.py b/ML/Kaggles/DiabeticRetinopathy/train_blend.py
new file mode 100644
index 0000000..8b2eb02
--- /dev/null
+++ b/ML/Kaggles/DiabeticRetinopathy/train_blend.py
@@ -0,0 +1,126 @@
+import torch
+from tqdm import tqdm
+import numpy as np
+from torch import nn
+from torch import optim
+from torch.utils.data import DataLoader, Dataset
+from utils import save_checkpoint, load_checkpoint, check_accuracy
+from sklearn.metrics import cohen_kappa_score
+import config
+import os
+import pandas as pd
+
+
+def make_prediction(model, loader, file):
+    preds = []
+    filenames = []
+    model.eval()
+
+    for x, y, files in tqdm(loader):
+        x = x.to(config.DEVICE)
+        with torch.no_grad():
+            predictions = model(x)
+            # Convert MSE floats to integer predictions
+            predictions[predictions < 0.5] = 0
+            predictions[(predictions >= 0.5) & (predictions < 1.5)] = 1
+            predictions[(predictions >= 1.5) & (predictions < 2.5)] = 2
+            predictions[(predictions >= 2.5) & (predictions < 3.5)] = 3
+            predictions[(predictions >= 3.5) & (predictions < 1000000000000)] = 4
+            predictions = predictions.long().view(-1)
+            y = y.view(-1)
+
+            preds.append(predictions.cpu().numpy())
+            filenames += map(list, zip(files[0], files[1]))
+
+    filenames = [item for sublist in filenames for item in sublist]
+    df = pd.DataFrame({"image": filenames, "level": np.concatenate(preds, axis=0)})
+    df.to_csv(file, index=False)
+    model.train()
+    print("Done with predictions")
+
+
+class MyDataset(Dataset):
+    def __init__(self, csv_file):
+        self.csv = pd.read_csv(csv_file)
+
+    def __len__(self):
+        return self.csv.shape[0]
+
+    def __getitem__(self, index):
+        example = self.csv.iloc[index, :]
+        features = example.iloc[: example.shape[0] - 4].to_numpy().astype(np.float32)
+        labels = example.iloc[-4:-2].to_numpy().astype(np.int64)
+        filenames = example.iloc[-2:].values.tolist()
+        return features, labels, filenames
+
+
+class MyModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.model = nn.Sequential(
+            nn.BatchNorm1d((1536 + 1) * 2),
+            nn.Linear((1536 + 1) * 2, 500),
+            nn.BatchNorm1d(500),
+            nn.ReLU(),
+            nn.Dropout(0.2),
+            nn.Linear(500, 100),
+            nn.BatchNorm1d(100),
+            nn.ReLU(),
+            nn.Dropout(0.2),
+            nn.Linear(100, 2),
+        )
+
+    def forward(self, x):
+        return self.model(x)
+
+
+if __name__ == "__main__":
+    model = MyModel().to(config.DEVICE)
+    ds = MyDataset(csv_file="train/train_blend.csv")
+    loader = DataLoader(ds, batch_size=256, num_workers=3, pin_memory=True, shuffle=True)
+    ds_val = MyDataset(csv_file="train/val_blend.csv")
+    loader_val = DataLoader(
+        ds_val, batch_size=256, num_workers=3, pin_memory=True, shuffle=True
+    )
+    ds_test = MyDataset(csv_file="train/test_blend.csv")
+    loader_test = DataLoader(
+        ds_test, batch_size=256, num_workers=2, pin_memory=True, shuffle=False
+    )
+    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
+    loss_fn = nn.MSELoss()
+
+    if config.LOAD_MODEL and "linear.pth.tar" in os.listdir():
+        load_checkpoint(torch.load("linear.pth.tar"), model, optimizer, lr=1e-4)
+    model.train()
+
+    for _ in range(5):
+        losses = []
+        for x, y, files in tqdm(loader_val):
+            x = x.to(config.DEVICE).float()
+            y = y.to(config.DEVICE).view(-1).float()
+
+            # forward
+            scores = model(x).view(-1)
+            loss = loss_fn(scores, y)
+            losses.append(loss.item())
+
+            # backward
+            optimizer.zero_grad()
+            loss.backward()
+
+            # gradient descent or adam step
+            optimizer.step()
+
+        print(f"Loss: {sum(losses)/len(losses)}")
+
+        if config.SAVE_MODEL:
+            checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
+            save_checkpoint(checkpoint, filename="linear.pth.tar")
+
+    preds, labels = check_accuracy(loader_val, model)
+    print(cohen_kappa_score(labels, preds, weights="quadratic"))
+
+    preds, labels = check_accuracy(loader, model)
+    print(cohen_kappa_score(labels, preds, weights="quadratic"))
+
+    make_prediction(model, loader_test, "test_preds.csv")
diff --git a/ML/Kaggles/DiabeticRetinopathy/utils.py b/ML/Kaggles/DiabeticRetinopathy/utils.py
new file mode 100644
index 0000000..b4e8fb1
--- /dev/null
+++ b/ML/Kaggles/DiabeticRetinopathy/utils.py
@@ -0,0 +1,128 @@
+import torch
+import pandas as pd
+import numpy as np
+import config
+from tqdm import tqdm
+import warnings
+import torch.nn.functional as F
+
+
+def make_prediction(model, loader, output_csv="submission.csv"):
+    preds = []
+    filenames = []
+    model.eval()
+
+    for x, y, files in tqdm(loader):
+        x = x.to(config.DEVICE)
+        with torch.no_grad():
+            predictions = model(x)
+            # Convert MSE floats to integer predictions
+            predictions[predictions < 0.5] = 0
+            predictions[(predictions >= 0.5) & (predictions < 1.5)] = 1
+            predictions[(predictions >= 1.5) & (predictions < 2.5)] = 2
+            predictions[(predictions >= 2.5) & (predictions < 3.5)] = 3
+            predictions[(predictions >= 3.5) & (predictions < 10000000)] = 4
+            predictions = predictions.long().squeeze(1)
+            preds.append(predictions.cpu().numpy())
+            filenames += files
+
+    df = pd.DataFrame({"image": filenames, "level": np.concatenate(preds, axis=0)})
+    df.to_csv(output_csv, index=False)
+    model.train()
+    print("Done with predictions")
+
+
+def check_accuracy(loader, model, device="cuda"):
+    model.eval()
+    all_preds, all_labels = [], []
+    num_correct = 0
+    num_samples = 0
+
+    for x, y, filename in tqdm(loader):
+        x = x.to(device=device)
+        y = y.to(device=device)
+
+        with torch.no_grad():
+            predictions = model(x)
+
+        # Convert MSE floats to integer predictions
+        predictions[predictions < 0.5] = 0
+        predictions[(predictions >= 0.5) & (predictions < 1.5)] = 1
+        predictions[(predictions >= 1.5) & (predictions < 2.5)] = 2
+        predictions[(predictions >= 2.5) & (predictions < 3.5)] = 3
+        predictions[(predictions >= 3.5) & (predictions < 100)] = 4
+        predictions = predictions.long().view(-1)
+        y = y.view(-1)
+
+        num_correct += (predictions == y).sum()
+        num_samples += predictions.shape[0]
+
+        # add to lists
+        all_preds.append(predictions.detach().cpu().numpy())
+        all_labels.append(y.detach().cpu().numpy())
+
+    print(
+        f"Got {num_correct} / {num_samples} with accuracy {float(num_correct) / float(num_samples) * 100:.2f}"
+    )
+    model.train()
+    return np.concatenate(all_preds, axis=0, dtype=np.int64), np.concatenate(
+        all_labels, axis=0, dtype=np.int64
+    )
+
+
+def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
+    print("=> Saving checkpoint")
+    torch.save(state, filename)
+
+
+def load_checkpoint(checkpoint, model, optimizer, lr):
+    print("=> Loading checkpoint")
+    model.load_state_dict(checkpoint["state_dict"])
+    #optimizer.load_state_dict(checkpoint["optimizer"])
+
+    # If we don't do this then it will just use the learning rate of the old
+    # checkpoint, and that can lead to many hours of debugging :\
+    for param_group in optimizer.param_groups:
+        param_group["lr"] = lr
+
+
+def get_csv_for_blend(loader, model, output_csv_file):
+    warnings.warn(
+        "Important to have shuffle=False (and an even batch size) when running "
+        "get_csv_for_blend; also set val_transforms on the train_loader!"
+    )
+    model.eval()
+    filename_first = []
+    filename_second = []
+    labels_first = []
+    labels_second = []
+    all_features = []
+
+    for idx, (images, y, image_files) in enumerate(tqdm(loader)):
+        images = images.to(config.DEVICE)
+
+        with torch.no_grad():
+            features = F.adaptive_avg_pool2d(
+                model.extract_features(images), output_size=1
+            )
+            features_logits = features.reshape(features.shape[0] // 2, 2, features.shape[1])
+            preds = model(images).reshape(images.shape[0] // 2, 2, 1)
+            new_features = (
+                torch.cat([features_logits, preds], dim=2)
+                .view(preds.shape[0], -1)
+                .cpu()
+                .numpy()
+            )
+            all_features.append(new_features)
+            filename_first += image_files[::2]
+            filename_second += image_files[1::2]
+            labels_first.append(y[::2].cpu().numpy())
+            labels_second.append(y[1::2].cpu().numpy())
+
+    all_features = np.concatenate(all_features, axis=0)
+    df = pd.DataFrame(
+        data=all_features, columns=[f"f_{idx}" for idx in range(all_features.shape[1])]
+    )
+    df["label_first"] = np.concatenate(labels_first, axis=0)
+    df["label_second"] = np.concatenate(labels_second, axis=0)
+    df["file_first"] = filename_first
+    df["file_second"] = filename_second
+    df.to_csv(output_csv_file, index=False)
+    model.train()
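
Editorial aside (not part of the diff itself): the repeated 0.5 / 1.5 / 2.5 / 3.5 threshold blocks in utils.py and train_blend.py map the MSE-regression outputs onto the five retinopathy grades 0-4. A minimal equivalent sketch, assuming only PyTorch and using a hypothetical helper name (grade_from_regression) that does not appear in the diff; it matches the cutoffs above except at exact .5 values, where torch.round rounds half to even:

import torch

def grade_from_regression(predictions: torch.Tensor) -> torch.Tensor:
    # round to the nearest integer grade, then clamp into the valid label range [0, 4]
    return predictions.round().clamp(0, 4).long()

# e.g. grade_from_regression(torch.tensor([-0.2, 0.7, 2.4, 3.6])) -> tensor([0, 1, 2, 4])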