add imbalanced classes video code and kaggle cat vs dog
12
.github/FUNDING.yml
vendored
@@ -1,12 +0,0 @@
|
|||||||
# These are supported funding model platforms
|
|
||||||
|
|
||||||
github: aladdinpersson # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
|
|
||||||
patreon: #aladdinpersson # Replace with a single Patreon username
|
|
||||||
open_collective: # Replace with a single Open Collective username
|
|
||||||
ko_fi: # Replace with a single Ko-fi username
|
|
||||||
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
|
|
||||||
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
|
|
||||||
liberapay: # Replace with a single Liberapay username
|
|
||||||
issuehunt: # Replace with a single IssueHunt username
|
|
||||||
otechie: # Replace with a single Otechie username
|
|
||||||
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
|
|
||||||
@@ -0,0 +1,119 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "51c78b68",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import sklearn\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"from sklearn.linear_model import LogisticRegression\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"from sklearn.metrics import log_loss"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "4421a043",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Training data shape: (25000, 2560), labels shape: (25000,)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"LogisticRegression(max_iter=2000)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"X = np.load(f'data_features/X_train_b7.npy')\n",
|
||||||
|
"y = np.load(f'data_features/y_train_b7.npy')\n",
|
||||||
|
"\n",
|
||||||
|
"# Split data and train classifier\n",
|
||||||
|
"print(f\"Training data shape: {X.shape}, labels shape: {y.shape}\")\n",
|
||||||
|
"X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.001, random_state=1337)\n",
|
||||||
|
"clf = LogisticRegression(max_iter=2000)\n",
|
||||||
|
"clf.fit(X_train, y_train)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "d5cfc5b0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"On validation set:\n",
|
||||||
|
"Accuracy: 1.0\n",
|
||||||
|
"LOG LOSS: 7.980845755748817e-05 \n",
|
||||||
|
"%--------------------------------------------------%\n",
|
||||||
|
"Getting predictions for test set\n",
|
||||||
|
"Done getting predictions!\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Check on validation\n",
|
||||||
|
"val_preds= clf.predict_proba(X_val)[:,1]\n",
|
||||||
|
"print(f\"On validation set:\")\n",
|
||||||
|
"print(f\"Accuracy: {clf.score(X_val, y_val)}\")\n",
|
||||||
|
"print(f\"LOG LOSS: {log_loss(y_val, val_preds)} \")\n",
|
||||||
|
"print(\"%--------------------------------------------------%\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Get predictions on test set\n",
|
||||||
|
"print(\"Getting predictions for test set\")\n",
|
||||||
|
"X_test = np.load(f'data_features/X_test_b7.npy')\n",
|
||||||
|
"X_test_preds = clf.predict_proba(X_test)[:,1]\n",
|
||||||
|
"df = pd.DataFrame({'id': np.arange(1, 12501), 'label': np.clip(X_test_preds, 0.005, 0.995)})\n",
|
||||||
|
"df.to_csv(f\"submissions/mysubmission.csv\", index=False)\n",
|
||||||
|
"print(\"Done getting predictions!\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a9cce7af",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
1
ML/Kaggles/Dog vs Cat Competition/competition.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition
|
||||||
26
ML/Kaggles/Dog vs Cat Competition/config.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
import torch
|
||||||
|
import albumentations as A
|
||||||
|
from albumentations.pytorch import ToTensorV2
|
||||||
|
|
||||||
|
# Hardware / runtime configuration.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # prefer GPU when present
NUM_WORKERS = 4    # DataLoader worker processes
BATCH_SIZE = 20    # samples per training batch
PIN_MEMORY = True  # page-locked host memory for faster host->GPU copies

# Checkpointing.
LOAD_MODEL = True               # resume from CHECKPOINT_FILE if it exists
SAVE_MODEL = True               # write a checkpoint after each epoch
CHECKPOINT_FILE = "b7.pth.tar"  # EfficientNet-B7 checkpoint path

# Optimization hyper-parameters.
WEIGHT_DECAY = 1e-4
LEARNING_RATE = 1e-4
NUM_EPOCHS = 1
|
||||||
|
|
||||||
|
# Standard ImageNet channel statistics (inputs are 0-255 uint8 images).
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]

# Deterministic pipeline: resize to the B7 input size, normalize with the
# ImageNet statistics, then convert HWC uint8 -> CHW float tensor.
basic_transform = A.Compose(
    [
        A.Resize(height=448, width=448),
        A.Normalize(
            mean=_IMAGENET_MEAN,
            std=_IMAGENET_STD,
            max_pixel_value=255.0,
        ),
        ToTensorV2(),
    ]
)
|
||||||
32
ML/Kaggles/Dog vs Cat Competition/dataset.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
import numpy as np
|
||||||
|
from torch.utils.data import Dataset
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
|
class CatDog(Dataset):
    """Cat/Dog image dataset over a flat directory of image files.

    Filenames are ordered by the first integer they contain (so ``img.10``
    follows ``img.9``). The label is inferred from the filename: 1 for dog,
    0 for cat, -1 when neither substring is present (e.g. test images).
    """

    def __init__(self, root, transform=None):
        # Numeric sort on the first digit group in each filename.
        def numeric_key(name):
            return int(re.findall(r"\d+", name)[0])

        self.images = sorted(os.listdir(root), key=numeric_key)
        self.root = root
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        filename = self.images[index]
        img = np.array(Image.open(os.path.join(self.root, filename)))

        # Albumentations-style callable: dict in, dict out.
        if self.transform is not None:
            img = self.transform(image=img)["image"]

        if "dog" in filename:
            label = 1
        elif "cat" in filename:
            label = 0
        else:
            label = -1  # unlabeled (test-set) image

        return img, label
||||||
12501
ML/Kaggles/Dog vs Cat Competition/submissions/mysubmission.csv
Normal file
93
ML/Kaggles/Dog vs Cat Competition/train.py
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
# Imports
|
||||||
|
import os
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
import numpy as np
|
||||||
|
import config
|
||||||
|
from torch import nn, optim
|
||||||
|
from torch.utils.data import DataLoader
|
||||||
|
from tqdm import tqdm
|
||||||
|
from dataset import CatDog
|
||||||
|
from efficientnet_pytorch import EfficientNet
|
||||||
|
from utils import check_accuracy, load_checkpoint, save_checkpoint
|
||||||
|
|
||||||
|
|
||||||
|
def save_feature_vectors(model, loader, output_size=(1, 1), file="trainb7"):
    """Extract pooled backbone features for every batch in *loader* and save
    them (with labels) to ``data_features/X_<file>.npy`` / ``y_<file>.npy``.

    The model is put in eval mode for extraction and restored to train mode
    before returning. Assumes *model* exposes ``extract_features`` (EfficientNet).
    """
    model.eval()
    feature_batches = []
    label_batches = []

    for batch, targets in tqdm(loader):
        batch = batch.to(config.DEVICE)

        with torch.no_grad():
            # Backbone features -> global average pool -> flat (N, C) vectors.
            feats = model.extract_features(batch)
            feats = F.adaptive_avg_pool2d(feats, output_size=output_size)
            feature_batches.append(
                feats.reshape(batch.shape[0], -1).detach().cpu().numpy()
            )
        label_batches.append(targets.numpy())

    np.save(f"data_features/X_{file}.npy", np.concatenate(feature_batches, axis=0))
    np.save(f"data_features/y_{file}.npy", np.concatenate(label_batches, axis=0))
    model.train()
||||||
|
|
||||||
|
|
||||||
|
def train_one_epoch(loader, model, loss_fn, optimizer, scaler):
    """Run one optimization pass over *loader* using mixed precision.

    Forward and loss run under autocast; backward/step go through *scaler*
    so fp16 gradients are unscaled correctly. The progress bar shows the
    current batch loss.
    """
    progress = tqdm(loader)

    for inputs, labels in progress:
        inputs = inputs.to(config.DEVICE)
        # BCEWithLogitsLoss expects float targets shaped like the logits (N, 1).
        labels = labels.to(config.DEVICE).unsqueeze(1).float()

        # Mixed-precision forward pass.
        with torch.cuda.amp.autocast():
            logits = model(inputs)
            loss = loss_fn(logits, labels)

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        progress.set_postfix(loss=loss.item())
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Fine-tune EfficientNet-B7 on cat/dog images, then export feature vectors."""
    # Pretrained backbone with a single-logit head (dog=1, cat=0).
    model = EfficientNet.from_pretrained("efficientnet-b7")
    model._fc = nn.Linear(2560, 1)
    model = model.to(config.DEVICE)

    train_dataset = CatDog(root="data/train/", transform=config.basic_transform)
    test_dataset = CatDog(root="data/test/", transform=config.basic_transform)
    train_loader = DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=config.BATCH_SIZE,
        num_workers=config.NUM_WORKERS,
        pin_memory=True,
    )
    test_loader = DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.BATCH_SIZE,
        num_workers=config.NUM_WORKERS,
    )

    scaler = torch.cuda.amp.GradScaler()
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(
        model.parameters(), lr=config.LEARNING_RATE, weight_decay=config.WEIGHT_DECAY
    )

    # Resume from an existing checkpoint when configured to do so.
    if config.LOAD_MODEL and config.CHECKPOINT_FILE in os.listdir():
        load_checkpoint(torch.load(config.CHECKPOINT_FILE), model)

    for epoch in range(config.NUM_EPOCHS):
        train_one_epoch(train_loader, model, loss_fn, optimizer, scaler)
        check_accuracy(train_loader, model, loss_fn)

        if config.SAVE_MODEL:
            checkpoint = {
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            save_checkpoint(checkpoint, filename=config.CHECKPOINT_FILE)

    # Cache pooled features so a cheap linear classifier can be trained offline
    # (see the companion notebook that fits LogisticRegression on these files).
    save_feature_vectors(model, train_loader, output_size=(1, 1), file="train_b7")
    save_feature_vectors(model, test_loader, output_size=(1, 1), file="test_b7")


if __name__ == "__main__":
    main()
|
||||||
192
ML/Kaggles/Dog vs Cat Competition/utils.py
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
import torch
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import albumentations as A
|
||||||
|
from albumentations.pytorch import ToTensorV2
|
||||||
|
import config
|
||||||
|
from tqdm import tqdm
|
||||||
|
from dataset import CatDog
|
||||||
|
from torch.utils.data import DataLoader
|
||||||
|
from sklearn.metrics import log_loss
|
||||||
|
|
||||||
|
|
||||||
|
def check_accuracy(
    loader, model, loss_fn, input_shape=None, toggle_eval=True, print_accuracy=True
):
    """Compute binary accuracy of *model* over *loader*.

    Predictions are ``sigmoid(logits) > 0.5``. When *print_accuracy* is set,
    also prints the accuracy and the sklearn log-loss of the clipped
    probabilities. Returns the accuracy (0-dim tensor). *loss_fn* is accepted
    for signature compatibility but unused.
    """
    if toggle_eval:
        model.eval()

    device = next(model.parameters()).device
    correct = 0
    total = 0
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device=device)
            targets = targets.to(device=device)
            if input_shape:
                # Optionally reshape flat inputs, e.g. for an MLP head.
                inputs = inputs.reshape(inputs.shape[0], *input_shape)

            probs = torch.sigmoid(model(inputs))
            predicted = probs > 0.5
            # Clip probabilities away from 0/1 so the log-loss stays finite.
            prob_batches.append(torch.clip(probs, 0.005, 0.995).cpu().numpy())
            target_batches.append(targets.cpu().numpy())
            correct += (predicted.squeeze(1) == targets).sum()
            total += predicted.size(0)

    accuracy = correct / total

    if toggle_eval:
        model.train()

    if print_accuracy:
        print(f"Accuracy: {accuracy * 100:.2f}%")
        print(
            log_loss(
                np.concatenate(target_batches, axis=0),
                np.concatenate(prob_batches, axis=0),
            )
        )

    return accuracy
||||||
|
|
||||||
|
|
||||||
|
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Serialize *state* (e.g. model/optimizer state dicts) to *filename*."""
    print("=> Saving checkpoint")
    torch.save(state, filename)
||||||
|
|
||||||
|
|
||||||
|
def load_checkpoint(checkpoint, model):
    """Restore *model* weights from a dict produced by ``save_checkpoint``."""
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
||||||
|
|
||||||
|
|
||||||
|
def _tta_pipeline(*augs):
    """Resize -> optional augmentation(s) -> ImageNet normalize -> CHW tensor."""
    return A.Compose(
        [
            A.Resize(height=240, width=240),
            *augs,
            A.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
                max_pixel_value=255.0,
            ),
            ToTensorV2(),
        ]
    )


def create_submission(model, model_name, files_dir):
    """Write one test-time-augmentation submission CSV per transform.

    For each augmentation pipeline, runs *model* over the images in
    *files_dir* and writes clipped sigmoid probabilities to
    ``predictions_test/submission_<model_name>_<transform>.csv``.

    Args:
        model: binary classifier producing one logit per image.
        model_name: tag embedded in the output CSV filenames.
        files_dir: directory of images compatible with ``CatDog``.
    """
    my_transforms = {
        "base": _tta_pipeline(),
        "horizontal_flip": _tta_pipeline(A.HorizontalFlip(p=1.0)),
        "vertical_flip": _tta_pipeline(A.VerticalFlip(p=1.0)),
        "coloring": _tta_pipeline(A.ColorJitter(p=1.0)),
        "rotate": _tta_pipeline(A.Rotate(p=1.0, limit=45)),
        # NOTE(review): A.IAAAffine was removed in albumentations >= 1.0;
        # switch to A.Affine when upgrading.
        "shear": _tta_pipeline(A.IAAAffine(p=1.0)),
    }

    for t in my_transforms:
        predictions = []
        # BUG FIX: the original referenced an undefined ``MyDataset`` and
        # unpacked three values per batch; ``CatDog`` (imported above) yields
        # (image, label) pairs, with label -1 for unlabeled test images.
        test_dataset = CatDog(root=files_dir, transform=my_transforms[t])
        test_loader = DataLoader(
            test_dataset, batch_size=32, num_workers=4, shuffle=False, pin_memory=True
        )
        model.eval()

        for x, _ in tqdm(test_loader):
            x = x.to(config.DEVICE)
            with torch.no_grad():
                # Clip away from 0/1 to bound the competition log-loss.
                outputs = (
                    torch.clip(torch.sigmoid(model(x)), 0.005, 0.995)
                    .squeeze(1)
                    .cpu()
                    .numpy()
                )
            predictions.append(outputs)

        # Concatenating first makes the id range exact regardless of the
        # final partial batch (the original batch-count arithmetic assumed
        # all non-final batches were full).
        preds = np.concatenate(predictions, axis=0)
        df = pd.DataFrame({"id": np.arange(1, len(preds) + 1), "label": preds})
        df.to_csv(f"predictions_test/submission_{model_name}_{t}.csv", index=False)

        model.train()
        print(f"Created submission file for model {model_name} and transform {t}")
||||||
|
|
||||||
|
|
||||||
|
def blending_ensemble_data():
    """Load validation-prediction CSVs and print them concatenated column-wise.

    Files whose name contains "label" are read as the ground-truth column
    (loaded but not used further yet); all other files are treated as one
    prediction column each for a blending ensemble.
    """
    root_dir = "predictions_validation/"
    pred_csvs = []

    for filename in os.listdir(root_dir):
        csv = pd.read_csv(root_dir + "/" + filename)
        if "label" in filename:
            label_csv = csv  # ground-truth labels (currently unused)
        else:
            pred_csvs.append(csv)

    # One column per prediction file, side by side.
    all_preds = pd.concat(pred_csvs, axis=1)
    print(all_preds)


if __name__ == "__main__":
    blending_ensemble_data()
||||||
|
After Width: | Height: | Size: 108 KiB |
|
After Width: | Height: | Size: 180 KiB |
|
After Width: | Height: | Size: 69 KiB |
|
After Width: | Height: | Size: 155 KiB |
|
After Width: | Height: | Size: 79 KiB |
|
After Width: | Height: | Size: 38 KiB |
|
After Width: | Height: | Size: 64 KiB |
|
After Width: | Height: | Size: 89 KiB |
|
After Width: | Height: | Size: 100 KiB |
|
After Width: | Height: | Size: 61 KiB |
|
After Width: | Height: | Size: 40 KiB |
|
After Width: | Height: | Size: 96 KiB |
|
After Width: | Height: | Size: 50 KiB |
|
After Width: | Height: | Size: 49 KiB |
|
After Width: | Height: | Size: 184 KiB |
|
After Width: | Height: | Size: 40 KiB |
|
After Width: | Height: | Size: 69 KiB |
|
After Width: | Height: | Size: 64 KiB |
|
After Width: | Height: | Size: 68 KiB |
|
After Width: | Height: | Size: 80 KiB |
|
After Width: | Height: | Size: 80 KiB |
|
After Width: | Height: | Size: 100 KiB |
|
After Width: | Height: | Size: 25 KiB |
|
After Width: | Height: | Size: 62 KiB |
|
After Width: | Height: | Size: 28 KiB |
|
After Width: | Height: | Size: 53 KiB |
|
After Width: | Height: | Size: 45 KiB |
|
After Width: | Height: | Size: 128 KiB |
|
After Width: | Height: | Size: 69 KiB |
|
After Width: | Height: | Size: 55 KiB |
|
After Width: | Height: | Size: 97 KiB |
|
After Width: | Height: | Size: 80 KiB |
|
After Width: | Height: | Size: 52 KiB |
|
After Width: | Height: | Size: 64 KiB |
|
After Width: | Height: | Size: 40 KiB |
|
After Width: | Height: | Size: 24 KiB |
|
After Width: | Height: | Size: 69 KiB |
|
After Width: | Height: | Size: 44 KiB |
|
After Width: | Height: | Size: 36 KiB |
|
After Width: | Height: | Size: 57 KiB |
|
After Width: | Height: | Size: 65 KiB |
|
After Width: | Height: | Size: 298 KiB |
|
After Width: | Height: | Size: 846 KiB |
|
After Width: | Height: | Size: 271 KiB |
|
After Width: | Height: | Size: 50 KiB |
|
After Width: | Height: | Size: 54 KiB |
|
After Width: | Height: | Size: 41 KiB |
|
After Width: | Height: | Size: 60 KiB |
|
After Width: | Height: | Size: 70 KiB |
|
After Width: | Height: | Size: 424 KiB |
|
After Width: | Height: | Size: 151 KiB |
54
ML/Pytorch/Basics/Imbalanced_classes/main.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
import torch
|
||||||
|
import torchvision.datasets as datasets
|
||||||
|
import os
|
||||||
|
from torch.utils.data import WeightedRandomSampler, DataLoader
|
||||||
|
import torchvision.transforms as transforms
|
||||||
|
import torch.nn as nn
|
||||||
|
|
||||||
|
# Methods for dealing with imbalanced datasets:
|
||||||
|
# 1. Oversampling
|
||||||
|
# 2. Class weighting
|
||||||
|
|
||||||
|
def get_loader(root_dir, batch_size):
    """Build a DataLoader that oversamples minority classes.

    Per-sample weights are the inverse frequency of the sample's class, so a
    ``WeightedRandomSampler`` (with replacement) draws the classes roughly
    uniformly even when the underlying folder is imbalanced.

    Args:
        root_dir: ImageFolder-style directory (one subfolder per class).
        batch_size: batch size for the returned DataLoader.

    Returns:
        DataLoader over *root_dir* with class-balanced replacement sampling.
    """
    my_transforms = transforms.Compose(
        [
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ]
    )

    dataset = datasets.ImageFolder(root=root_dir, transform=my_transforms)

    # BUG FIX: the original derived per-class counts from os.walk, whose
    # directory order is OS-dependent and need not match ImageFolder's sorted
    # class-to-index mapping, so weights could be assigned to the wrong class.
    # Counting dataset.targets is order-correct and also avoids decoding every
    # image just to read its label.
    counts = [0] * len(dataset.classes)
    for label in dataset.targets:
        counts[label] += 1
    class_weights = [1 / c for c in counts]

    sample_weights = [class_weights[label] for label in dataset.targets]

    sampler = WeightedRandomSampler(
        sample_weights, num_samples=len(sample_weights), replacement=True
    )

    loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
    return loader
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Demo: draw 10 epochs from the balanced loader and count labels seen.

    With inverse-frequency oversampling the two printed counts should come
    out roughly equal even though the dataset itself is imbalanced.
    """
    loader = get_loader(root_dir="dataset", batch_size=8)

    num_retrievers = 0
    num_elkhounds = 0
    for _ in range(10):
        for _, labels in loader:
            num_retrievers += torch.sum(labels == 0)
            num_elkhounds += torch.sum(labels == 1)

    print(num_retrievers)
    print(num_elkhounds)


if __name__ == "__main__":
    main()
|
||||||
|
|
||||||