"""
Creates a PyTorch dataset to load the Pascal VOC dataset
"""

import os

import pandas as pd
import torch
from PIL import Image


class VOCDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(image, label_matrix)`` pairs for grid-based detection.

    Each row of the CSV file names an image file (column 0) and a label file
    (column 1).  A label file holds one box per line as five whitespace
    separated numbers: ``class x y width height``.  The coordinates are used
    as fractions of the image size (the grid cell is computed as
    ``int(S * x)`` / ``int(S * y)``).

    Args:
        csv_file: Path to the CSV file listing image / label file names.
        img_dir: Directory containing the image files.
        label_dir: Directory containing the label files.
        S: Number of grid cells along each image side.
        B: Number of box slots per cell (only sizes the target's last axis).
        C: Number of classes.
        transform: Optional callable invoked as ``transform(image, boxes)``
            that must return the transformed ``(image, boxes)`` pair.
    """

    def __init__(
        self,
        csv_file,
        img_dir,
        label_dir,
        S=7,
        B=2,
        C=20,
        transform=None,
    ):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.S = S
        self.B = B
        self.C = C

    def __len__(self):
        """Return the number of rows in the annotation CSV."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load sample ``index``.

        Returns:
            A tuple ``(image, label_matrix)``.  ``image`` is whatever
            ``PIL.Image.open`` returns (or what ``transform`` turns it into);
            ``label_matrix`` is a tensor of shape ``(S, S, C + 5 * B)``.
        """
        label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
        boxes = self._read_boxes(label_path)

        img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
        image = Image.open(img_path)
        boxes = torch.tensor(boxes)

        if self.transform:
            image, boxes = self.transform(image, boxes)

        return image, self._encode_boxes(boxes)

    @staticmethod
    def _parse_number(token):
        """Parse ``token`` as an ``int`` when it is integral, else a ``float``.

        Going through ``float`` first means integral values written with a
        decimal point (e.g. ``"1.0"``) are accepted instead of raising.
        """
        value = float(token)
        return int(value) if value.is_integer() else value

    def _read_boxes(self, label_path):
        """Read ``[class, x, y, width, height]`` rows from a label file.

        Blank lines are skipped.  A non-blank line that does not contain
        exactly five fields raises ``ValueError`` (from the unpacking).
        """
        boxes = []
        with open(label_path) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate empty / trailing blank lines.
                    continue
                class_label, x, y, width, height = (
                    self._parse_number(field) for field in fields
                )
                boxes.append([class_label, x, y, width, height])
        return boxes

    def _encode_boxes(self, boxes):
        """Convert boxes into the ``(S, S, C + 5 * B)`` target tensor.

        Layout of the last axis for a cell that contains an object:
        ``[0, C)`` one-hot class, ``C`` objectness flag, ``[C + 1, C + 5)``
        the box as ``(x_cell, y_cell, width_cell, height_cell)``.  Remaining
        entries stay zero.
        """
        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))
        last_cell = self.S - 1

        for box in boxes:
            class_label, x, y, width, height = box.tolist()
            class_label = int(class_label)

            # i, j are the cell row and cell column.  Clamp so that a centre
            # lying exactly on the right / bottom edge (x or y == 1.0) lands
            # in the last cell instead of indexing out of range.
            i = min(int(self.S * y), last_cell)
            j = min(int(self.S * x), last_cell)
            x_cell, y_cell = self.S * x - j, self.S * y - i

            # Width / height relative to one cell: with an image of width W,
            # (width * W) / (W / S) simplifies to width * S.
            width_cell, height_cell = width * self.S, height * self.S

            # Only the first object that falls into a cell is kept, i.e. the
            # target is restricted to ONE object per cell.
            if label_matrix[i, j, self.C] == 0:
                # Set that there exists an object
                label_matrix[i, j, self.C] = 1

                # Box coordinates
                box_coordinates = torch.tensor(
                    [x_cell, y_cell, width_cell, height_cell]
                )
                label_matrix[i, j, self.C + 1 : self.C + 5] = box_coordinates

                # Set one hot encoding for class_label
                label_matrix[i, j, class_label] = 1

        return label_matrix