Readability and code quality improvements (#959)

* Consistent dataset naming

* Consistent section headers
This commit is contained in:
Sebastian Raschka
2026-02-17 19:44:56 -05:00
committed by GitHub
parent 7b1f740f74
commit be5e2a3331
48 changed files with 419 additions and 297 deletions

View File

@@ -130,20 +130,20 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path):
print(f"File downloaded and saved as {new_file_path}")
def random_split(df, train_frac, validation_frac):
def random_split(df, train_frac, val_frac):
# Shuffle the entire DataFrame
df = df.sample(frac=1, random_state=123).reset_index(drop=True)
# Calculate split indices
train_end = int(len(df) * train_frac)
validation_end = train_end + int(len(df) * validation_frac)
val_end = train_end + int(len(df) * val_frac)
# Split the DataFrame
train_df = df[:train_end]
validation_df = df[train_end:validation_end]
test_df = df[validation_end:]
val_df = df[train_end:val_end]
test_df = df[val_end:]
return train_df, validation_df, test_df
return train_df, val_df, test_df
def create_dataset_csvs(new_file_path):
@@ -157,9 +157,9 @@ def create_dataset_csvs(new_file_path):
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})
# Sample and save csv files
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
train_df, val_df, test_df = random_split(balanced_df, 0.7, 0.1)
train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
val_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)
@@ -611,7 +611,7 @@ if __name__ == "__main__":
base_path = Path(".")
file_names = ["train.csv", "validation.csv", "test.csv"]
all_exist = all((base_path / file_name).exists() for file_name in file_names)
if not all_exist:
try:
download_and_unzip(url, zip_path, extract_to, new_file_path)