mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
Readability and code quality improvements (#959)
* Consistent dataset naming * consistent section headers
This commit is contained in:
committed by
GitHub
parent
7b1f740f74
commit
be5e2a3331
@@ -86,7 +86,8 @@
|
||||
"id": "3a84cf35-b37f-4c15-8972-dfafc9fadc1c"
|
||||
},
|
||||
"source": [
|
||||
"## 6.1 Different categories of finetuning"
|
||||
" \n",
|
||||
"### 6.1 Different categories of finetuning"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -142,7 +143,8 @@
|
||||
"id": "8c7017a2-32aa-4002-a2f3-12aac293ccdf"
|
||||
},
|
||||
"source": [
|
||||
"## 6.2 Preparing the dataset"
|
||||
" \n",
|
||||
"### 6.2 Preparing the dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -699,7 +701,8 @@
|
||||
"id": "a8d7a0c5-1d5f-458a-b685-3f49520b0094",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 6.3 Creating data loaders"
|
||||
" \n",
|
||||
"### 6.3 Creating data loaders"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1019,7 +1022,8 @@
|
||||
"id": "d1c4f61a-5f5d-4b3b-97cf-151b617d1d6c"
|
||||
},
|
||||
"source": [
|
||||
"## 6.4 Initializing a model with pretrained weights"
|
||||
" \n",
|
||||
"### 6.4 Initializing a model with pretrained weights"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1219,7 +1223,8 @@
|
||||
"id": "4c9ae440-32f9-412f-96cf-fd52cc3e2522"
|
||||
},
|
||||
"source": [
|
||||
"## 6.5 Adding a classification head"
|
||||
" \n",
|
||||
"### 6.5 Adding a classification head"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1722,7 +1727,8 @@
|
||||
"id": "32aa4aef-e1e9-491b-9adf-5aa973e59b8c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 6.6 Calculating the classification loss and accuracy"
|
||||
" \n",
|
||||
"### 6.6 Calculating the classification loss and accuracy"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -2042,7 +2048,8 @@
|
||||
"id": "456ae0fd-6261-42b4-ab6a-d24289953083"
|
||||
},
|
||||
"source": [
|
||||
"## 6.7 Finetuning the model on supervised data"
|
||||
" \n",
|
||||
"### 6.7 Finetuning the model on supervised data"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -2372,7 +2379,8 @@
|
||||
"id": "a74d9ad7-3ec1-450e-8c9f-4fc46d3d5bb0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 6.8 Using the LLM as a spam classifier"
|
||||
" \n",
|
||||
"### 6.8 Using the LLM as a spam classifier"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -2564,6 +2572,7 @@
|
||||
"id": "5b70ac71-234f-4eeb-b33d-c62726d50cd4"
|
||||
},
|
||||
"source": [
|
||||
" \n",
|
||||
"## Summary and takeaways"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -130,20 +130,20 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path):
|
||||
print(f"File downloaded and saved as {new_file_path}")
|
||||
|
||||
|
||||
def random_split(df, train_frac, validation_frac):
|
||||
def random_split(df, train_frac, val_frac):
|
||||
# Shuffle the entire DataFrame
|
||||
df = df.sample(frac=1, random_state=123).reset_index(drop=True)
|
||||
|
||||
# Calculate split indices
|
||||
train_end = int(len(df) * train_frac)
|
||||
validation_end = train_end + int(len(df) * validation_frac)
|
||||
val_end = train_end + int(len(df) * val_frac)
|
||||
|
||||
# Split the DataFrame
|
||||
train_df = df[:train_end]
|
||||
validation_df = df[train_end:validation_end]
|
||||
test_df = df[validation_end:]
|
||||
val_df = df[train_end:val_end]
|
||||
test_df = df[val_end:]
|
||||
|
||||
return train_df, validation_df, test_df
|
||||
return train_df, val_df, test_df
|
||||
|
||||
|
||||
def create_dataset_csvs(new_file_path):
|
||||
@@ -157,9 +157,9 @@ def create_dataset_csvs(new_file_path):
|
||||
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})
|
||||
|
||||
# Sample and save csv files
|
||||
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
|
||||
train_df, val_df, test_df = random_split(balanced_df, 0.7, 0.1)
|
||||
train_df.to_csv("train.csv", index=None)
|
||||
validation_df.to_csv("validation.csv", index=None)
|
||||
val_df.to_csv("validation.csv", index=None)
|
||||
test_df.to_csv("test.csv", index=None)
|
||||
|
||||
|
||||
@@ -611,7 +611,7 @@ if __name__ == "__main__":
|
||||
base_path = Path(".")
|
||||
file_names = ["train.csv", "validation.csv", "test.csv"]
|
||||
all_exist = all((base_path / file_name).exists() for file_name in file_names)
|
||||
|
||||
|
||||
if not all_exist:
|
||||
try:
|
||||
download_and_unzip(url, zip_path, extract_to, new_file_path)
|
||||
|
||||
@@ -144,6 +144,7 @@
|
||||
"id": "fae87bc1-14ca-4f89-8e12-49f77b0ec00d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
" \n",
|
||||
"## Scikit-learn baseline"
|
||||
]
|
||||
},
|
||||
@@ -269,7 +270,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.16"
|
||||
"version": "3.13.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -79,20 +79,20 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path):
|
||||
print(f"File downloaded and saved as {new_file_path}")
|
||||
|
||||
|
||||
def random_split(df, train_frac, validation_frac):
|
||||
def random_split(df, train_frac, val_frac):
|
||||
# Shuffle the entire DataFrame
|
||||
df = df.sample(frac=1, random_state=123).reset_index(drop=True)
|
||||
|
||||
# Calculate split indices
|
||||
train_end = int(len(df) * train_frac)
|
||||
validation_end = train_end + int(len(df) * validation_frac)
|
||||
val_end = train_end + int(len(df) * val_frac)
|
||||
|
||||
# Split the DataFrame
|
||||
train_df = df[:train_end]
|
||||
validation_df = df[train_end:validation_end]
|
||||
test_df = df[validation_end:]
|
||||
val_df = df[train_end:val_end]
|
||||
test_df = df[val_end:]
|
||||
|
||||
return train_df, validation_df, test_df
|
||||
return train_df, val_df, test_df
|
||||
|
||||
|
||||
def create_dataset_csvs(new_file_path):
|
||||
@@ -106,9 +106,9 @@ def create_dataset_csvs(new_file_path):
|
||||
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})
|
||||
|
||||
# Sample and save csv files
|
||||
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
|
||||
train_df, val_df, test_df = random_split(balanced_df, 0.7, 0.1)
|
||||
train_df.to_csv("train.csv", index=None)
|
||||
validation_df.to_csv("validation.csv", index=None)
|
||||
val_df.to_csv("validation.csv", index=None)
|
||||
test_df.to_csv("test.csv", index=None)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user