Add backup URL for Spam Dataset (#543)

* Add backup URL for Spam Dataset

* import urllib

* fix URL
This commit is contained in:
Sebastian Raschka
2025-02-20 08:08:28 -06:00
committed by GitHub
parent 1039bf9b80
commit c39aa32ef5
5 changed files with 44 additions and 14 deletions

View File

@@ -50,7 +50,7 @@
"text": [
"matplotlib version: 3.10.0\n",
"numpy version: 2.0.2\n",
"tiktoken version: 0.8.0\n",
"tiktoken version: 0.9.0\n",
"torch version: 2.6.0\n",
"tensorflow version: 2.18.0\n",
"pandas version: 2.2.3\n"
@@ -167,7 +167,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "def7c09b-af9c-4216-90ce-5e67aed1065c",
"metadata": {
"colab": {
@@ -181,7 +181,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.\n"
"File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv\n"
]
}
],
@@ -215,7 +215,13 @@
" os.rename(original_file_path, data_file_path)\n",
" print(f\"File downloaded and saved as {data_file_path}\")\n",
"\n",
"download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)"
"try:\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"except urllib.error.HTTPError:\n",
" print(\"UCI Machine Learning Repository (https://archive.ics.uci.edu)\"\n",
" \" temporary unavailable. Using backup URL.\")\n",
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) "
]
},
{

View File

@@ -276,7 +276,16 @@ if __name__ == "__main__":
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode)
try:
download_and_unzip_spam_data(
url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode
)
except urllib.error.HTTPError:
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
download_and_unzip_spam_data(
backup_url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode
)
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
balanced_df = create_balanced_dataset(df)
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

View File

@@ -603,7 +603,11 @@ if __name__ == "__main__":
all_exist = all((base_path / file_name).exists() for file_name in file_names)
if not all_exist:
download_and_unzip(url, zip_path, extract_to, new_file_path)
try:
download_and_unzip(url, zip_path, extract_to, new_file_path)
except urllib.error.HTTPError:
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
download_and_unzip(backup_url, zip_path, extract_to, new_file_path)
create_dataset_csvs(new_file_path)
tokenizer = tiktoken.get_encoding("gpt2")

View File

@@ -410,7 +410,11 @@ if __name__ == "__main__":
all_exist = all((base_path / file_name).exists() for file_name in file_names)
if not all_exist:
download_and_unzip(url, zip_path, extract_to, new_file_path)
try:
download_and_unzip(url, zip_path, extract_to, new_file_path)
except urllib.error.HTTPError:
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
download_and_unzip(backup_url, zip_path, extract_to, new_file_path)
create_dataset_csvs(new_file_path)
if args.use_attention_mask.lower() == "true":