Switch from urllib to requests to improve reliability (#867)

* Switch from urllib to requests to improve reliability

* Keep ruff linter-specific

* update

* update

* update
This commit is contained in:
Sebastian Raschka
2025-10-07 15:22:59 -05:00
committed by GitHub
parent 8552565bda
commit 7bd263144e
47 changed files with 592 additions and 436 deletions

View File

@@ -190,7 +190,8 @@
}
],
"source": [
"import urllib\n",
"# import urllib\n",
"import requests\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"from previous_chapters import (\n",
@@ -215,13 +216,20 @@
"extracted_path = \"sms_spam_collection\"\n",
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
"\n",
"\n",
"try:\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
"except (requests.exceptions.RequestException, TimeoutError) as e:\n",
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"\n",
"# The book originally used\n",
"# except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
"# in the code above.\n",
"# However, some VPN users reported issues with `urllib`, so the code was updated\n",
"# to use `requests` instead\n",
"\n",
"df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
"balanced_df = create_balanced_dataset(df)\n",
"balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})\n",