Switch from urllib to requests to improve reliability (#867)

* Switch from urllib to requests to improve reliability * Keep ruff linter-specific * update * update * update
2026-04-10 12:33:42 +00:00 · 2025-10-07 15:22:59 -05:00
parent 8552565bda
commit 7bd263144e
47 changed files with 592 additions and 436 deletions
--- a/ch06/01_main-chapter-code/ch06.ipynb
+++ b/ch06/01_main-chapter-code/ch06.ipynb
@@ -186,6 +186,56 @@
    }
   ],
   "source": [
+    "import requests\n",
+    "import zipfile\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "\n",
+    "url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
+    "zip_path = \"sms_spam_collection.zip\"\n",
+    "extracted_path = \"sms_spam_collection\"\n",
+    "data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
+    "\n",
+    "\n",
+    "def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):\n",
+    "    if data_file_path.exists():\n",
+    "        print(f\"{data_file_path} already exists. Skipping download and extraction.\")\n",
+    "        return\n",
+    "\n",
+    "    # Downloading the file\n",
+    "    response = requests.get(url, stream=True, timeout=60)\n",
+    "    response.raise_for_status()\n",
+    "    with open(zip_path, \"wb\") as out_file:\n",
+    "        for chunk in response.iter_content(chunk_size=8192):\n",
+    "            if chunk:\n",
+    "                out_file.write(chunk)\n",
+    "\n",
+    "    # Unzipping the file\n",
+    "    with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n",
+    "        zip_ref.extractall(extracted_path)\n",
+    "\n",
+    "    # Add .tsv file extension\n",
+    "    original_file_path = Path(extracted_path) / \"SMSSpamCollection\"\n",
+    "    os.rename(original_file_path, data_file_path)\n",
+    "    print(f\"File downloaded and saved as {data_file_path}\")\n",
+    "\n",
+    "\n",
+    "try:\n",
+    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
+    "except (requests.exceptions.RequestException, TimeoutError) as e:\n",
+    "    print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
+    "    url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
+    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
+    "\n",
+    "\n",
+    "\n",
+    "# The book originally used the code shown below.\n",
+    "# However, urllib uses older protocol settings that\n",
+    "# can cause problems for some readers using a VPN.\n",
+    "# The `requests` version above is more robust\n",
+    "# in that regard.\n",
+    "\n",
+    "\"\"\"\n",
    "import urllib.request\n",
    "import zipfile\n",
    "import os\n",
@@ -220,7 +270,8 @@
    "except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
    "    print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
    "    url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
-    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) "
+    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
+    "\"\"\""
   ]
  },
  {