Switch from urllib to requests to improve reliability (#867)

* Switch from urllib to requests to improve reliability
* Keep ruff linter-specific
* update
* update
* update
2026-04-10 12:33:42 +00:00 · 2025-10-07 15:22:59 -05:00
parent 8552565bda
commit 7bd263144e
47 changed files with 592 additions and 436 deletions
--- a/appendix-E/01_main-chapter-code/appendix-E.ipynb
+++ b/appendix-E/01_main-chapter-code/appendix-E.ipynb
@@ -190,7 +190,8 @@
    }
   ],
   "source": [
-    "import urllib\n",
+    "# import urllib\n",
+    "import requests\n",
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "from previous_chapters import (\n",
@@ -215,13 +216,20 @@
    "extracted_path = \"sms_spam_collection\"\n",
    "data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
    "\n",
+    "\n",
    "try:\n",
    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
-    "except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
+    "except (requests.exceptions.RequestException, TimeoutError) as e:\n",
    "    print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
    "    url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
    "\n",
+    "# The book originally used\n",
+    "# except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
+    "# in the code above.\n",
+    "# However, some VPN users reported issues with `urllib`, so the code was\n",
+    "# updated to use `requests` instead.\n",
+    "\n",
    "df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
    "balanced_df = create_balanced_dataset(df)\n",
    "balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})\n",
--- a/appendix-E/01_main-chapter-code/previous_chapters.py
+++ b/appendix-E/01_main-chapter-code/previous_chapters.py
@@ -9,12 +9,12 @@

 import os
 from pathlib import Path
-import urllib
 import zipfile

 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import requests
 import tiktoken
 import torch
 import torch.nn as nn
@@ -367,9 +367,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
        return

    # Downloading the file
-    with urllib.request.urlopen(url) as response:
-        with open(zip_path, "wb") as out_file:
-            out_file.write(response.read())
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()
+    with open(zip_path, "wb") as out_file:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                out_file.write(chunk)

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref: