Switch from urllib to requests to improve reliability (#867)

* Switch from urllib to requests to improve reliability * Keep ruff linter-specific * update * update * update
2026-04-10 12:33:42 +00:00 · 2025-10-07 15:22:59 -05:00
parent 8552565bda
commit 7bd263144e
47 changed files with 592 additions and 436 deletions
--- a/ch02/01_main-chapter-code/ch02.ipynb
+++ b/ch02/01_main-chapter-code/ch02.ipynb
@@ -163,6 +163,30 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "import os\n",
+    "import requests\n",
+    "\n",
+    "if not os.path.exists(\"the-verdict.txt\"):\n",
+    "    url = (\n",
+    "        \"https://raw.githubusercontent.com/rasbt/\"\n",
+    "        \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
+    "        \"the-verdict.txt\"\n",
+    "    )\n",
+    "    file_path = \"the-verdict.txt\"\n",
+    "\n",
+    "    response = requests.get(url, timeout=30)\n",
+    "    response.raise_for_status()\n",
+    "    with open(file_path, \"wb\") as f:\n",
+    "        f.write(response.content)\n",
+    "\n",
+    "\n",
+    "# The book originally used the code below.\n",
+    "# However, urllib uses older protocol settings that\n",
+    "# can cause problems for some readers using a VPN.\n",
+    "# The `requests` version above is more robust\n",
+    "# in that regard.\n",
+    "\n",
+    "\"\"\"\n",
    "import os\n",
    "import urllib.request\n",
    "\n",
@@ -171,7 +195,8 @@
    "           \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
    "           \"the-verdict.txt\")\n",
    "    file_path = \"the-verdict.txt\"\n",
-    "    urllib.request.urlretrieve(url, file_path)"
+    "    urllib.request.urlretrieve(url, file_path)\n",
+    "\"\"\""
   ]
  },
  {
--- a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
+++ b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
@@ -823,7 +823,7 @@
   ],
   "source": [
    "import os\n",
-    "import urllib.request\n",
+    "import requests\n",
    "\n",
    "def download_file_if_absent(url, filename, search_dirs):\n",
    "    for directory in search_dirs:\n",
@@ -834,13 +834,19 @@
    "\n",
    "    target_path = os.path.join(search_dirs[0], filename)\n",
    "    try:\n",
-    "        with urllib.request.urlopen(url) as response, open(target_path, \"wb\") as out_file:\n",
-    "            out_file.write(response.read())\n",
+    "        response = requests.get(url, stream=True, timeout=60)\n",
+    "        response.raise_for_status()\n",
+    "        with open(target_path, \"wb\") as out_file:\n",
+    "            for chunk in response.iter_content(chunk_size=8192):\n",
+    "                if chunk:\n",
+    "                    out_file.write(chunk)\n",
    "        print(f\"Downloaded {filename} to {target_path}\")\n",
    "    except Exception as e:\n",
    "        print(f\"Failed to download {filename}. Error: {e}\")\n",
+    "\n",
    "    return target_path\n",
    "\n",
+    "\n",
    "verdict_path = download_file_if_absent(\n",
    "    url=(\n",
    "         \"https://raw.githubusercontent.com/rasbt/\"\n",