Switch from urllib to requests to improve reliability (#867)

* Switch from urllib to requests to improve reliability * Keep ruff linter-specific * update * update * update
2026-04-10 12:33:42 +00:00 · 2025-10-07 15:22:59 -05:00
parent 8552565bda
commit 7bd263144e
47 changed files with 592 additions and 436 deletions
--- a/ch05/01_main-chapter-code/ch05.ipynb
+++ b/ch05/01_main-chapter-code/ch05.ipynb
@@ -793,19 +793,43 @@
   "outputs": [],
   "source": [
    "import os\n",
-    "import urllib.request\n",
+    "import requests\n",
    "\n",
    "file_path = \"the-verdict.txt\"\n",
    "url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
    "\n",
    "if not os.path.exists(file_path):\n",
-    "    with urllib.request.urlopen(url) as response:\n",
-    "        text_data = response.read().decode('utf-8')\n",
+    "    response = requests.get(url, timeout=30)\n",
+    "    response.raise_for_status()\n",
+    "    text_data = response.text\n",
    "    with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
    "        file.write(text_data)\n",
    "else:\n",
    "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
-    "        text_data = file.read()"
+    "        text_data = file.read()\n",
+    "\n",
+    "\n",
+    "# The book originally used the code below.\n",
+    "# However, urllib uses older protocol settings that\n",
+    "# can cause problems for some readers using a VPN.\n",
+    "# The `requests` version above is more robust\n",
+    "# in that regard.\n",
+    "\n",
+    "        \n",
+    "# import os\n",
+    "# import urllib.request\n",
+    "\n",
+    "# file_path = \"the-verdict.txt\"\n",
+    "# url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
+    "\n",
+    "# if not os.path.exists(file_path):\n",
+    "#     with urllib.request.urlopen(url) as response:\n",
+    "#         text_data = response.read().decode('utf-8')\n",
+    "#     with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
+    "#         file.write(text_data)\n",
+    "# else:\n",
+    "#     with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+    "#         text_data = file.read()"
   ]
  },
  {
--- a/ch05/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch05/01_main-chapter-code/exercise-solutions.ipynb
@@ -491,7 +491,7 @@
   "outputs": [],
   "source": [
    "import os\n",
-    "import urllib.request\n",
+    "import requests\n",
    "from previous_chapters import create_dataloader_v1\n",
    "\n",
    "\n",
@@ -499,6 +499,25 @@
    "url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
    "\n",
    "if not os.path.exists(file_path):\n",
+    "    response = requests.get(url, timeout=30)\n",
+    "    response.raise_for_status()\n",
+    "    text_data = response.text\n",
+    "    with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
+    "        file.write(text_data)\n",
+    "else:\n",
+    "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+    "        text_data = file.read()\n",
+    "\n",
+    "# The book originally used the code below.\n",
+    "# However, urllib uses older protocol settings that\n",
+    "# can cause problems for some readers using a VPN.\n",
+    "# The `requests` version above is more robust\n",
+    "# in that regard.\n",
+    "\n",
+    "\"\"\"\n",
+    "import urllib.request\n",
+    "\n",
+    "if not os.path.exists(file_path):\n",
    "    with urllib.request.urlopen(url) as response:\n",
    "        text_data = response.read().decode('utf-8')\n",
    "    with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
@@ -506,6 +525,7 @@
    "else:\n",
    "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
    "        text_data = file.read()\n",
+    "\"\"\"\n",
    "\n",
    "\n",
    "# Train/validation ratio\n",
--- a/ch05/01_main-chapter-code/gpt_download.py
+++ b/ch05/01_main-chapter-code/gpt_download.py
@@ -5,9 +5,8 @@


 import os
-import urllib.request

-# import requests
+import requests
 import json
 import numpy as np
 import tensorflow as tf
@@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir):

 def download_file(url, destination, backup_url=None):
    def _attempt_download(download_url):
-        with urllib.request.urlopen(download_url) as response:
-            # Get the total file size from headers, defaulting to 0 if not present
-            file_size = int(response.headers.get("Content-Length", 0))
+        response = requests.get(download_url, stream=True, timeout=60)
+        response.raise_for_status()

-            # Check if file exists and has the same size
-            if os.path.exists(destination):
-                file_size_local = os.path.getsize(destination)
-                if file_size == file_size_local:
-                    print(f"File already exists and is up-to-date: {destination}")
-                    return True  # Indicate success without re-downloading
+        file_size = int(response.headers.get("Content-Length", 0))

-            block_size = 1024  # 1 Kilobyte
+        # Check if file exists and has same size
+        if os.path.exists(destination):
+            file_size_local = os.path.getsize(destination)
+            if file_size and file_size == file_size_local:
+                print(f"File already exists and is up-to-date: {destination}")
+                return True

-            # Initialize the progress bar with total file size
-            progress_bar_description = os.path.basename(download_url)
-            with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
-                with open(destination, "wb") as file:
-                    while True:
-                        chunk = response.read(block_size)
-                        if not chunk:
-                            break
+        block_size = 1024  # 1 KB
+        desc = os.path.basename(download_url)
+        with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
+            with open(destination, "wb") as file:
+                for chunk in response.iter_content(chunk_size=block_size):
+                    if chunk:
                        file.write(chunk)
                        progress_bar.update(len(chunk))
-            return True
+        return True

    try:
        if _attempt_download(url):
            return
-    except (urllib.error.HTTPError, urllib.error.URLError):
+    except requests.exceptions.RequestException:
        if backup_url is not None:
            print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
            try:
                if _attempt_download(backup_url):
                    return
-            except urllib.error.HTTPError:
+            except requests.exceptions.RequestException:
                pass

-        # If we reach here, both attempts have failed
        error_message = (
            f"Failed to download from both primary URL ({url})"
            f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."
--- a/ch05/01_main-chapter-code/gpt_generate.py
+++ b/ch05/01_main-chapter-code/gpt_generate.py
@@ -7,9 +7,8 @@ import argparse
 import json
 import numpy as np
 import os
-import urllib.request

-# import requests
+import requests
 import tensorflow as tf
 import tiktoken
 import torch
@@ -60,18 +59,18 @@ def download_and_load_gpt2(model_size, models_dir):
    return settings, params


-"""
 def download_file(url, destination):
-    # Send a GET request to download the file in streaming mode
-    response = requests.get(url, stream=True)
+    # Send a GET request to download the file
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()

    # Get the total file size from headers, defaulting to 0 if not present
-    file_size = int(response.headers.get("content-length", 0))
+    file_size = int(response.headers.get("Content-Length", 0))

    # Check if file exists and has the same size
    if os.path.exists(destination):
        file_size_local = os.path.getsize(destination)
-        if file_size == file_size_local:
+        if file_size and file_size == file_size_local:
            print(f"File already exists and is up-to-date: {destination}")
            return

@@ -79,43 +78,12 @@ def download_file(url, destination):
    block_size = 1024  # 1 Kilobyte

    # Initialize the progress bar with total file size
-    progress_bar_description = url.split("/")[-1]  # Extract filename from URL
+    progress_bar_description = os.path.basename(url)
    with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
        # Open the destination file in binary write mode
        with open(destination, "wb") as file:
-            # Iterate over the file data in chunks
-            for chunk in response.iter_content(block_size):
-                progress_bar.update(len(chunk))  # Update progress bar
-                file.write(chunk)  # Write the chunk to the file
-"""
-
-
-def download_file(url, destination):
-    # Send a GET request to download the file
-    with urllib.request.urlopen(url) as response:
-        # Get the total file size from headers, defaulting to 0 if not present
-        file_size = int(response.headers.get("Content-Length", 0))
-
-        # Check if file exists and has the same size
-        if os.path.exists(destination):
-            file_size_local = os.path.getsize(destination)
-            if file_size == file_size_local:
-                print(f"File already exists and is up-to-date: {destination}")
-                return
-
-        # Define the block size for reading the file
-        block_size = 1024  # 1 Kilobyte
-
-        # Initialize the progress bar with total file size
-        progress_bar_description = os.path.basename(url)  # Extract filename from URL
-        with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
-            # Open the destination file in binary write mode
-            with open(destination, "wb") as file:
-                # Read the file in chunks and write to destination
-                while True:
-                    chunk = response.read(block_size)
-                    if not chunk:
-                        break
+            for chunk in response.iter_content(chunk_size=block_size):
+                if chunk:
                    file.write(chunk)
                    progress_bar.update(len(chunk))  # Update progress bar

--- a/ch05/01_main-chapter-code/gpt_train.py
+++ b/ch05/01_main-chapter-code/gpt_train.py
@@ -5,8 +5,8 @@

 import matplotlib.pyplot as plt
 import os
+import requests
 import torch
-import urllib.request
 import tiktoken


@@ -141,14 +141,14 @@ def main(gpt_config, settings):
    url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

    if not os.path.exists(file_path):
-        with urllib.request.urlopen(url) as response:
-            text_data = response.read().decode('utf-8')
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        text_data = response.text
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()
-
    ##############################
    # Initialize model
    ##############################
--- a/ch05/01_main-chapter-code/tests.py
+++ b/ch05/01_main-chapter-code/tests.py
@@ -7,9 +7,7 @@

 import pytest
 from gpt_train import main
-import http.client
-from urllib.parse import urlparse
-
+import requests

@pytest.fixture
 def gpt_config():
@@ -43,23 +41,23 @@ def test_main(gpt_config, other_settings):


 def check_file_size(url, expected_size):
-    parsed_url = urlparse(url)
-    if parsed_url.scheme == "https":
-        conn = http.client.HTTPSConnection(parsed_url.netloc)
-    else:
-        conn = http.client.HTTPConnection(parsed_url.netloc)
+    try:
+        response = requests.head(url, allow_redirects=True, timeout=30)
+        if response.status_code != 200:
+            return False, f"{url} not accessible"

-    conn.request("HEAD", parsed_url.path)
-    response = conn.getresponse()
-    if response.status != 200:
-        return False, f"{url} not accessible"
-    size = response.getheader("Content-Length")
-    if size is None:
-        return False, "Content-Length header is missing"
-    size = int(size)
-    if size != expected_size:
-        return False, f"{url} file has expected size {expected_size}, but got {size}"
-    return True, f"{url} file size is correct"
+        size = response.headers.get("Content-Length")
+        if size is None:
+            return False, "Content-Length header is missing"
+
+        size = int(size)
+        if size != expected_size:
+            return False, f"{url} file has expected size {expected_size}, but got {size}"
+
+        return True, f"{url} file size is correct"
+
+    except requests.exceptions.RequestException as e:
+        return False, f"Failed to access {url}: {e}"


 def test_model_files():