Switch from urllib to requests to improve reliability (#867)

* Switch from urllib to requests to improve reliability
* Keep ruff linter-specific
* update
* update
* update
2026-04-10 12:33:42 +00:00 · 2025-10-07 15:22:59 -05:00
parent 8552565bda
commit 7bd263144e
47 changed files with 592 additions and 436 deletions
--- a/pkg/llms_from_scratch/tests/test_appendix_e.py
+++ b/pkg/llms_from_scratch/tests/test_appendix_e.py
@@ -12,9 +12,9 @@ from llms_from_scratch.ch06 import (
 from llms_from_scratch.appendix_e import replace_linear_with_lora

 from pathlib import Path
-import urllib

 import pandas as pd
+import requests
 import tiktoken
 import torch
 from torch.utils.data import DataLoader, Subset
@@ -35,7 +35,7 @@ def test_train_classifier_lora(tmp_path):
        download_and_unzip_spam_data(
            url, zip_path, extracted_path, data_file_path
        )
-    except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
+    except (requests.exceptions.RequestException, TimeoutError) as e:
        print(f"Primary URL failed: {e}. Trying backup URL...")
        backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
        download_and_unzip_spam_data(
--- a/pkg/llms_from_scratch/tests/test_ch02.py
+++ b/pkg/llms_from_scratch/tests/test_ch02.py
@@ -6,8 +6,8 @@
 from llms_from_scratch.ch02 import create_dataloader_v1

 import os
-import urllib.request

+import requests
 import pytest
 import torch

@@ -16,11 +16,17 @@ import torch
 def test_dataloader(tmp_path, file_name):

    if not os.path.exists("the-verdict.txt"):
-        url = ("https://raw.githubusercontent.com/rasbt/"
-               "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
-               "the-verdict.txt")
+        url = (
+            "https://raw.githubusercontent.com/rasbt/"
+            "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
+            "the-verdict.txt"
+        )
        file_path = "the-verdict.txt"
-        urllib.request.urlretrieve(url, file_path)
+
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        with open(file_path, "wb") as f:
+            f.write(response.content)

    with open("the-verdict.txt", "r", encoding="utf-8") as f:
        raw_text = f.read()
--- a/pkg/llms_from_scratch/tests/test_ch05.py
+++ b/pkg/llms_from_scratch/tests/test_ch05.py
@@ -8,8 +8,8 @@ from llms_from_scratch.ch04 import GPTModel, GPTModelFast
 from llms_from_scratch.ch05 import train_model_simple

 import os
-import urllib

+import requests
 import pytest
 import tiktoken
 import torch
@@ -46,8 +46,9 @@ def test_train_simple(tmp_path, ModelClass):
    url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

    if not os.path.exists(file_path):
-        with urllib.request.urlopen(url) as response:
-            text_data = response.read().decode("utf-8")
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        text_data = response.text
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(text_data)
    else:
--- a/pkg/llms_from_scratch/tests/test_ch06.py
+++ b/pkg/llms_from_scratch/tests/test_ch06.py
@@ -11,8 +11,8 @@ from llms_from_scratch.ch06 import (
 )

 from pathlib import Path
-import urllib

+import requests
 import pandas as pd
 import tiktoken
 import torch
@@ -34,7 +34,7 @@ def test_train_classifier(tmp_path):
        download_and_unzip_spam_data(
            url, zip_path, extracted_path, data_file_path
        )
-    except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
+    except (requests.exceptions.RequestException, TimeoutError) as e:
        print(f"Primary URL failed: {e}. Trying backup URL...")
        backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
        download_and_unzip_spam_data(