Switch from urllib to requests to improve reliability (#867)

* Switch from urllib to requests to improve reliability

* Keep ruff linter-specific

* update

* update

* update
This commit is contained in:
Sebastian Raschka
2025-10-07 15:22:59 -05:00
committed by GitHub
parent 8552565bda
commit 7bd263144e
47 changed files with 592 additions and 436 deletions

View File

@@ -793,19 +793,43 @@
"outputs": [],
"source": [
"import os\n",
"import urllib.request\n",
"import requests\n",
"\n",
"file_path = \"the-verdict.txt\"\n",
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
"\n",
"if not os.path.exists(file_path):\n",
" with urllib.request.urlopen(url) as response:\n",
" text_data = response.read().decode('utf-8')\n",
" response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" text_data = response.text\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n",
"else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()"
" text_data = file.read()\n",
"\n",
"\n",
"# The book originally used the following code below\n",
"# However, urllib uses older protocol settings that\n",
"# can cause problems for some readers using a VPN.\n",
"# The `requests` version above is more robust\n",
"# in that regard.\n",
"\n",
" \n",
"# import os\n",
"# import urllib.request\n",
"\n",
"# file_path = \"the-verdict.txt\"\n",
"# url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
"\n",
"# if not os.path.exists(file_path):\n",
"# with urllib.request.urlopen(url) as response:\n",
"# text_data = response.read().decode('utf-8')\n",
"# with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
"# file.write(text_data)\n",
"# else:\n",
"# with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
"# text_data = file.read()"
]
},
{

View File

@@ -491,7 +491,7 @@
"outputs": [],
"source": [
"import os\n",
"import urllib.request\n",
"import requests\n",
"from previous_chapters import create_dataloader_v1\n",
"\n",
"\n",
@@ -499,6 +499,25 @@
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
"\n",
"if not os.path.exists(file_path):\n",
" response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" text_data = response.text\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n",
"else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()\n",
"\n",
"# The book originally used the following code below\n",
"# However, urllib uses older protocol settings that\n",
"# can cause problems for some readers using a VPN.\n",
"# The `requests` version above is more robust\n",
"# in that regard.\n",
"\n",
"\"\"\"\n",
"import urllib.request\n",
"\n",
"if not os.path.exists(file_path):\n",
" with urllib.request.urlopen(url) as response:\n",
" text_data = response.read().decode('utf-8')\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
@@ -506,6 +525,7 @@
"else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()\n",
"\"\"\"\n",
"\n",
"\n",
"# Train/validation ratio\n",

View File

@@ -5,9 +5,8 @@
import os
import urllib.request
# import requests
import requests
import json
import numpy as np
import tensorflow as tf
@@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir):
def download_file(url, destination, backup_url=None):
def _attempt_download(download_url):
with urllib.request.urlopen(download_url) as response:
# Get the total file size from headers, defaulting to 0 if not present
file_size = int(response.headers.get("Content-Length", 0))
response = requests.get(download_url, stream=True, timeout=60)
response.raise_for_status()
# Check if file exists and has the same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return True # Indicate success without re-downloading
file_size = int(response.headers.get("Content-Length", 0))
block_size = 1024 # 1 Kilobyte
# Check if file exists and has same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size and file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return True
# Initialize the progress bar with total file size
progress_bar_description = os.path.basename(download_url)
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
with open(destination, "wb") as file:
while True:
chunk = response.read(block_size)
if not chunk:
break
block_size = 1024 # 1 KB
desc = os.path.basename(download_url)
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
with open(destination, "wb") as file:
for chunk in response.iter_content(chunk_size=block_size):
if chunk:
file.write(chunk)
progress_bar.update(len(chunk))
return True
return True
try:
if _attempt_download(url):
return
except (urllib.error.HTTPError, urllib.error.URLError):
except requests.exceptions.RequestException:
if backup_url is not None:
print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
try:
if _attempt_download(backup_url):
return
except urllib.error.HTTPError:
except requests.exceptions.RequestException:
pass
# If we reach here, both attempts have failed
error_message = (
f"Failed to download from both primary URL ({url})"
f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."

View File

@@ -7,9 +7,8 @@ import argparse
import json
import numpy as np
import os
import urllib.request
# import requests
import requests
import tensorflow as tf
import tiktoken
import torch
@@ -60,18 +59,18 @@ def download_and_load_gpt2(model_size, models_dir):
return settings, params
"""
def download_file(url, destination):
# Send a GET request to download the file in streaming mode
response = requests.get(url, stream=True)
# Send a GET request to download the file
response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
# Get the total file size from headers, defaulting to 0 if not present
file_size = int(response.headers.get("content-length", 0))
file_size = int(response.headers.get("Content-Length", 0))
# Check if file exists and has the same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size == file_size_local:
if file_size and file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return
@@ -79,43 +78,12 @@ def download_file(url, destination):
block_size = 1024 # 1 Kilobyte
# Initialize the progress bar with total file size
progress_bar_description = url.split("/")[-1] # Extract filename from URL
progress_bar_description = os.path.basename(url)
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
# Open the destination file in binary write mode
with open(destination, "wb") as file:
# Iterate over the file data in chunks
for chunk in response.iter_content(block_size):
progress_bar.update(len(chunk)) # Update progress bar
file.write(chunk) # Write the chunk to the file
"""
def download_file(url, destination):
# Send a GET request to download the file
with urllib.request.urlopen(url) as response:
# Get the total file size from headers, defaulting to 0 if not present
file_size = int(response.headers.get("Content-Length", 0))
# Check if file exists and has the same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return
# Define the block size for reading the file
block_size = 1024 # 1 Kilobyte
# Initialize the progress bar with total file size
progress_bar_description = os.path.basename(url) # Extract filename from URL
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
# Open the destination file in binary write mode
with open(destination, "wb") as file:
# Read the file in chunks and write to destination
while True:
chunk = response.read(block_size)
if not chunk:
break
for chunk in response.iter_content(chunk_size=block_size):
if chunk:
file.write(chunk)
progress_bar.update(len(chunk)) # Update progress bar

View File

@@ -5,8 +5,8 @@
import matplotlib.pyplot as plt
import os
import requests
import torch
import urllib.request
import tiktoken
@@ -141,14 +141,14 @@ def main(gpt_config, settings):
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response:
text_data = response.read().decode('utf-8')
response = requests.get(url, timeout=30)
response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data)
else:
with open(file_path, "r", encoding="utf-8") as file:
text_data = file.read()
##############################
# Initialize model
##############################

View File

@@ -7,9 +7,7 @@
import pytest
from gpt_train import main
import http.client
from urllib.parse import urlparse
import requests
@pytest.fixture
def gpt_config():
@@ -43,23 +41,23 @@ def test_main(gpt_config, other_settings):
def check_file_size(url, expected_size):
parsed_url = urlparse(url)
if parsed_url.scheme == "https":
conn = http.client.HTTPSConnection(parsed_url.netloc)
else:
conn = http.client.HTTPConnection(parsed_url.netloc)
try:
response = requests.head(url, allow_redirects=True, timeout=30)
if response.status_code != 200:
return False, f"{url} not accessible"
conn.request("HEAD", parsed_url.path)
response = conn.getresponse()
if response.status != 200:
return False, f"{url} not accessible"
size = response.getheader("Content-Length")
if size is None:
return False, "Content-Length header is missing"
size = int(size)
if size != expected_size:
return False, f"{url} file has expected size {expected_size}, but got {size}"
return True, f"{url} file size is correct"
size = response.headers.get("Content-Length")
if size is None:
return False, "Content-Length header is missing"
size = int(size)
if size != expected_size:
return False, f"{url} file has expected size {expected_size}, but got {size}"
return True, f"{url} file size is correct"
except requests.exceptions.RequestException as e:
return False, f"Failed to access {url}: {e}"
def test_model_files():

View File

@@ -134,7 +134,7 @@
"outputs": [],
"source": [
"import os\n",
"import urllib.request\n",
"import requests\n",
"from safetensors.torch import load_file\n",
"\n",
"URL_DIR = {\n",
@@ -149,7 +149,10 @@
"\n",
"# Download file\n",
"if not os.path.exists(output_file):\n",
" urllib.request.urlretrieve(url, output_file)\n",
" response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" with open(output_file, \"wb\") as f:\n",
" f.write(response.content)\n",
"\n",
"# Load file\n",
"state_dict = load_file(output_file)"

View File

@@ -144,12 +144,15 @@
],
"source": [
"import os\n",
"import urllib.request\n",
"import requests\n",
"\n",
"url = f\"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}\"\n",
"\n",
"if not os.path.exists(file_name):\n",
" urllib.request.urlretrieve(url, file_name)\n",
" response = requests.get(url, timeout=60)\n",
" response.raise_for_status()\n",
" with open(file_name, \"wb\") as f:\n",
" f.write(response.content)\n",
" print(f\"Downloaded to {file_name}\")"
]
},
@@ -276,12 +279,15 @@
],
"source": [
"import os\n",
"import urllib.request\n",
"import requests\n",
"\n",
"url = f\"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}\"\n",
"\n",
"if not os.path.exists(file_name):\n",
" urllib.request.urlretrieve(url, file_name)\n",
" response = requests.get(url, timeout=60)\n",
" response.raise_for_status()\n",
" with open(file_name, \"wb\") as f:\n",
" f.write(response.content)\n",
" print(f\"Downloaded to {file_name}\")"
]
},

View File

@@ -58,12 +58,17 @@ This automatically downloads the weight file based on the model choice above:
```python
import os
import urllib.request
import requests
url = f"https://huggingface.co/rasbt/llama-3.2-from-scratch/resolve/main/{MODEL_FILE}"
if not os.path.exists(MODEL_FILE):
urllib.request.urlretrieve(url, MODEL_FILE)
response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
with open(MODEL_FILE, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
print(f"Downloaded to {MODEL_FILE}")
```

View File

@@ -6,9 +6,9 @@
import os
import time
import urllib.request
import matplotlib.pyplot as plt
import requests
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
@@ -397,8 +397,9 @@ def main(gpt_config, settings):
url = "https://www.gutenberg.org/cache/epub/145/pg145.txt"
if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response:
text_data = response.read().decode('utf-8')
response = requests.get(url, timeout=30)
response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data)
else:

View File

@@ -6,9 +6,9 @@
import os
import time
import urllib.request
import matplotlib.pyplot as plt
import requests
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
@@ -468,11 +468,11 @@ def main(gpt_config, settings, rank, world_size):
# NEW: Only download 1 time
if rank == 0:
if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response:
text_data = response.read().decode('utf-8')
response = requests.get(url, timeout=30)
response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data)
# NEW: All processes wait until rank 0 is done, using the GPU index.
torch.distributed.barrier(device_ids=[device.index])