mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
Switch from urllib to requests to improve reliability (#867)
* Switch from urllib to requests to improve reliability * Keep ruff linter-specific * update * update * update
This commit is contained in:
committed by
GitHub
parent
8552565bda
commit
7bd263144e
@@ -793,19 +793,43 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import urllib.request\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"file_path = \"the-verdict.txt\"\n",
|
||||
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
|
||||
"\n",
|
||||
"if not os.path.exists(file_path):\n",
|
||||
" with urllib.request.urlopen(url) as response:\n",
|
||||
" text_data = response.read().decode('utf-8')\n",
|
||||
" response = requests.get(url, timeout=30)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" text_data = response.text\n",
|
||||
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
|
||||
" file.write(text_data)\n",
|
||||
"else:\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" text_data = file.read()"
|
||||
" text_data = file.read()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# The book originally used the code below.\n",
|
||||
"# However, urllib uses older protocol settings that\n",
|
||||
"# can cause problems for some readers using a VPN.\n",
|
||||
"# The `requests` version above is more robust\n",
|
||||
"# in that regard.\n",
|
||||
"\n",
|
||||
" \n",
|
||||
"# import os\n",
|
||||
"# import urllib.request\n",
|
||||
"\n",
|
||||
"# file_path = \"the-verdict.txt\"\n",
|
||||
"# url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
|
||||
"\n",
|
||||
"# if not os.path.exists(file_path):\n",
|
||||
"# with urllib.request.urlopen(url) as response:\n",
|
||||
"# text_data = response.read().decode('utf-8')\n",
|
||||
"# with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
|
||||
"# file.write(text_data)\n",
|
||||
"# else:\n",
|
||||
"# with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
"# text_data = file.read()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -491,7 +491,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import urllib.request\n",
|
||||
"import requests\n",
|
||||
"from previous_chapters import create_dataloader_v1\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -499,6 +499,25 @@
|
||||
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
|
||||
"\n",
|
||||
"if not os.path.exists(file_path):\n",
|
||||
" response = requests.get(url, timeout=30)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" text_data = response.text\n",
|
||||
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
|
||||
" file.write(text_data)\n",
|
||||
"else:\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" text_data = file.read()\n",
|
||||
"\n",
|
||||
"# The book originally used the code below.\n",
|
||||
"# However, urllib uses older protocol settings that\n",
|
||||
"# can cause problems for some readers using a VPN.\n",
|
||||
"# The `requests` version above is more robust\n",
|
||||
"# in that regard.\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
"import urllib.request\n",
|
||||
"\n",
|
||||
"if not os.path.exists(file_path):\n",
|
||||
" with urllib.request.urlopen(url) as response:\n",
|
||||
" text_data = response.read().decode('utf-8')\n",
|
||||
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
|
||||
@@ -506,6 +525,7 @@
|
||||
"else:\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" text_data = file.read()\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Train/validation ratio\n",
|
||||
|
||||
@@ -5,9 +5,8 @@
|
||||
|
||||
|
||||
import os
|
||||
import urllib.request
|
||||
|
||||
# import requests
|
||||
import requests
|
||||
import json
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
@@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir):
|
||||
|
||||
def download_file(url, destination, backup_url=None):
|
||||
def _attempt_download(download_url):
|
||||
with urllib.request.urlopen(download_url) as response:
|
||||
# Get the total file size from headers, defaulting to 0 if not present
|
||||
file_size = int(response.headers.get("Content-Length", 0))
|
||||
response = requests.get(download_url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
|
||||
# Check if file exists and has the same size
|
||||
if os.path.exists(destination):
|
||||
file_size_local = os.path.getsize(destination)
|
||||
if file_size == file_size_local:
|
||||
print(f"File already exists and is up-to-date: {destination}")
|
||||
return True # Indicate success without re-downloading
|
||||
file_size = int(response.headers.get("Content-Length", 0))
|
||||
|
||||
block_size = 1024 # 1 Kilobyte
|
||||
# Check if file exists and has same size
|
||||
if os.path.exists(destination):
|
||||
file_size_local = os.path.getsize(destination)
|
||||
if file_size and file_size == file_size_local:
|
||||
print(f"File already exists and is up-to-date: {destination}")
|
||||
return True
|
||||
|
||||
# Initialize the progress bar with total file size
|
||||
progress_bar_description = os.path.basename(download_url)
|
||||
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
|
||||
with open(destination, "wb") as file:
|
||||
while True:
|
||||
chunk = response.read(block_size)
|
||||
if not chunk:
|
||||
break
|
||||
block_size = 1024 # 1 KB
|
||||
desc = os.path.basename(download_url)
|
||||
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
|
||||
with open(destination, "wb") as file:
|
||||
for chunk in response.iter_content(chunk_size=block_size):
|
||||
if chunk:
|
||||
file.write(chunk)
|
||||
progress_bar.update(len(chunk))
|
||||
return True
|
||||
return True
|
||||
|
||||
try:
|
||||
if _attempt_download(url):
|
||||
return
|
||||
except (urllib.error.HTTPError, urllib.error.URLError):
|
||||
except requests.exceptions.RequestException:
|
||||
if backup_url is not None:
|
||||
print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
|
||||
try:
|
||||
if _attempt_download(backup_url):
|
||||
return
|
||||
except urllib.error.HTTPError:
|
||||
except requests.exceptions.RequestException:
|
||||
pass
|
||||
|
||||
# If we reach here, both attempts have failed
|
||||
error_message = (
|
||||
f"Failed to download from both primary URL ({url})"
|
||||
f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."
|
||||
|
||||
@@ -7,9 +7,8 @@ import argparse
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import urllib.request
|
||||
|
||||
# import requests
|
||||
import requests
|
||||
import tensorflow as tf
|
||||
import tiktoken
|
||||
import torch
|
||||
@@ -60,18 +59,18 @@ def download_and_load_gpt2(model_size, models_dir):
|
||||
return settings, params
|
||||
|
||||
|
||||
"""
|
||||
def download_file(url, destination):
|
||||
# Send a GET request to download the file in streaming mode
|
||||
response = requests.get(url, stream=True)
|
||||
# Send a GET request to download the file
|
||||
response = requests.get(url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
|
||||
# Get the total file size from headers, defaulting to 0 if not present
|
||||
file_size = int(response.headers.get("content-length", 0))
|
||||
file_size = int(response.headers.get("Content-Length", 0))
|
||||
|
||||
# Check if file exists and has the same size
|
||||
if os.path.exists(destination):
|
||||
file_size_local = os.path.getsize(destination)
|
||||
if file_size == file_size_local:
|
||||
if file_size and file_size == file_size_local:
|
||||
print(f"File already exists and is up-to-date: {destination}")
|
||||
return
|
||||
|
||||
@@ -79,43 +78,12 @@ def download_file(url, destination):
|
||||
block_size = 1024 # 1 Kilobyte
|
||||
|
||||
# Initialize the progress bar with total file size
|
||||
progress_bar_description = url.split("/")[-1] # Extract filename from URL
|
||||
progress_bar_description = os.path.basename(url)
|
||||
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
|
||||
# Open the destination file in binary write mode
|
||||
with open(destination, "wb") as file:
|
||||
# Iterate over the file data in chunks
|
||||
for chunk in response.iter_content(block_size):
|
||||
progress_bar.update(len(chunk)) # Update progress bar
|
||||
file.write(chunk) # Write the chunk to the file
|
||||
"""
|
||||
|
||||
|
||||
def download_file(url, destination):
|
||||
# Send a GET request to download the file
|
||||
with urllib.request.urlopen(url) as response:
|
||||
# Get the total file size from headers, defaulting to 0 if not present
|
||||
file_size = int(response.headers.get("Content-Length", 0))
|
||||
|
||||
# Check if file exists and has the same size
|
||||
if os.path.exists(destination):
|
||||
file_size_local = os.path.getsize(destination)
|
||||
if file_size == file_size_local:
|
||||
print(f"File already exists and is up-to-date: {destination}")
|
||||
return
|
||||
|
||||
# Define the block size for reading the file
|
||||
block_size = 1024 # 1 Kilobyte
|
||||
|
||||
# Initialize the progress bar with total file size
|
||||
progress_bar_description = os.path.basename(url) # Extract filename from URL
|
||||
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
|
||||
# Open the destination file in binary write mode
|
||||
with open(destination, "wb") as file:
|
||||
# Read the file in chunks and write to destination
|
||||
while True:
|
||||
chunk = response.read(block_size)
|
||||
if not chunk:
|
||||
break
|
||||
for chunk in response.iter_content(chunk_size=block_size):
|
||||
if chunk:
|
||||
file.write(chunk)
|
||||
progress_bar.update(len(chunk)) # Update progress bar
|
||||
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import os
|
||||
import requests
|
||||
import torch
|
||||
import urllib.request
|
||||
import tiktoken
|
||||
|
||||
|
||||
@@ -141,14 +141,14 @@ def main(gpt_config, settings):
|
||||
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
|
||||
|
||||
if not os.path.exists(file_path):
|
||||
with urllib.request.urlopen(url) as response:
|
||||
text_data = response.read().decode('utf-8')
|
||||
response = requests.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
text_data = response.text
|
||||
with open(file_path, "w", encoding="utf-8") as file:
|
||||
file.write(text_data)
|
||||
else:
|
||||
with open(file_path, "r", encoding="utf-8") as file:
|
||||
text_data = file.read()
|
||||
|
||||
##############################
|
||||
# Initialize model
|
||||
##############################
|
||||
|
||||
@@ -7,9 +7,7 @@
|
||||
|
||||
import pytest
|
||||
from gpt_train import main
|
||||
import http.client
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
@pytest.fixture
|
||||
def gpt_config():
|
||||
@@ -43,23 +41,23 @@ def test_main(gpt_config, other_settings):
|
||||
|
||||
|
||||
def check_file_size(url, expected_size):
|
||||
parsed_url = urlparse(url)
|
||||
if parsed_url.scheme == "https":
|
||||
conn = http.client.HTTPSConnection(parsed_url.netloc)
|
||||
else:
|
||||
conn = http.client.HTTPConnection(parsed_url.netloc)
|
||||
try:
|
||||
response = requests.head(url, allow_redirects=True, timeout=30)
|
||||
if response.status_code != 200:
|
||||
return False, f"{url} not accessible"
|
||||
|
||||
conn.request("HEAD", parsed_url.path)
|
||||
response = conn.getresponse()
|
||||
if response.status != 200:
|
||||
return False, f"{url} not accessible"
|
||||
size = response.getheader("Content-Length")
|
||||
if size is None:
|
||||
return False, "Content-Length header is missing"
|
||||
size = int(size)
|
||||
if size != expected_size:
|
||||
return False, f"{url} file has expected size {expected_size}, but got {size}"
|
||||
return True, f"{url} file size is correct"
|
||||
size = response.headers.get("Content-Length")
|
||||
if size is None:
|
||||
return False, "Content-Length header is missing"
|
||||
|
||||
size = int(size)
|
||||
if size != expected_size:
|
||||
return False, f"{url} file has expected size {expected_size}, but got {size}"
|
||||
|
||||
return True, f"{url} file size is correct"
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
return False, f"Failed to access {url}: {e}"
|
||||
|
||||
|
||||
def test_model_files():
|
||||
|
||||
Reference in New Issue
Block a user