Switch from urllib to requests to improve reliability (#867)

* Switch from urllib to requests to improve reliability

* Keep ruff linter-specific

* update

* update

* update
This commit is contained in:
Sebastian Raschka
2025-10-07 15:22:59 -05:00
committed by GitHub
parent 8552565bda
commit 7bd263144e
47 changed files with 592 additions and 436 deletions

View File

@@ -38,14 +38,14 @@ jobs:
- name: Test Selected Python Scripts
run: |
source .venv/bin/activate
pytest --ruff setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py
pytest --ruff ch06/01_main-chapter-code/tests.py
pytest setup/02_installing-python-libraries/tests.py
pytest ch04/01_main-chapter-code/tests.py
pytest ch05/01_main-chapter-code/tests.py
pytest ch06/01_main-chapter-code/tests.py
- name: Validate Selected Jupyter Notebooks
run: |
source .venv/bin/activate
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

View File

@@ -47,24 +47,24 @@ jobs:
shell: bash
run: |
source .venv/bin/activate
pytest --ruff setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py
pytest --ruff ch04/03_kv-cache/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py
pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py
pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py
pytest --ruff ch05/12_gemma3/tests/test_gemma3_nb.py
pytest --ruff ch05/12_gemma3/tests/test_gemma3_kv_nb.py
pytest --ruff ch06/01_main-chapter-code/tests.py
pytest setup/02_installing-python-libraries/tests.py
pytest ch04/01_main-chapter-code/tests.py
pytest ch04/03_kv-cache/tests.py
pytest ch05/01_main-chapter-code/tests.py
pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py
pytest ch05/11_qwen3/tests/test_qwen3_nb.py
pytest ch05/12_gemma3/tests/test_gemma3_nb.py
pytest ch05/12_gemma3/tests/test_gemma3_kv_nb.py
pytest ch06/01_main-chapter-code/tests.py
- name: Validate Selected Jupyter Notebooks (uv)
shell: bash
run: |
source .venv/bin/activate
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
- name: Test Selected Bonus Materials
shell: bash

View File

@@ -47,20 +47,20 @@ jobs:
shell: bash
run: |
source .venv/bin/activate
pytest --ruff setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py
pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py
pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py
pytest --ruff ch05/12_gemma3/tests/test_gemma3_nb.py
pytest --ruff ch05/12_gemma3/tests/test_gemma3_kv_nb.py
pytest --ruff ch06/01_main-chapter-code/tests.py
pytest setup/02_installing-python-libraries/tests.py
pytest ch04/01_main-chapter-code/tests.py
pytest ch05/01_main-chapter-code/tests.py
pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py
pytest ch05/11_qwen3/tests/test_qwen3_nb.py
pytest ch05/12_gemma3/tests/test_gemma3_nb.py
pytest ch05/12_gemma3/tests/test_gemma3_kv_nb.py
pytest ch06/01_main-chapter-code/tests.py
- name: Validate Selected Jupyter Notebooks (uv)
shell: bash
run: |
source .venv/bin/activate
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

View File

@@ -43,14 +43,14 @@ jobs:
- name: Test Selected Python Scripts
run: |
source .venv/bin/activate
pytest --ruff setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py
pytest --ruff ch06/01_main-chapter-code/tests.py
pytest setup/02_installing-python-libraries/tests.py
pytest ch04/01_main-chapter-code/tests.py
pytest ch05/01_main-chapter-code/tests.py
pytest ch06/01_main-chapter-code/tests.py
- name: Validate Selected Jupyter Notebooks
run: |
source .venv/bin/activate
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

View File

@@ -46,14 +46,14 @@ jobs:
- name: Test Selected Python Scripts
run: |
source .venv/bin/activate
pytest --ruff setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py
pytest --ruff ch06/01_main-chapter-code/tests.py
pytest setup/02_installing-python-libraries/tests.py
pytest ch04/01_main-chapter-code/tests.py
pytest ch05/01_main-chapter-code/tests.py
pytest ch06/01_main-chapter-code/tests.py
- name: Validate Selected Jupyter Notebooks
run: |
source .venv/bin/activate
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

View File

@@ -47,14 +47,14 @@ jobs:
- name: Test Selected Python Scripts
shell: pixi run --environment tests bash -e {0}
run: |
pytest --ruff setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py
pytest --ruff ch06/01_main-chapter-code/tests.py
pytest setup/02_installing-python-libraries/tests.py
pytest ch04/01_main-chapter-code/tests.py
pytest ch05/01_main-chapter-code/tests.py
pytest ch06/01_main-chapter-code/tests.py
- name: Validate Selected Jupyter Notebooks
shell: pixi run --environment tests bash -e {0}
run: |
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

View File

@@ -39,14 +39,14 @@ jobs:
- name: Test Selected Python Scripts
run: |
source .venv/bin/activate
pytest --ruff setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py
pytest --ruff ch06/01_main-chapter-code/tests.py
pytest setup/02_installing-python-libraries/tests.py
pytest ch04/01_main-chapter-code/tests.py
pytest ch05/01_main-chapter-code/tests.py
pytest ch06/01_main-chapter-code/tests.py
- name: Validate Selected Jupyter Notebooks
run: |
source .venv/bin/activate
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

View File

@@ -49,18 +49,18 @@ jobs:
shell: bash
run: |
source .venv/Scripts/activate
pytest --ruff setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py
pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py
pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py
pytest --ruff ch06/01_main-chapter-code/tests.py
pytest setup/02_installing-python-libraries/tests.py
pytest ch04/01_main-chapter-code/tests.py
pytest ch05/01_main-chapter-code/tests.py
pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py
pytest ch05/11_qwen3/tests/test_qwen3_nb.py
pytest ch06/01_main-chapter-code/tests.py
- name: Run Jupyter Notebook Tests
shell: bash
run: |
source .venv/Scripts/activate
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

View File

@@ -121,19 +121,40 @@
"outputs": [],
"source": [
"import os\n",
"import urllib.request\n",
"import requests\n",
"\n",
"file_path = \"the-verdict.txt\"\n",
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
"\n",
"if not os.path.exists(file_path):\n",
" response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" text_data = response.text\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n",
"else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()\n",
"\n",
"# The book originally used the following code below\n",
"# However, urllib uses older protocol settings that\n",
"# can cause problems for some readers using a VPN.\n",
"# The `requests` version above is more robust\n",
"# in that regard.\n",
"\n",
"\"\"\"\n",
"import os\n",
"import urllib.request\n",
"\n",
"if not os.path.exists(file_path):\n",
" with urllib.request.urlopen(url) as response:\n",
" text_data = response.read().decode('utf-8')\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n",
"else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()"
" text_data = file.read()\n",
"\"\"\""
]
},
{

View File

@@ -190,7 +190,8 @@
}
],
"source": [
"import urllib\n",
"# import urllib\n",
"import requests\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"from previous_chapters import (\n",
@@ -215,13 +216,20 @@
"extracted_path = \"sms_spam_collection\"\n",
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
"\n",
"\n",
"try:\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
"except (requests.exceptions.RequestException, TimeoutError) as e:\n",
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"\n",
"# The book originally used\n",
"# except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
"# in the code above.\n",
"# However, some VPN users reported issues with `urllib`, so the code was updated\n",
"# to use `requests` instead\n",
"\n",
"df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
"balanced_df = create_balanced_dataset(df)\n",
"balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})\n",

View File

@@ -9,12 +9,12 @@
import os
from pathlib import Path
import urllib
import zipfile
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import tiktoken
import torch
import torch.nn as nn
@@ -367,9 +367,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
return
# Downloading the file
with urllib.request.urlopen(url) as response:
with open(zip_path, "wb") as out_file:
out_file.write(response.read())
response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
with open(zip_path, "wb") as out_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
out_file.write(chunk)
# Unzipping the file
with zipfile.ZipFile(zip_path, "r") as zip_ref:

View File

@@ -163,6 +163,30 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"\n",
"if not os.path.exists(\"the-verdict.txt\"):\n",
" url = (\n",
" \"https://raw.githubusercontent.com/rasbt/\"\n",
" \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
" \"the-verdict.txt\"\n",
" )\n",
" file_path = \"the-verdict.txt\"\n",
"\n",
" response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" with open(file_path, \"wb\") as f:\n",
" f.write(response.content)\n",
"\n",
"\n",
"# The book originally used the following code below\n",
"# However, urllib uses older protocol settings that\n",
"# can cause problems for some readers using a VPN.\n",
"# The `requests` version above is more robust\n",
"# in that regard.\n",
"\n",
"\"\"\"\n",
"import os\n",
"import urllib.request\n",
"\n",
@@ -171,7 +195,8 @@
" \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
" \"the-verdict.txt\")\n",
" file_path = \"the-verdict.txt\"\n",
" urllib.request.urlretrieve(url, file_path)"
" urllib.request.urlretrieve(url, file_path)\n",
"\"\"\""
]
},
{

View File

@@ -823,7 +823,7 @@
],
"source": [
"import os\n",
"import urllib.request\n",
"import requests\n",
"\n",
"def download_file_if_absent(url, filename, search_dirs):\n",
" for directory in search_dirs:\n",
@@ -834,13 +834,19 @@
"\n",
" target_path = os.path.join(search_dirs[0], filename)\n",
" try:\n",
" with urllib.request.urlopen(url) as response, open(target_path, \"wb\") as out_file:\n",
" out_file.write(response.read())\n",
" response = requests.get(url, stream=True, timeout=60)\n",
" response.raise_for_status()\n",
" with open(target_path, \"wb\") as out_file:\n",
" for chunk in response.iter_content(chunk_size=8192):\n",
" if chunk:\n",
" out_file.write(chunk)\n",
" print(f\"Downloaded {filename} to {target_path}\")\n",
" except Exception as e:\n",
" print(f\"Failed to download {filename}. Error: {e}\")\n",
"\n",
" return target_path\n",
"\n",
"\n",
"verdict_path = download_file_if_absent(\n",
" url=(\n",
" \"https://raw.githubusercontent.com/rasbt/\"\n",

View File

@@ -793,19 +793,43 @@
"outputs": [],
"source": [
"import os\n",
"import urllib.request\n",
"import requests\n",
"\n",
"file_path = \"the-verdict.txt\"\n",
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
"\n",
"if not os.path.exists(file_path):\n",
" with urllib.request.urlopen(url) as response:\n",
" text_data = response.read().decode('utf-8')\n",
" response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" text_data = response.text\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n",
"else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()"
" text_data = file.read()\n",
"\n",
"\n",
"# The book originally used the following code below\n",
"# However, urllib uses older protocol settings that\n",
"# can cause problems for some readers using a VPN.\n",
"# The `requests` version above is more robust\n",
"# in that regard.\n",
"\n",
" \n",
"# import os\n",
"# import urllib.request\n",
"\n",
"# file_path = \"the-verdict.txt\"\n",
"# url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
"\n",
"# if not os.path.exists(file_path):\n",
"# with urllib.request.urlopen(url) as response:\n",
"# text_data = response.read().decode('utf-8')\n",
"# with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
"# file.write(text_data)\n",
"# else:\n",
"# with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
"# text_data = file.read()"
]
},
{

View File

@@ -491,7 +491,7 @@
"outputs": [],
"source": [
"import os\n",
"import urllib.request\n",
"import requests\n",
"from previous_chapters import create_dataloader_v1\n",
"\n",
"\n",
@@ -499,6 +499,25 @@
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
"\n",
"if not os.path.exists(file_path):\n",
" response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" text_data = response.text\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n",
"else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()\n",
"\n",
"# The book originally used the following code below\n",
"# However, urllib uses older protocol settings that\n",
"# can cause problems for some readers using a VPN.\n",
"# The `requests` version above is more robust\n",
"# in that regard.\n",
"\n",
"\"\"\"\n",
"import urllib.request\n",
"\n",
"if not os.path.exists(file_path):\n",
" with urllib.request.urlopen(url) as response:\n",
" text_data = response.read().decode('utf-8')\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
@@ -506,6 +525,7 @@
"else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()\n",
"\"\"\"\n",
"\n",
"\n",
"# Train/validation ratio\n",

View File

@@ -5,9 +5,8 @@
import os
import urllib.request
# import requests
import requests
import json
import numpy as np
import tensorflow as tf
@@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir):
def download_file(url, destination, backup_url=None):
def _attempt_download(download_url):
with urllib.request.urlopen(download_url) as response:
# Get the total file size from headers, defaulting to 0 if not present
file_size = int(response.headers.get("Content-Length", 0))
response = requests.get(download_url, stream=True, timeout=60)
response.raise_for_status()
# Check if file exists and has the same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return True # Indicate success without re-downloading
file_size = int(response.headers.get("Content-Length", 0))
block_size = 1024 # 1 Kilobyte
# Check if file exists and has same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size and file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return True
# Initialize the progress bar with total file size
progress_bar_description = os.path.basename(download_url)
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
with open(destination, "wb") as file:
while True:
chunk = response.read(block_size)
if not chunk:
break
block_size = 1024 # 1 KB
desc = os.path.basename(download_url)
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
with open(destination, "wb") as file:
for chunk in response.iter_content(chunk_size=block_size):
if chunk:
file.write(chunk)
progress_bar.update(len(chunk))
return True
return True
try:
if _attempt_download(url):
return
except (urllib.error.HTTPError, urllib.error.URLError):
except requests.exceptions.RequestException:
if backup_url is not None:
print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
try:
if _attempt_download(backup_url):
return
except urllib.error.HTTPError:
except requests.exceptions.RequestException:
pass
# If we reach here, both attempts have failed
error_message = (
f"Failed to download from both primary URL ({url})"
f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."

View File

@@ -7,9 +7,8 @@ import argparse
import json
import numpy as np
import os
import urllib.request
# import requests
import requests
import tensorflow as tf
import tiktoken
import torch
@@ -60,18 +59,18 @@ def download_and_load_gpt2(model_size, models_dir):
return settings, params
"""
def download_file(url, destination):
# Send a GET request to download the file in streaming mode
response = requests.get(url, stream=True)
# Send a GET request to download the file
response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
# Get the total file size from headers, defaulting to 0 if not present
file_size = int(response.headers.get("content-length", 0))
file_size = int(response.headers.get("Content-Length", 0))
# Check if file exists and has the same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size == file_size_local:
if file_size and file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return
@@ -79,43 +78,12 @@ def download_file(url, destination):
block_size = 1024 # 1 Kilobyte
# Initialize the progress bar with total file size
progress_bar_description = url.split("/")[-1] # Extract filename from URL
progress_bar_description = os.path.basename(url)
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
# Open the destination file in binary write mode
with open(destination, "wb") as file:
# Iterate over the file data in chunks
for chunk in response.iter_content(block_size):
progress_bar.update(len(chunk)) # Update progress bar
file.write(chunk) # Write the chunk to the file
"""
def download_file(url, destination):
# Send a GET request to download the file
with urllib.request.urlopen(url) as response:
# Get the total file size from headers, defaulting to 0 if not present
file_size = int(response.headers.get("Content-Length", 0))
# Check if file exists and has the same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return
# Define the block size for reading the file
block_size = 1024 # 1 Kilobyte
# Initialize the progress bar with total file size
progress_bar_description = os.path.basename(url) # Extract filename from URL
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
# Open the destination file in binary write mode
with open(destination, "wb") as file:
# Read the file in chunks and write to destination
while True:
chunk = response.read(block_size)
if not chunk:
break
for chunk in response.iter_content(chunk_size=block_size):
if chunk:
file.write(chunk)
progress_bar.update(len(chunk)) # Update progress bar

View File

@@ -5,8 +5,8 @@
import matplotlib.pyplot as plt
import os
import requests
import torch
import urllib.request
import tiktoken
@@ -141,14 +141,14 @@ def main(gpt_config, settings):
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response:
text_data = response.read().decode('utf-8')
response = requests.get(url, timeout=30)
response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data)
else:
with open(file_path, "r", encoding="utf-8") as file:
text_data = file.read()
##############################
# Initialize model
##############################

View File

@@ -7,9 +7,7 @@
import pytest
from gpt_train import main
import http.client
from urllib.parse import urlparse
import requests
@pytest.fixture
def gpt_config():
@@ -43,23 +41,23 @@ def test_main(gpt_config, other_settings):
def check_file_size(url, expected_size):
parsed_url = urlparse(url)
if parsed_url.scheme == "https":
conn = http.client.HTTPSConnection(parsed_url.netloc)
else:
conn = http.client.HTTPConnection(parsed_url.netloc)
try:
response = requests.head(url, allow_redirects=True, timeout=30)
if response.status_code != 200:
return False, f"{url} not accessible"
conn.request("HEAD", parsed_url.path)
response = conn.getresponse()
if response.status != 200:
return False, f"{url} not accessible"
size = response.getheader("Content-Length")
if size is None:
return False, "Content-Length header is missing"
size = int(size)
if size != expected_size:
return False, f"{url} file has expected size {expected_size}, but got {size}"
return True, f"{url} file size is correct"
size = response.headers.get("Content-Length")
if size is None:
return False, "Content-Length header is missing"
size = int(size)
if size != expected_size:
return False, f"{url} file has expected size {expected_size}, but got {size}"
return True, f"{url} file size is correct"
except requests.exceptions.RequestException as e:
return False, f"Failed to access {url}: {e}"
def test_model_files():

View File

@@ -134,7 +134,7 @@
"outputs": [],
"source": [
"import os\n",
"import urllib.request\n",
"import requests\n",
"from safetensors.torch import load_file\n",
"\n",
"URL_DIR = {\n",
@@ -149,7 +149,10 @@
"\n",
"# Download file\n",
"if not os.path.exists(output_file):\n",
" urllib.request.urlretrieve(url, output_file)\n",
" response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" with open(output_file, \"wb\") as f:\n",
" f.write(response.content)\n",
"\n",
"# Load file\n",
"state_dict = load_file(output_file)"

View File

@@ -144,12 +144,15 @@
],
"source": [
"import os\n",
"import urllib.request\n",
"import requests\n",
"\n",
"url = f\"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}\"\n",
"\n",
"if not os.path.exists(file_name):\n",
" urllib.request.urlretrieve(url, file_name)\n",
" response = requests.get(url, timeout=60)\n",
" response.raise_for_status()\n",
" with open(file_name, \"wb\") as f:\n",
" f.write(response.content)\n",
" print(f\"Downloaded to {file_name}\")"
]
},
@@ -276,12 +279,15 @@
],
"source": [
"import os\n",
"import urllib.request\n",
"import requests\n",
"\n",
"url = f\"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}\"\n",
"\n",
"if not os.path.exists(file_name):\n",
" urllib.request.urlretrieve(url, file_name)\n",
" response = requests.get(url, timeout=60)\n",
" response.raise_for_status()\n",
" with open(file_name, \"wb\") as f:\n",
" f.write(response.content)\n",
" print(f\"Downloaded to {file_name}\")"
]
},

View File

@@ -58,12 +58,17 @@ This automatically downloads the weight file based on the model choice above:
```python
import os
import urllib.request
import requests
url = f"https://huggingface.co/rasbt/llama-3.2-from-scratch/resolve/main/{MODEL_FILE}"
if not os.path.exists(MODEL_FILE):
urllib.request.urlretrieve(url, MODEL_FILE)
response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
with open(MODEL_FILE, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
print(f"Downloaded to {MODEL_FILE}")
```

View File

@@ -6,9 +6,9 @@
import os
import time
import urllib.request
import matplotlib.pyplot as plt
import requests
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
@@ -397,8 +397,9 @@ def main(gpt_config, settings):
url = "https://www.gutenberg.org/cache/epub/145/pg145.txt"
if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response:
text_data = response.read().decode('utf-8')
response = requests.get(url, timeout=30)
response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data)
else:

View File

@@ -6,9 +6,9 @@
import os
import time
import urllib.request
import matplotlib.pyplot as plt
import requests
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
@@ -468,11 +468,11 @@ def main(gpt_config, settings, rank, world_size):
# NEW: Only download 1 time
if rank == 0:
if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response:
text_data = response.read().decode('utf-8')
response = requests.get(url, timeout=30)
response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data)
# NEW: All processes wait until rank 0 is done, using the GPU index.
torch.distributed.barrier(device_ids=[device.index])

View File

@@ -186,6 +186,56 @@
}
],
"source": [
"import requests\n",
"import zipfile\n",
"import os\n",
"from pathlib import Path\n",
"\n",
"url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
"zip_path = \"sms_spam_collection.zip\"\n",
"extracted_path = \"sms_spam_collection\"\n",
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
"\n",
"\n",
"def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):\n",
" if data_file_path.exists():\n",
" print(f\"{data_file_path} already exists. Skipping download and extraction.\")\n",
" return\n",
"\n",
" # Downloading the file\n",
" response = requests.get(url, stream=True, timeout=60)\n",
" response.raise_for_status()\n",
" with open(zip_path, \"wb\") as out_file:\n",
" for chunk in response.iter_content(chunk_size=8192):\n",
" if chunk:\n",
" out_file.write(chunk)\n",
"\n",
" # Unzipping the file\n",
" with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n",
" zip_ref.extractall(extracted_path)\n",
"\n",
" # Add .tsv file extension\n",
" original_file_path = Path(extracted_path) / \"SMSSpamCollection\"\n",
" os.rename(original_file_path, data_file_path)\n",
" print(f\"File downloaded and saved as {data_file_path}\")\n",
"\n",
"\n",
"try:\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"except (requests.exceptions.RequestException, TimeoutError) as e:\n",
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"\n",
"\n",
"\n",
"# The book originally used the following code below\n",
"# However, urllib uses older protocol settings that\n",
"# can cause problems for some readers using a VPN.\n",
"# The `requests` version above is more robust\n",
"# in that regard.\n",
"\n",
"\"\"\"\n",
"import urllib.request\n",
"import zipfile\n",
"import os\n",
@@ -220,7 +270,8 @@
"except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) "
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"\"\"\""
]
},
{

View File

@@ -5,7 +5,7 @@
# This is a summary file containing the main takeaways from chapter 6.
import urllib.request
import requests
import zipfile
import os
from pathlib import Path
@@ -27,9 +27,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
return
# Downloading the file
with urllib.request.urlopen(url) as response:
with open(zip_path, "wb") as out_file:
out_file.write(response.read())
response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
with open(zip_path, "wb") as out_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
out_file.write(chunk)
# Unzipping the file
with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -259,7 +262,7 @@ if __name__ == "__main__":
try:
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
except (requests.exceptions.RequestException, TimeoutError) as e:
print(f"Primary URL failed: {e}. Trying backup URL...")
url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

View File

@@ -8,10 +8,10 @@ import math
import os
from pathlib import Path
import time
import urllib.request
import zipfile
import pandas as pd
import requests
import tiktoken
import torch
from torch.utils.data import DataLoader
@@ -113,9 +113,12 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path):
return
# Downloading the file
with urllib.request.urlopen(url) as response:
with open(zip_path, "wb") as out_file:
out_file.write(response.read())
response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
with open(zip_path, "wb") as out_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
out_file.write(chunk)
# Unzipping the file
with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -608,11 +611,11 @@ if __name__ == "__main__":
base_path = Path(".")
file_names = ["train.csv", "validation.csv", "test.csv"]
all_exist = all((base_path / file_name).exists() for file_name in file_names)
if not all_exist:
try:
download_and_unzip(url, zip_path, extract_to, new_file_path)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
except (requests.exceptions.RequestException, TimeoutError) as e:
print(f"Primary URL failed: {e}. Trying backup URL...")
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
download_and_unzip(backup_url, zip_path, extract_to, new_file_path)

View File

@@ -7,7 +7,7 @@ import os
import sys
import tarfile
import time
import urllib.request
import requests
import pandas as pd
@@ -32,7 +32,15 @@ def download_and_extract_dataset(dataset_url, target_file, directory):
if not os.path.exists(directory):
if os.path.exists(target_file):
os.remove(target_file)
urllib.request.urlretrieve(dataset_url, target_file, reporthook)
response = requests.get(dataset_url, stream=True, timeout=60)
response.raise_for_status()
with open(target_file, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
print("\nExtracting dataset ...")
with tarfile.open(target_file, "r:gz") as tar:
tar.extractall()

View File

@@ -7,7 +7,7 @@ import argparse
import os
from pathlib import Path
import time
import urllib
import requests
import zipfile
import pandas as pd
@@ -62,9 +62,12 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path):
return
# Downloading the file
with urllib.request.urlopen(url) as response:
with open(zip_path, "wb") as out_file:
out_file.write(response.read())
response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
with open(zip_path, "wb") as out_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
out_file.write(chunk)
# Unzipping the file
with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -412,7 +415,7 @@ if __name__ == "__main__":
if not all_exist:
try:
download_and_unzip(url, zip_path, extract_to, new_file_path)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
except (requests.exceptions.RequestException, TimeoutError) as e:
print(f"Primary URL failed: {e}. Trying backup URL...")
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
download_and_unzip(backup_url, zip_path, extract_to, new_file_path)

View File

@@ -169,10 +169,33 @@
"source": [
"import json\n",
"import os\n",
"import urllib\n",
"import requests\n",
"\n",
"\n",
"def download_and_load_file(file_path, url):\n",
" if not os.path.exists(file_path):\n",
" response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" text_data = response.text\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n",
"\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" data = json.load(file)\n",
"\n",
" return data\n",
"\n",
"\n",
"# The book originally used the code below.\n",
"# However, urllib uses older protocol settings that\n",
"# can cause problems for some readers using a VPN.\n",
"# The `requests` version above is more robust\n",
"# in that regard.\n",
"\n",
"\"\"\"\n",
"import urllib\n",
"\n",
"def download_and_load_file(file_path, url):\n",
"\n",
" if not os.path.exists(file_path):\n",
" with urllib.request.urlopen(url) as response:\n",
@@ -180,15 +203,15 @@
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n",
"\n",
" # The book originally contained this unnecessary \"else\" clause:\n",
" #else:\n",
" # with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" # text_data = file.read()\n",
" else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()\n",
"\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" data = json.load(file)\n",
"\n",
" return data\n",
"\"\"\"\n",
"\n",
"\n",
"file_path = \"instruction-data.json\"\n",
@@ -2490,7 +2513,8 @@
}
],
"source": [
"import urllib.request\n",
"import requests # noqa: F811\n",
"# import urllib.request\n",
"\n",
"def query_model(\n",
" prompt,\n",
@@ -2512,7 +2536,8 @@
" }\n",
" }\n",
"\n",
"\n",
" \n",
" \"\"\"\n",
" # Convert the dictionary to a JSON formatted string and encode it to bytes\n",
" payload = json.dumps(data).encode(\"utf-8\")\n",
"\n",
@@ -2536,6 +2561,26 @@
" response_data += response_json[\"message\"][\"content\"]\n",
"\n",
" return response_data\n",
" \"\"\"\n",
"\n",
" # The book originally used the commented-out code above, which is based\n",
" # on urllib. It generally works fine, but some readers reported\n",
" # issues with using urllib when using a (company) VPN.\n",
" # The code below uses the requests library, which doesn't seem\n",
" # to have these issues.\n",
"\n",
" # Send the POST request\n",
" with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
" r.raise_for_status()\n",
" response_data = \"\"\n",
" for line in r.iter_lines(decode_unicode=True):\n",
" if not line:\n",
" continue\n",
" response_json = json.loads(line)\n",
" if \"message\" in response_json:\n",
" response_data += response_json[\"message\"][\"content\"]\n",
"\n",
" return response_data\n",
"\n",
"\n",
"model = \"llama3\"\n",

View File

@@ -12,10 +12,10 @@ import math
import os
import re
import time
import urllib
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import requests
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader
@@ -234,17 +234,17 @@ def custom_collate_with_masking_fn(
def download_and_load_file(file_path, url):
if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response:
text_data = response.read().decode("utf-8")
response = requests.get(url, timeout=30)
response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data)
else:
with open(file_path, "r", encoding="utf-8") as file:
text_data = file.read()
with open(file_path, "r") as file:
with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file)
return data

View File

@@ -5,11 +5,10 @@
import os
import urllib.request
# import requests
import json
import numpy as np
import requests
import tensorflow as tf
from tqdm import tqdm
@@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir):
def download_file(url, destination, backup_url=None):
def _attempt_download(download_url):
with urllib.request.urlopen(download_url) as response:
# Get the total file size from headers, defaulting to 0 if not present
file_size = int(response.headers.get("Content-Length", 0))
response = requests.get(download_url, stream=True, timeout=60)
response.raise_for_status()
# Check if file exists and has the same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return True # Indicate success without re-downloading
file_size = int(response.headers.get("Content-Length", 0))
block_size = 1024 # 1 Kilobyte
# Check if file exists and has same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size and file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return True
# Initialize the progress bar with total file size
progress_bar_description = os.path.basename(download_url)
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
with open(destination, "wb") as file:
while True:
chunk = response.read(block_size)
if not chunk:
break
block_size = 1024 # 1 KB
desc = os.path.basename(download_url)
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
with open(destination, "wb") as file:
for chunk in response.iter_content(chunk_size=block_size):
if chunk:
file.write(chunk)
progress_bar.update(len(chunk))
return True
return True
try:
if _attempt_download(url):
return
except (urllib.error.HTTPError, urllib.error.URLError):
except requests.exceptions.RequestException:
if backup_url is not None:
print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
try:
if _attempt_download(backup_url):
return
except urllib.error.HTTPError:
except requests.exceptions.RequestException:
pass
# If we reach here, both attempts have failed
error_message = (
f"Failed to download from both primary URL ({url})"
f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."
@@ -97,37 +92,6 @@ def download_file(url, destination, backup_url=None):
print(f"An unexpected error occurred: {e}")
# Alternative way using `requests`
"""
def download_file(url, destination):
# Send a GET request to download the file in streaming mode
response = requests.get(url, stream=True)
# Get the total file size from headers, defaulting to 0 if not present
file_size = int(response.headers.get("content-length", 0))
# Check if file exists and has the same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return
# Define the block size for reading the file
block_size = 1024 # 1 Kilobyte
# Initialize the progress bar with total file size
progress_bar_description = url.split("/")[-1] # Extract filename from URL
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
# Open the destination file in binary write mode
with open(destination, "wb") as file:
# Iterate over the file data in chunks
for chunk in response.iter_content(block_size):
progress_bar.update(len(chunk)) # Update progress bar
file.write(chunk) # Write the chunk to the file
"""
def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
# Initialize parameters dictionary with empty blocks for each layer
params = {"blocks": [{} for _ in range(settings["n_layer"])]}

View File

@@ -11,9 +11,9 @@ import json
import os
import re
import time
import urllib
import matplotlib.pyplot as plt
import requests
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader
@@ -97,14 +97,14 @@ def custom_collate_fn(
def download_and_load_file(file_path, url):
if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response:
text_data = response.read().decode("utf-8")
response = requests.get(url, timeout=30)
response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data)
with open(file_path, "r") as file:
with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file)
return data

View File

@@ -8,7 +8,7 @@
import json
import psutil
from tqdm import tqdm
import urllib.request
import requests
def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"):
@@ -25,23 +25,16 @@ def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"):
}
}
# Convert the dictionary to a JSON formatted string and encode it to bytes
payload = json.dumps(data).encode("utf-8")
# Create a request object, setting the method to POST and adding necessary headers
request = urllib.request.Request(url, data=payload, method="POST")
request.add_header("Content-Type", "application/json")
# Send the request and capture the response
response_data = ""
with urllib.request.urlopen(request) as response:
# Read and decode the response
while True:
line = response.readline().decode("utf-8")
# Send the POST request
with requests.post(url, json=data, stream=True, timeout=30) as r:
r.raise_for_status()
response_data = ""
for line in r.iter_lines(decode_unicode=True):
if not line:
break
continue
response_json = json.loads(line)
response_data += response_json["message"]["content"]
if "message" in response_json:
response_data += response_json["message"]["content"]
return response_data

View File

@@ -215,8 +215,8 @@
}
],
"source": [
"import urllib.request\n",
"import json\n",
"import requests\n",
"\n",
"\n",
"def query_model(prompt, model=\"llama3\", url=\"http://localhost:11434/api/chat\"):\n",
@@ -236,27 +236,19 @@
" }\n",
" }\n",
"\n",
" # Convert the dictionary to a JSON formatted string and encode it to bytes\n",
" payload = json.dumps(data).encode(\"utf-8\")\n",
"\n",
" # Create a request object, setting the method to POST and adding necessary headers\n",
" request = urllib.request.Request(url, data=payload, method=\"POST\")\n",
" request.add_header(\"Content-Type\", \"application/json\")\n",
"\n",
" # Send the request and capture the response\n",
" response_data = \"\"\n",
" with urllib.request.urlopen(request) as response:\n",
" # Read and decode the response\n",
" while True:\n",
" line = response.readline().decode(\"utf-8\")\n",
" # Send the POST request\n",
" with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
" r.raise_for_status()\n",
" response_data = \"\"\n",
" for line in r.iter_lines(decode_unicode=True):\n",
" if not line:\n",
" break\n",
" continue\n",
" response_json = json.loads(line)\n",
" response_data += response_json[\"message\"][\"content\"]\n",
" if \"message\" in response_json:\n",
" response_data += response_json[\"message\"][\"content\"]\n",
"\n",
" return response_data\n",
"\n",
"\n",
"result = query_model(\"What do Llamas eat?\")\n",
"print(result)"
]
@@ -640,7 +632,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.10.16"
}
},
"nbformat": 4,

View File

@@ -274,8 +274,8 @@
}
],
"source": [
"import urllib.request\n",
"import json\n",
"import requests\n",
"\n",
"\n",
"def query_model(prompt, model=\"llama3.1:70b\", url=\"http://localhost:11434/api/chat\"):\n",
@@ -294,23 +294,16 @@
" }\n",
" }\n",
"\n",
" # Convert the dictionary to a JSON formatted string and encode it to bytes\n",
" payload = json.dumps(data).encode(\"utf-8\")\n",
"\n",
" # Create a request object, setting the method to POST and adding necessary headers\n",
" request = urllib.request.Request(url, data=payload, method=\"POST\")\n",
" request.add_header(\"Content-Type\", \"application/json\")\n",
"\n",
" # Send the request and capture the response\n",
" response_data = \"\"\n",
" with urllib.request.urlopen(request) as response:\n",
" # Read and decode the response\n",
" while True:\n",
" line = response.readline().decode(\"utf-8\")\n",
" # Send the POST request\n",
" with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
" r.raise_for_status()\n",
" response_data = \"\"\n",
" for line in r.iter_lines(decode_unicode=True):\n",
" if not line:\n",
" break\n",
" continue\n",
" response_json = json.loads(line)\n",
" response_data += response_json[\"message\"][\"content\"]\n",
" if \"message\" in response_json:\n",
" response_data += response_json[\"message\"][\"content\"]\n",
"\n",
" return response_data\n",
"\n",
@@ -587,7 +580,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.10.16"
}
},
"nbformat": 4,

View File

@@ -231,23 +231,21 @@
"source": [
"import json\n",
"import os\n",
"import urllib\n",
"import requests\n",
"\n",
"\n",
"def download_and_load_file(file_path, url):\n",
"\n",
" if not os.path.exists(file_path):\n",
" with urllib.request.urlopen(url) as response:\n",
" text_data = response.read().decode(\"utf-8\")\n",
" response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" text_data = response.text\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n",
" else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()\n",
"\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" data = json.load(file)\n",
"\n",
" data = json.loads(text_data)\n",
" return data\n",
"\n",
"\n",

View File

@@ -194,8 +194,8 @@
"metadata": {},
"outputs": [],
"source": [
"import urllib.request\n",
"import json\n",
"import requests\n",
"\n",
"def query_model(prompt, model=\"llama3\", url=\"http://localhost:11434/api/chat\", role=\"user\"):\n",
" # Create the data payload as a dictionary\n",
@@ -209,25 +209,21 @@
" ]\n",
" }\n",
"\n",
" # Convert the dictionary to a JSON formatted string and encode it to bytes\n",
" payload = json.dumps(data).encode(\"utf-8\")\n",
"\n",
" # Create a request object, setting the method to POST and adding necessary headers\n",
" request = urllib.request.Request(url, data=payload, method=\"POST\")\n",
" request.add_header(\"Content-Type\", \"application/json\")\n",
"\n",
" # Send the request and capture the response\n",
" response_data = \"\"\n",
" with urllib.request.urlopen(request) as response:\n",
" # Read and decode the response\n",
" while True:\n",
" line = response.readline().decode(\"utf-8\")\n",
" # Send the POST request\n",
" with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
" r.raise_for_status()\n",
" response_data = \"\"\n",
" for line in r.iter_lines(decode_unicode=True):\n",
" if not line:\n",
" break\n",
" continue\n",
" response_json = json.loads(line)\n",
" response_data += response_json[\"message\"][\"content\"]\n",
" if \"message\" in response_json:\n",
" response_data += response_json[\"message\"][\"content\"]\n",
"\n",
" return response_data"
" return response_data\n",
"\n",
"result = query_model(\"What do Llamas eat?\")\n",
"print(result)"
]
},
{
@@ -498,7 +494,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.10.16"
}
},
"nbformat": 4,

View File

@@ -7,11 +7,11 @@ from .ch04 import generate_text_simple
import json
import os
import urllib.request
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import requests
import torch
from tqdm import tqdm
@@ -279,44 +279,40 @@ def download_and_load_gpt2(model_size, models_dir):
def download_file(url, destination, backup_url=None):
def _attempt_download(download_url):
with urllib.request.urlopen(download_url) as response:
# Get the total file size from headers, defaulting to 0 if not present
file_size = int(response.headers.get("Content-Length", 0))
response = requests.get(download_url, stream=True, timeout=60)
response.raise_for_status()
# Check if file exists and has the same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return True # Indicate success without re-downloading
file_size = int(response.headers.get("Content-Length", 0))
block_size = 1024 # 1 Kilobyte
# Check if file exists and has same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size and file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return True
# Initialize the progress bar with total file size
progress_bar_description = os.path.basename(download_url)
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
with open(destination, "wb") as file:
while True:
chunk = response.read(block_size)
if not chunk:
break
block_size = 1024 # 1 KB
desc = os.path.basename(download_url)
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
with open(destination, "wb") as file:
for chunk in response.iter_content(chunk_size=block_size):
if chunk:
file.write(chunk)
progress_bar.update(len(chunk))
return True
return True
try:
if _attempt_download(url):
return
except (urllib.error.HTTPError, urllib.error.URLError):
except requests.exceptions.RequestException:
if backup_url is not None:
print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
try:
if _attempt_download(backup_url):
return
except urllib.error.HTTPError:
except requests.exceptions.RequestException:
pass
# If we reach here, both attempts have failed
error_message = (
f"Failed to download from both primary URL ({url})"
f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."

View File

@@ -4,11 +4,11 @@
# Code: https://github.com/rasbt/LLMs-from-scratch
import urllib.request
import zipfile
import os
from pathlib import Path
import requests
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
import torch
@@ -21,9 +21,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
return
# Downloading the file
with urllib.request.urlopen(url) as response:
with open(zip_path, "wb") as out_file:
out_file.write(response.read())
response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
with open(zip_path, "wb") as out_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
out_file.write(chunk)
# Unzipping the file
with zipfile.ZipFile(zip_path, "r") as zip_ref:

View File

@@ -6,7 +6,7 @@
import json
import os
import psutil
import urllib
import requests
import torch
from tqdm import tqdm
@@ -14,24 +14,46 @@ from torch.utils.data import Dataset
def download_and_load_file(file_path, url):
if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response:
text_data = response.read().decode("utf-8")
response = requests.get(url, timeout=30)
response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data)
# The book originally contained this unnecessary "else" clause:
# else:
# with open(file_path, "r", encoding="utf-8") as file:
# text_data = file.read()
with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file)
return data
# The book originally used the code below.
# However, urllib uses older protocol settings that
# can cause problems for some readers using a VPN.
# The `requests` version above is more robust
# in that regard.
# import urllib
# def download_and_load_file(file_path, url):
# if not os.path.exists(file_path):
# with urllib.request.urlopen(url) as response:
# text_data = response.read().decode("utf-8")
# with open(file_path, "w", encoding="utf-8") as file:
# file.write(text_data)
# else:
# with open(file_path, "r", encoding="utf-8") as file:
# text_data = file.read()
# with open(file_path, "r", encoding="utf-8") as file:
# data = json.load(file)
# return data
def format_input(entry):
instruction_text = (
f"Below is an instruction that describes a task. "
@@ -202,27 +224,16 @@ def query_model(
}
}
# Convert the dictionary to a JSON formatted string and encode it to bytes
payload = json.dumps(data).encode("utf-8")
# Create a request object, setting the method to POST and adding necessary headers
request = urllib.request.Request(
url,
data=payload,
method="POST"
)
request.add_header("Content-Type", "application/json")
# Send the request and capture the response
response_data = ""
with urllib.request.urlopen(request) as response:
# Read and decode the response
while True:
line = response.readline().decode("utf-8")
# Send the POST request
with requests.post(url, json=data, stream=True, timeout=30) as r:
r.raise_for_status()
response_data = ""
for line in r.iter_lines(decode_unicode=True):
if not line:
break
continue
response_json = json.loads(line)
response_data += response_json["message"]["content"]
if "message" in response_json:
response_data += response_json["message"]["content"]
return response_data

View File

@@ -6,9 +6,9 @@
import os
import json
import re
import urllib.request
from pathlib import Path
import requests
import torch
import torch.nn as nn
@@ -660,7 +660,12 @@ def download_from_huggingface(repo_id, filename, local_dir, revision="main"):
print(f"File already exists: {dest_path}")
else:
print(f"Downloading {url} to {dest_path}...")
urllib.request.urlretrieve(url, dest_path)
response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
with open(dest_path, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
return dest_path

View File

@@ -12,9 +12,9 @@ from llms_from_scratch.ch06 import (
from llms_from_scratch.appendix_e import replace_linear_with_lora
from pathlib import Path
import urllib
import pandas as pd
import requests
import tiktoken
import torch
from torch.utils.data import DataLoader, Subset
@@ -35,7 +35,7 @@ def test_train_classifier_lora(tmp_path):
download_and_unzip_spam_data(
url, zip_path, extracted_path, data_file_path
)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
except (requests.exceptions.RequestException, TimeoutError) as e:
print(f"Primary URL failed: {e}. Trying backup URL...")
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
download_and_unzip_spam_data(

View File

@@ -6,8 +6,8 @@
from llms_from_scratch.ch02 import create_dataloader_v1
import os
import urllib.request
import requests
import pytest
import torch
@@ -16,11 +16,17 @@ import torch
def test_dataloader(tmp_path, file_name):
if not os.path.exists("the-verdict.txt"):
url = ("https://raw.githubusercontent.com/rasbt/"
"LLMs-from-scratch/main/ch02/01_main-chapter-code/"
"the-verdict.txt")
url = (
"https://raw.githubusercontent.com/rasbt/"
"LLMs-from-scratch/main/ch02/01_main-chapter-code/"
"the-verdict.txt"
)
file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)
response = requests.get(url, timeout=30)
response.raise_for_status()
with open(file_path, "wb") as f:
f.write(response.content)
with open("the-verdict.txt", "r", encoding="utf-8") as f:
raw_text = f.read()

View File

@@ -8,8 +8,8 @@ from llms_from_scratch.ch04 import GPTModel, GPTModelFast
from llms_from_scratch.ch05 import train_model_simple
import os
import urllib
import requests
import pytest
import tiktoken
import torch
@@ -46,8 +46,9 @@ def test_train_simple(tmp_path, ModelClass):
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response:
text_data = response.read().decode("utf-8")
response = requests.get(url, timeout=30)
response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as f:
f.write(text_data)
else:

View File

@@ -11,8 +11,8 @@ from llms_from_scratch.ch06 import (
)
from pathlib import Path
import urllib
import requests
import pandas as pd
import tiktoken
import torch
@@ -34,7 +34,7 @@ def test_train_classifier(tmp_path):
download_and_unzip_spam_data(
url, zip_path, extracted_path, data_file_path
)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
except (requests.exceptions.RequestException, TimeoutError) as e:
print(f"Primary URL failed: {e}. Trying backup URL...")
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
download_and_unzip_spam_data(

View File

@@ -9,10 +9,9 @@ import ast
import re
import types
from pathlib import Path
import urllib.request
import urllib.parse
import nbformat
import requests
def _extract_imports(src: str):
@@ -125,21 +124,24 @@ def import_definitions_from_notebook(nb_dir_or_path, notebook_name=None, *, extr
exec(src, mod.__dict__)
return mod
def download_file(url, out_dir="."):
"""Simple file download utility for tests."""
from pathlib import Path
out_dir = Path(out_dir)
out_dir.mkdir(parents=True, exist_ok=True)
filename = Path(urllib.parse.urlparse(url).path).name
filename = Path(url).name
dest = out_dir / filename
if dest.exists():
return dest
try:
with urllib.request.urlopen(url) as response:
with open(dest, 'wb') as f:
f.write(response.read())
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
with open(dest, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
return dest
except Exception as e:
raise RuntimeError(f"Failed to download {url}: {e}")