Switch from urllib to requests to improve reliability (#867)

* Switch from urllib to requests to improve reliability

* Keep ruff linter-specific

* update

* update

* update
This commit is contained in:
Sebastian Raschka
2025-10-07 15:22:59 -05:00
committed by GitHub
parent 8552565bda
commit 7bd263144e
47 changed files with 592 additions and 436 deletions

View File

@@ -38,14 +38,14 @@ jobs:
- name: Test Selected Python Scripts - name: Test Selected Python Scripts
run: | run: |
source .venv/bin/activate source .venv/bin/activate
pytest --ruff setup/02_installing-python-libraries/tests.py pytest setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py pytest ch04/01_main-chapter-code/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py pytest ch05/01_main-chapter-code/tests.py
pytest --ruff ch06/01_main-chapter-code/tests.py pytest ch06/01_main-chapter-code/tests.py
- name: Validate Selected Jupyter Notebooks - name: Validate Selected Jupyter Notebooks
run: | run: |
source .venv/bin/activate source .venv/bin/activate
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

View File

@@ -47,24 +47,24 @@ jobs:
shell: bash shell: bash
run: | run: |
source .venv/bin/activate source .venv/bin/activate
pytest --ruff setup/02_installing-python-libraries/tests.py pytest setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py pytest ch04/01_main-chapter-code/tests.py
pytest --ruff ch04/03_kv-cache/tests.py pytest ch04/03_kv-cache/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py pytest ch05/01_main-chapter-code/tests.py
pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py
pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py pytest ch05/11_qwen3/tests/test_qwen3_nb.py
pytest --ruff ch05/12_gemma3/tests/test_gemma3_nb.py pytest ch05/12_gemma3/tests/test_gemma3_nb.py
pytest --ruff ch05/12_gemma3/tests/test_gemma3_kv_nb.py pytest ch05/12_gemma3/tests/test_gemma3_kv_nb.py
pytest --ruff ch06/01_main-chapter-code/tests.py pytest ch06/01_main-chapter-code/tests.py
- name: Validate Selected Jupyter Notebooks (uv) - name: Validate Selected Jupyter Notebooks (uv)
shell: bash shell: bash
run: | run: |
source .venv/bin/activate source .venv/bin/activate
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
- name: Test Selected Bonus Materials - name: Test Selected Bonus Materials
shell: bash shell: bash

View File

@@ -47,20 +47,20 @@ jobs:
shell: bash shell: bash
run: | run: |
source .venv/bin/activate source .venv/bin/activate
pytest --ruff setup/02_installing-python-libraries/tests.py pytest setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py pytest ch04/01_main-chapter-code/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py pytest ch05/01_main-chapter-code/tests.py
pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py
pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py pytest ch05/11_qwen3/tests/test_qwen3_nb.py
pytest --ruff ch05/12_gemma3/tests/test_gemma3_nb.py pytest ch05/12_gemma3/tests/test_gemma3_nb.py
pytest --ruff ch05/12_gemma3/tests/test_gemma3_kv_nb.py pytest ch05/12_gemma3/tests/test_gemma3_kv_nb.py
pytest --ruff ch06/01_main-chapter-code/tests.py pytest ch06/01_main-chapter-code/tests.py
- name: Validate Selected Jupyter Notebooks (uv) - name: Validate Selected Jupyter Notebooks (uv)
shell: bash shell: bash
run: | run: |
source .venv/bin/activate source .venv/bin/activate
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

View File

@@ -43,14 +43,14 @@ jobs:
- name: Test Selected Python Scripts - name: Test Selected Python Scripts
run: | run: |
source .venv/bin/activate source .venv/bin/activate
pytest --ruff setup/02_installing-python-libraries/tests.py pytest setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py pytest ch04/01_main-chapter-code/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py pytest ch05/01_main-chapter-code/tests.py
pytest --ruff ch06/01_main-chapter-code/tests.py pytest ch06/01_main-chapter-code/tests.py
- name: Validate Selected Jupyter Notebooks - name: Validate Selected Jupyter Notebooks
run: | run: |
source .venv/bin/activate source .venv/bin/activate
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

View File

@@ -46,14 +46,14 @@ jobs:
- name: Test Selected Python Scripts - name: Test Selected Python Scripts
run: | run: |
source .venv/bin/activate source .venv/bin/activate
pytest --ruff setup/02_installing-python-libraries/tests.py pytest setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py pytest ch04/01_main-chapter-code/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py pytest ch05/01_main-chapter-code/tests.py
pytest --ruff ch06/01_main-chapter-code/tests.py pytest ch06/01_main-chapter-code/tests.py
- name: Validate Selected Jupyter Notebooks - name: Validate Selected Jupyter Notebooks
run: | run: |
source .venv/bin/activate source .venv/bin/activate
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

View File

@@ -47,14 +47,14 @@ jobs:
- name: Test Selected Python Scripts - name: Test Selected Python Scripts
shell: pixi run --environment tests bash -e {0} shell: pixi run --environment tests bash -e {0}
run: | run: |
pytest --ruff setup/02_installing-python-libraries/tests.py pytest setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py pytest ch04/01_main-chapter-code/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py pytest ch05/01_main-chapter-code/tests.py
pytest --ruff ch06/01_main-chapter-code/tests.py pytest ch06/01_main-chapter-code/tests.py
- name: Validate Selected Jupyter Notebooks - name: Validate Selected Jupyter Notebooks
shell: pixi run --environment tests bash -e {0} shell: pixi run --environment tests bash -e {0}
run: | run: |
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

View File

@@ -39,14 +39,14 @@ jobs:
- name: Test Selected Python Scripts - name: Test Selected Python Scripts
run: | run: |
source .venv/bin/activate source .venv/bin/activate
pytest --ruff setup/02_installing-python-libraries/tests.py pytest setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py pytest ch04/01_main-chapter-code/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py pytest ch05/01_main-chapter-code/tests.py
pytest --ruff ch06/01_main-chapter-code/tests.py pytest ch06/01_main-chapter-code/tests.py
- name: Validate Selected Jupyter Notebooks - name: Validate Selected Jupyter Notebooks
run: | run: |
source .venv/bin/activate source .venv/bin/activate
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

View File

@@ -49,18 +49,18 @@ jobs:
shell: bash shell: bash
run: | run: |
source .venv/Scripts/activate source .venv/Scripts/activate
pytest --ruff setup/02_installing-python-libraries/tests.py pytest setup/02_installing-python-libraries/tests.py
pytest --ruff ch04/01_main-chapter-code/tests.py pytest ch04/01_main-chapter-code/tests.py
pytest --ruff ch05/01_main-chapter-code/tests.py pytest ch05/01_main-chapter-code/tests.py
pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py
pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py pytest ch05/11_qwen3/tests/test_qwen3_nb.py
pytest --ruff ch06/01_main-chapter-code/tests.py pytest ch06/01_main-chapter-code/tests.py
- name: Run Jupyter Notebook Tests - name: Run Jupyter Notebook Tests
shell: bash shell: bash
run: | run: |
source .venv/Scripts/activate source .venv/Scripts/activate
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

View File

@@ -121,19 +121,40 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"import os\n", "import os\n",
"import urllib.request\n", "import requests\n",
"\n", "\n",
"file_path = \"the-verdict.txt\"\n", "file_path = \"the-verdict.txt\"\n",
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n", "url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
"\n", "\n",
"if not os.path.exists(file_path):\n", "if not os.path.exists(file_path):\n",
" response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" text_data = response.text\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n",
"else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()\n",
"\n",
"# The book originally used the code shown below.\n",
"# However, urllib uses older protocol settings that\n",
"# can cause problems for some readers using a VPN.\n",
"# The `requests` version above is more robust\n",
"# in that regard.\n",
"\n",
"\"\"\"\n",
"import os\n",
"import urllib.request\n",
"\n",
"if not os.path.exists(file_path):\n",
" with urllib.request.urlopen(url) as response:\n", " with urllib.request.urlopen(url) as response:\n",
" text_data = response.read().decode('utf-8')\n", " text_data = response.read().decode('utf-8')\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n", " with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n", " file.write(text_data)\n",
"else:\n", "else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()" " text_data = file.read()\n",
"\"\"\""
] ]
}, },
{ {

View File

@@ -190,7 +190,8 @@
} }
], ],
"source": [ "source": [
"import urllib\n", "# import urllib\n",
"import requests\n",
"from pathlib import Path\n", "from pathlib import Path\n",
"import pandas as pd\n", "import pandas as pd\n",
"from previous_chapters import (\n", "from previous_chapters import (\n",
@@ -215,13 +216,20 @@
"extracted_path = \"sms_spam_collection\"\n", "extracted_path = \"sms_spam_collection\"\n",
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n", "data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
"\n", "\n",
"\n",
"try:\n", "try:\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n", " download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n", "except (requests.exceptions.RequestException, TimeoutError) as e:\n",
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n", " print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n", " url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n", " download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"\n", "\n",
"# The book originally used\n",
"# except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
"# in the code above.\n",
"# However, some VPN users reported issues with `urllib`, so the code was updated\n",
"# to use `requests` instead\n",
"\n",
"df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n", "df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
"balanced_df = create_balanced_dataset(df)\n", "balanced_df = create_balanced_dataset(df)\n",
"balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})\n", "balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})\n",

View File

@@ -9,12 +9,12 @@
import os import os
from pathlib import Path from pathlib import Path
import urllib
import zipfile import zipfile
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import requests
import tiktoken import tiktoken
import torch import torch
import torch.nn as nn import torch.nn as nn
@@ -367,9 +367,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
return return
# Downloading the file # Downloading the file
with urllib.request.urlopen(url) as response: response = requests.get(url, stream=True, timeout=60)
with open(zip_path, "wb") as out_file: response.raise_for_status()
out_file.write(response.read()) with open(zip_path, "wb") as out_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
out_file.write(chunk)
# Unzipping the file # Unzipping the file
with zipfile.ZipFile(zip_path, "r") as zip_ref: with zipfile.ZipFile(zip_path, "r") as zip_ref:

View File

@@ -163,6 +163,30 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import os\n",
"import requests\n",
"\n",
"if not os.path.exists(\"the-verdict.txt\"):\n",
" url = (\n",
" \"https://raw.githubusercontent.com/rasbt/\"\n",
" \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
" \"the-verdict.txt\"\n",
" )\n",
" file_path = \"the-verdict.txt\"\n",
"\n",
" response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" with open(file_path, \"wb\") as f:\n",
" f.write(response.content)\n",
"\n",
"\n",
"# The book originally used the code shown below.\n",
"# However, urllib uses older protocol settings that\n",
"# can cause problems for some readers using a VPN.\n",
"# The `requests` version above is more robust\n",
"# in that regard.\n",
"\n",
"\"\"\"\n",
"import os\n", "import os\n",
"import urllib.request\n", "import urllib.request\n",
"\n", "\n",
@@ -171,7 +195,8 @@
" \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n", " \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
" \"the-verdict.txt\")\n", " \"the-verdict.txt\")\n",
" file_path = \"the-verdict.txt\"\n", " file_path = \"the-verdict.txt\"\n",
" urllib.request.urlretrieve(url, file_path)" " urllib.request.urlretrieve(url, file_path)\n",
"\"\"\""
] ]
}, },
{ {

View File

@@ -823,7 +823,7 @@
], ],
"source": [ "source": [
"import os\n", "import os\n",
"import urllib.request\n", "import requests\n",
"\n", "\n",
"def download_file_if_absent(url, filename, search_dirs):\n", "def download_file_if_absent(url, filename, search_dirs):\n",
" for directory in search_dirs:\n", " for directory in search_dirs:\n",
@@ -834,13 +834,19 @@
"\n", "\n",
" target_path = os.path.join(search_dirs[0], filename)\n", " target_path = os.path.join(search_dirs[0], filename)\n",
" try:\n", " try:\n",
" with urllib.request.urlopen(url) as response, open(target_path, \"wb\") as out_file:\n", " response = requests.get(url, stream=True, timeout=60)\n",
" out_file.write(response.read())\n", " response.raise_for_status()\n",
" with open(target_path, \"wb\") as out_file:\n",
" for chunk in response.iter_content(chunk_size=8192):\n",
" if chunk:\n",
" out_file.write(chunk)\n",
" print(f\"Downloaded {filename} to {target_path}\")\n", " print(f\"Downloaded {filename} to {target_path}\")\n",
" except Exception as e:\n", " except Exception as e:\n",
" print(f\"Failed to download {filename}. Error: {e}\")\n", " print(f\"Failed to download {filename}. Error: {e}\")\n",
"\n",
" return target_path\n", " return target_path\n",
"\n", "\n",
"\n",
"verdict_path = download_file_if_absent(\n", "verdict_path = download_file_if_absent(\n",
" url=(\n", " url=(\n",
" \"https://raw.githubusercontent.com/rasbt/\"\n", " \"https://raw.githubusercontent.com/rasbt/\"\n",

View File

@@ -793,19 +793,43 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"import os\n", "import os\n",
"import urllib.request\n", "import requests\n",
"\n", "\n",
"file_path = \"the-verdict.txt\"\n", "file_path = \"the-verdict.txt\"\n",
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n", "url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
"\n", "\n",
"if not os.path.exists(file_path):\n", "if not os.path.exists(file_path):\n",
" with urllib.request.urlopen(url) as response:\n", " response = requests.get(url, timeout=30)\n",
" text_data = response.read().decode('utf-8')\n", " response.raise_for_status()\n",
" text_data = response.text\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n", " with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n", " file.write(text_data)\n",
"else:\n", "else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()" " text_data = file.read()\n",
"\n",
"\n",
"# The book originally used the code shown below.\n",
"# However, urllib uses older protocol settings that\n",
"# can cause problems for some readers using a VPN.\n",
"# The `requests` version above is more robust\n",
"# in that regard.\n",
"\n",
" \n",
"# import os\n",
"# import urllib.request\n",
"\n",
"# file_path = \"the-verdict.txt\"\n",
"# url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
"\n",
"# if not os.path.exists(file_path):\n",
"# with urllib.request.urlopen(url) as response:\n",
"# text_data = response.read().decode('utf-8')\n",
"# with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
"# file.write(text_data)\n",
"# else:\n",
"# with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
"# text_data = file.read()"
] ]
}, },
{ {

View File

@@ -491,7 +491,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"import os\n", "import os\n",
"import urllib.request\n", "import requests\n",
"from previous_chapters import create_dataloader_v1\n", "from previous_chapters import create_dataloader_v1\n",
"\n", "\n",
"\n", "\n",
@@ -499,6 +499,25 @@
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n", "url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
"\n", "\n",
"if not os.path.exists(file_path):\n", "if not os.path.exists(file_path):\n",
" response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" text_data = response.text\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n",
"else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()\n",
"\n",
"# The book originally used the code shown below.\n",
"# However, urllib uses older protocol settings that\n",
"# can cause problems for some readers using a VPN.\n",
"# The `requests` version above is more robust\n",
"# in that regard.\n",
"\n",
"\"\"\"\n",
"import urllib.request\n",
"\n",
"if not os.path.exists(file_path):\n",
" with urllib.request.urlopen(url) as response:\n", " with urllib.request.urlopen(url) as response:\n",
" text_data = response.read().decode('utf-8')\n", " text_data = response.read().decode('utf-8')\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n", " with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
@@ -506,6 +525,7 @@
"else:\n", "else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()\n", " text_data = file.read()\n",
"\"\"\"\n",
"\n", "\n",
"\n", "\n",
"# Train/validation ratio\n", "# Train/validation ratio\n",

View File

@@ -5,9 +5,8 @@
import os import os
import urllib.request
# import requests import requests
import json import json
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
@@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir):
def download_file(url, destination, backup_url=None): def download_file(url, destination, backup_url=None):
def _attempt_download(download_url): def _attempt_download(download_url):
with urllib.request.urlopen(download_url) as response: response = requests.get(download_url, stream=True, timeout=60)
# Get the total file size from headers, defaulting to 0 if not present response.raise_for_status()
file_size = int(response.headers.get("Content-Length", 0))
# Check if file exists and has the same size file_size = int(response.headers.get("Content-Length", 0))
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return True # Indicate success without re-downloading
block_size = 1024 # 1 Kilobyte # Check if file exists and has same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size and file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return True
# Initialize the progress bar with total file size block_size = 1024 # 1 KB
progress_bar_description = os.path.basename(download_url) desc = os.path.basename(download_url)
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
with open(destination, "wb") as file: with open(destination, "wb") as file:
while True: for chunk in response.iter_content(chunk_size=block_size):
chunk = response.read(block_size) if chunk:
if not chunk:
break
file.write(chunk) file.write(chunk)
progress_bar.update(len(chunk)) progress_bar.update(len(chunk))
return True return True
try: try:
if _attempt_download(url): if _attempt_download(url):
return return
except (urllib.error.HTTPError, urllib.error.URLError): except requests.exceptions.RequestException:
if backup_url is not None: if backup_url is not None:
print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}") print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
try: try:
if _attempt_download(backup_url): if _attempt_download(backup_url):
return return
except urllib.error.HTTPError: except requests.exceptions.RequestException:
pass pass
# If we reach here, both attempts have failed
error_message = ( error_message = (
f"Failed to download from both primary URL ({url})" f"Failed to download from both primary URL ({url})"
f"{' and backup URL (' + backup_url + ')' if backup_url else ''}." f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."

View File

@@ -7,9 +7,8 @@ import argparse
import json import json
import numpy as np import numpy as np
import os import os
import urllib.request
# import requests import requests
import tensorflow as tf import tensorflow as tf
import tiktoken import tiktoken
import torch import torch
@@ -60,18 +59,18 @@ def download_and_load_gpt2(model_size, models_dir):
return settings, params return settings, params
"""
def download_file(url, destination): def download_file(url, destination):
# Send a GET request to download the file in streaming mode # Send a GET request to download the file
response = requests.get(url, stream=True) response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
# Get the total file size from headers, defaulting to 0 if not present # Get the total file size from headers, defaulting to 0 if not present
file_size = int(response.headers.get("content-length", 0)) file_size = int(response.headers.get("Content-Length", 0))
# Check if file exists and has the same size # Check if file exists and has the same size
if os.path.exists(destination): if os.path.exists(destination):
file_size_local = os.path.getsize(destination) file_size_local = os.path.getsize(destination)
if file_size == file_size_local: if file_size and file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}") print(f"File already exists and is up-to-date: {destination}")
return return
@@ -79,43 +78,12 @@ def download_file(url, destination):
block_size = 1024 # 1 Kilobyte block_size = 1024 # 1 Kilobyte
# Initialize the progress bar with total file size # Initialize the progress bar with total file size
progress_bar_description = url.split("/")[-1] # Extract filename from URL progress_bar_description = os.path.basename(url)
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
# Open the destination file in binary write mode # Open the destination file in binary write mode
with open(destination, "wb") as file: with open(destination, "wb") as file:
# Iterate over the file data in chunks for chunk in response.iter_content(chunk_size=block_size):
for chunk in response.iter_content(block_size): if chunk:
progress_bar.update(len(chunk)) # Update progress bar
file.write(chunk) # Write the chunk to the file
"""
def download_file(url, destination):
# Send a GET request to download the file
with urllib.request.urlopen(url) as response:
# Get the total file size from headers, defaulting to 0 if not present
file_size = int(response.headers.get("Content-Length", 0))
# Check if file exists and has the same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return
# Define the block size for reading the file
block_size = 1024 # 1 Kilobyte
# Initialize the progress bar with total file size
progress_bar_description = os.path.basename(url) # Extract filename from URL
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
# Open the destination file in binary write mode
with open(destination, "wb") as file:
# Read the file in chunks and write to destination
while True:
chunk = response.read(block_size)
if not chunk:
break
file.write(chunk) file.write(chunk)
progress_bar.update(len(chunk)) # Update progress bar progress_bar.update(len(chunk)) # Update progress bar

View File

@@ -5,8 +5,8 @@
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import os import os
import requests
import torch import torch
import urllib.request
import tiktoken import tiktoken
@@ -141,14 +141,14 @@ def main(gpt_config, settings):
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt" url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
if not os.path.exists(file_path): if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response: response = requests.get(url, timeout=30)
text_data = response.read().decode('utf-8') response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file: with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data) file.write(text_data)
else: else:
with open(file_path, "r", encoding="utf-8") as file: with open(file_path, "r", encoding="utf-8") as file:
text_data = file.read() text_data = file.read()
############################## ##############################
# Initialize model # Initialize model
############################## ##############################

View File

@@ -7,9 +7,7 @@
import pytest import pytest
from gpt_train import main from gpt_train import main
import http.client import requests
from urllib.parse import urlparse
@pytest.fixture @pytest.fixture
def gpt_config(): def gpt_config():
@@ -43,23 +41,23 @@ def test_main(gpt_config, other_settings):
def check_file_size(url, expected_size): def check_file_size(url, expected_size):
parsed_url = urlparse(url) try:
if parsed_url.scheme == "https": response = requests.head(url, allow_redirects=True, timeout=30)
conn = http.client.HTTPSConnection(parsed_url.netloc) if response.status_code != 200:
else: return False, f"{url} not accessible"
conn = http.client.HTTPConnection(parsed_url.netloc)
conn.request("HEAD", parsed_url.path) size = response.headers.get("Content-Length")
response = conn.getresponse() if size is None:
if response.status != 200: return False, "Content-Length header is missing"
return False, f"{url} not accessible"
size = response.getheader("Content-Length") size = int(size)
if size is None: if size != expected_size:
return False, "Content-Length header is missing" return False, f"{url} file has expected size {expected_size}, but got {size}"
size = int(size)
if size != expected_size: return True, f"{url} file size is correct"
return False, f"{url} file has expected size {expected_size}, but got {size}"
return True, f"{url} file size is correct" except requests.exceptions.RequestException as e:
return False, f"Failed to access {url}: {e}"
def test_model_files(): def test_model_files():

View File

@@ -134,7 +134,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"import os\n", "import os\n",
"import urllib.request\n", "import requests\n",
"from safetensors.torch import load_file\n", "from safetensors.torch import load_file\n",
"\n", "\n",
"URL_DIR = {\n", "URL_DIR = {\n",
@@ -149,7 +149,10 @@
"\n", "\n",
"# Download file\n", "# Download file\n",
"if not os.path.exists(output_file):\n", "if not os.path.exists(output_file):\n",
" urllib.request.urlretrieve(url, output_file)\n", " response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" with open(output_file, \"wb\") as f:\n",
" f.write(response.content)\n",
"\n", "\n",
"# Load file\n", "# Load file\n",
"state_dict = load_file(output_file)" "state_dict = load_file(output_file)"

View File

@@ -144,12 +144,15 @@
], ],
"source": [ "source": [
"import os\n", "import os\n",
"import urllib.request\n", "import requests\n",
"\n", "\n",
"url = f\"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}\"\n", "url = f\"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}\"\n",
"\n", "\n",
"if not os.path.exists(file_name):\n", "if not os.path.exists(file_name):\n",
" urllib.request.urlretrieve(url, file_name)\n", " response = requests.get(url, timeout=60)\n",
" response.raise_for_status()\n",
" with open(file_name, \"wb\") as f:\n",
" f.write(response.content)\n",
" print(f\"Downloaded to {file_name}\")" " print(f\"Downloaded to {file_name}\")"
] ]
}, },
@@ -276,12 +279,15 @@
], ],
"source": [ "source": [
"import os\n", "import os\n",
"import urllib.request\n", "import requests\n",
"\n", "\n",
"url = f\"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}\"\n", "url = f\"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}\"\n",
"\n", "\n",
"if not os.path.exists(file_name):\n", "if not os.path.exists(file_name):\n",
" urllib.request.urlretrieve(url, file_name)\n", " response = requests.get(url, timeout=60)\n",
" response.raise_for_status()\n",
" with open(file_name, \"wb\") as f:\n",
" f.write(response.content)\n",
" print(f\"Downloaded to {file_name}\")" " print(f\"Downloaded to {file_name}\")"
] ]
}, },

View File

@@ -58,12 +58,17 @@ This automatically downloads the weight file based on the model choice above:
```python ```python
import os import os
import urllib.request import requests
url = f"https://huggingface.co/rasbt/llama-3.2-from-scratch/resolve/main/{MODEL_FILE}" url = f"https://huggingface.co/rasbt/llama-3.2-from-scratch/resolve/main/{MODEL_FILE}"
if not os.path.exists(MODEL_FILE): if not os.path.exists(MODEL_FILE):
urllib.request.urlretrieve(url, MODEL_FILE) response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
with open(MODEL_FILE, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
print(f"Downloaded to {MODEL_FILE}") print(f"Downloaded to {MODEL_FILE}")
``` ```

View File

@@ -6,9 +6,9 @@
import os import os
import time import time
import urllib.request
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import requests
import torch import torch
import torch.nn as nn import torch.nn as nn
from torch.utils.data import Dataset, DataLoader from torch.utils.data import Dataset, DataLoader
@@ -397,8 +397,9 @@ def main(gpt_config, settings):
url = "https://www.gutenberg.org/cache/epub/145/pg145.txt" url = "https://www.gutenberg.org/cache/epub/145/pg145.txt"
if not os.path.exists(file_path): if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response: response = requests.get(url, timeout=30)
text_data = response.read().decode('utf-8') response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file: with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data) file.write(text_data)
else: else:

View File

@@ -6,9 +6,9 @@
import os import os
import time import time
import urllib.request
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import requests
import torch import torch
import torch.nn as nn import torch.nn as nn
from torch.utils.data import Dataset, DataLoader from torch.utils.data import Dataset, DataLoader
@@ -468,11 +468,11 @@ def main(gpt_config, settings, rank, world_size):
# NEW: Only download 1 time # NEW: Only download 1 time
if rank == 0: if rank == 0:
if not os.path.exists(file_path): if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response: response = requests.get(url, timeout=30)
text_data = response.read().decode('utf-8') response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file: with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data) file.write(text_data)
# NEW: All processes wait until rank 0 is done, using the GPU index. # NEW: All processes wait until rank 0 is done, using the GPU index.
torch.distributed.barrier(device_ids=[device.index]) torch.distributed.barrier(device_ids=[device.index])

View File

@@ -186,6 +186,56 @@
} }
], ],
"source": [ "source": [
"import requests\n",
"import zipfile\n",
"import os\n",
"from pathlib import Path\n",
"\n",
"url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
"zip_path = \"sms_spam_collection.zip\"\n",
"extracted_path = \"sms_spam_collection\"\n",
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
"\n",
"\n",
"def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):\n",
" if data_file_path.exists():\n",
" print(f\"{data_file_path} already exists. Skipping download and extraction.\")\n",
" return\n",
"\n",
" # Downloading the file\n",
" response = requests.get(url, stream=True, timeout=60)\n",
" response.raise_for_status()\n",
" with open(zip_path, \"wb\") as out_file:\n",
" for chunk in response.iter_content(chunk_size=8192):\n",
" if chunk:\n",
" out_file.write(chunk)\n",
"\n",
" # Unzipping the file\n",
" with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n",
" zip_ref.extractall(extracted_path)\n",
"\n",
" # Add .tsv file extension\n",
" original_file_path = Path(extracted_path) / \"SMSSpamCollection\"\n",
" os.rename(original_file_path, data_file_path)\n",
" print(f\"File downloaded and saved as {data_file_path}\")\n",
"\n",
"\n",
"try:\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"except (requests.exceptions.RequestException, TimeoutError) as e:\n",
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"\n",
"\n",
"\n",
"# The book originally used the urllib-based code below.\n",
"# However, urllib uses older protocol settings that\n",
"# can cause problems for some readers using a VPN.\n",
"# The `requests` version above is more robust\n",
"# in that regard.\n",
"\n",
"\"\"\"\n",
"import urllib.request\n", "import urllib.request\n",
"import zipfile\n", "import zipfile\n",
"import os\n", "import os\n",
@@ -220,7 +270,8 @@
"except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n", "except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n", " print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n", " url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) " " download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"\"\"\""
] ]
}, },
{ {

View File

@@ -5,7 +5,7 @@
# This is a summary file containing the main takeaways from chapter 6. # This is a summary file containing the main takeaways from chapter 6.
import urllib.request import requests
import zipfile import zipfile
import os import os
from pathlib import Path from pathlib import Path
@@ -27,9 +27,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
return return
# Downloading the file # Downloading the file
with urllib.request.urlopen(url) as response: response = requests.get(url, stream=True, timeout=60)
with open(zip_path, "wb") as out_file: response.raise_for_status()
out_file.write(response.read()) with open(zip_path, "wb") as out_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
out_file.write(chunk)
# Unzipping the file # Unzipping the file
with zipfile.ZipFile(zip_path, "r") as zip_ref: with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -259,7 +262,7 @@ if __name__ == "__main__":
try: try:
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e: except (requests.exceptions.RequestException, TimeoutError) as e:
print(f"Primary URL failed: {e}. Trying backup URL...") print(f"Primary URL failed: {e}. Trying backup URL...")
url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

View File

@@ -8,10 +8,10 @@ import math
import os import os
from pathlib import Path from pathlib import Path
import time import time
import urllib.request
import zipfile import zipfile
import pandas as pd import pandas as pd
import requests
import tiktoken import tiktoken
import torch import torch
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
@@ -113,9 +113,12 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path):
return return
# Downloading the file # Downloading the file
with urllib.request.urlopen(url) as response: response = requests.get(url, stream=True, timeout=60)
with open(zip_path, "wb") as out_file: response.raise_for_status()
out_file.write(response.read()) with open(zip_path, "wb") as out_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
out_file.write(chunk)
# Unzipping the file # Unzipping the file
with zipfile.ZipFile(zip_path, "r") as zip_ref: with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -608,11 +611,11 @@ if __name__ == "__main__":
base_path = Path(".") base_path = Path(".")
file_names = ["train.csv", "validation.csv", "test.csv"] file_names = ["train.csv", "validation.csv", "test.csv"]
all_exist = all((base_path / file_name).exists() for file_name in file_names) all_exist = all((base_path / file_name).exists() for file_name in file_names)
if not all_exist: if not all_exist:
try: try:
download_and_unzip(url, zip_path, extract_to, new_file_path) download_and_unzip(url, zip_path, extract_to, new_file_path)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e: except (requests.exceptions.RequestException, TimeoutError) as e:
print(f"Primary URL failed: {e}. Trying backup URL...") print(f"Primary URL failed: {e}. Trying backup URL...")
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
download_and_unzip(backup_url, zip_path, extract_to, new_file_path) download_and_unzip(backup_url, zip_path, extract_to, new_file_path)

View File

@@ -7,7 +7,7 @@ import os
import sys import sys
import tarfile import tarfile
import time import time
import urllib.request import requests
import pandas as pd import pandas as pd
@@ -32,7 +32,15 @@ def download_and_extract_dataset(dataset_url, target_file, directory):
if not os.path.exists(directory): if not os.path.exists(directory):
if os.path.exists(target_file): if os.path.exists(target_file):
os.remove(target_file) os.remove(target_file)
urllib.request.urlretrieve(dataset_url, target_file, reporthook)
response = requests.get(dataset_url, stream=True, timeout=60)
response.raise_for_status()
with open(target_file, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
print("\nExtracting dataset ...") print("\nExtracting dataset ...")
with tarfile.open(target_file, "r:gz") as tar: with tarfile.open(target_file, "r:gz") as tar:
tar.extractall() tar.extractall()

View File

@@ -7,7 +7,7 @@ import argparse
import os import os
from pathlib import Path from pathlib import Path
import time import time
import urllib import requests
import zipfile import zipfile
import pandas as pd import pandas as pd
@@ -62,9 +62,12 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path):
return return
# Downloading the file # Downloading the file
with urllib.request.urlopen(url) as response: response = requests.get(url, stream=True, timeout=60)
with open(zip_path, "wb") as out_file: response.raise_for_status()
out_file.write(response.read()) with open(zip_path, "wb") as out_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
out_file.write(chunk)
# Unzipping the file # Unzipping the file
with zipfile.ZipFile(zip_path, "r") as zip_ref: with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -412,7 +415,7 @@ if __name__ == "__main__":
if not all_exist: if not all_exist:
try: try:
download_and_unzip(url, zip_path, extract_to, new_file_path) download_and_unzip(url, zip_path, extract_to, new_file_path)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e: except (requests.exceptions.RequestException, TimeoutError) as e:
print(f"Primary URL failed: {e}. Trying backup URL...") print(f"Primary URL failed: {e}. Trying backup URL...")
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
download_and_unzip(backup_url, zip_path, extract_to, new_file_path) download_and_unzip(backup_url, zip_path, extract_to, new_file_path)

View File

@@ -169,10 +169,33 @@
"source": [ "source": [
"import json\n", "import json\n",
"import os\n", "import os\n",
"import urllib\n", "import requests\n",
"\n", "\n",
"\n", "\n",
"def download_and_load_file(file_path, url):\n", "def download_and_load_file(file_path, url):\n",
" if not os.path.exists(file_path):\n",
" response = requests.get(url, timeout=30)\n",
" response.raise_for_status()\n",
" text_data = response.text\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n",
"\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" data = json.load(file)\n",
"\n",
" return data\n",
"\n",
"\n",
"# The book originally used the urllib-based code below.\n",
"# However, urllib uses older protocol settings that\n",
"# can cause problems for some readers using a VPN.\n",
"# The `requests` version above is more robust\n",
"# in that regard.\n",
"\n",
"\"\"\"\n",
"import urllib\n",
"\n",
"def download_and_load_file(file_path, url):\n",
"\n", "\n",
" if not os.path.exists(file_path):\n", " if not os.path.exists(file_path):\n",
" with urllib.request.urlopen(url) as response:\n", " with urllib.request.urlopen(url) as response:\n",
@@ -180,15 +203,15 @@
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n", " with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n", " file.write(text_data)\n",
"\n", "\n",
" # The book originally contained this unnecessary \"else\" clause:\n", " else:\n",
" #else:\n", " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" # with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", " text_data = file.read()\n",
" # text_data = file.read()\n",
"\n", "\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" data = json.load(file)\n", " data = json.load(file)\n",
"\n", "\n",
" return data\n", " return data\n",
"\"\"\"\n",
"\n", "\n",
"\n", "\n",
"file_path = \"instruction-data.json\"\n", "file_path = \"instruction-data.json\"\n",
@@ -2490,7 +2513,8 @@
} }
], ],
"source": [ "source": [
"import urllib.request\n", "import requests # noqa: F811\n",
"# import urllib.request\n",
"\n", "\n",
"def query_model(\n", "def query_model(\n",
" prompt,\n", " prompt,\n",
@@ -2512,7 +2536,8 @@
" }\n", " }\n",
" }\n", " }\n",
"\n", "\n",
"\n", " \n",
" \"\"\"\n",
" # Convert the dictionary to a JSON formatted string and encode it to bytes\n", " # Convert the dictionary to a JSON formatted string and encode it to bytes\n",
" payload = json.dumps(data).encode(\"utf-8\")\n", " payload = json.dumps(data).encode(\"utf-8\")\n",
"\n", "\n",
@@ -2536,6 +2561,26 @@
" response_data += response_json[\"message\"][\"content\"]\n", " response_data += response_json[\"message\"][\"content\"]\n",
"\n", "\n",
" return response_data\n", " return response_data\n",
" \"\"\"\n",
"\n",
"    # The book originally used the commented-out code above, which is based\n",
"    # on urllib. It generally works fine, but some readers reported\n",
"    # issues with using urllib when using a (company) VPN.\n",
"    # The code below uses the requests library, which doesn't seem\n",
"    # to have these issues.\n",
"\n",
" # Send the POST request\n",
" with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
" r.raise_for_status()\n",
" response_data = \"\"\n",
" for line in r.iter_lines(decode_unicode=True):\n",
" if not line:\n",
" continue\n",
" response_json = json.loads(line)\n",
" if \"message\" in response_json:\n",
" response_data += response_json[\"message\"][\"content\"]\n",
"\n",
" return response_data\n",
"\n", "\n",
"\n", "\n",
"model = \"llama3\"\n", "model = \"llama3\"\n",

View File

@@ -12,10 +12,10 @@ import math
import os import os
import re import re
import time import time
import urllib
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator from matplotlib.ticker import MaxNLocator
import requests
import tiktoken import tiktoken
import torch import torch
from torch.utils.data import Dataset, DataLoader from torch.utils.data import Dataset, DataLoader
@@ -234,17 +234,17 @@ def custom_collate_with_masking_fn(
def download_and_load_file(file_path, url): def download_and_load_file(file_path, url):
if not os.path.exists(file_path): if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response: response = requests.get(url, timeout=30)
text_data = response.read().decode("utf-8") response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file: with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data) file.write(text_data)
else: else:
with open(file_path, "r", encoding="utf-8") as file: with open(file_path, "r", encoding="utf-8") as file:
text_data = file.read() text_data = file.read()
with open(file_path, "r") as file: with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file) data = json.load(file)
return data return data

View File

@@ -5,11 +5,10 @@
import os import os
import urllib.request
# import requests
import json import json
import numpy as np import numpy as np
import requests
import tensorflow as tf import tensorflow as tf
from tqdm import tqdm from tqdm import tqdm
@@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir):
def download_file(url, destination, backup_url=None): def download_file(url, destination, backup_url=None):
def _attempt_download(download_url): def _attempt_download(download_url):
with urllib.request.urlopen(download_url) as response: response = requests.get(download_url, stream=True, timeout=60)
# Get the total file size from headers, defaulting to 0 if not present response.raise_for_status()
file_size = int(response.headers.get("Content-Length", 0))
# Check if file exists and has the same size file_size = int(response.headers.get("Content-Length", 0))
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return True # Indicate success without re-downloading
block_size = 1024 # 1 Kilobyte # Check if file exists and has same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size and file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return True
# Initialize the progress bar with total file size block_size = 1024 # 1 KB
progress_bar_description = os.path.basename(download_url) desc = os.path.basename(download_url)
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
with open(destination, "wb") as file: with open(destination, "wb") as file:
while True: for chunk in response.iter_content(chunk_size=block_size):
chunk = response.read(block_size) if chunk:
if not chunk:
break
file.write(chunk) file.write(chunk)
progress_bar.update(len(chunk)) progress_bar.update(len(chunk))
return True return True
try: try:
if _attempt_download(url): if _attempt_download(url):
return return
except (urllib.error.HTTPError, urllib.error.URLError): except requests.exceptions.RequestException:
if backup_url is not None: if backup_url is not None:
print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}") print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
try: try:
if _attempt_download(backup_url): if _attempt_download(backup_url):
return return
except urllib.error.HTTPError: except requests.exceptions.RequestException:
pass pass
# If we reach here, both attempts have failed
error_message = ( error_message = (
f"Failed to download from both primary URL ({url})" f"Failed to download from both primary URL ({url})"
f"{' and backup URL (' + backup_url + ')' if backup_url else ''}." f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."
@@ -97,37 +92,6 @@ def download_file(url, destination, backup_url=None):
print(f"An unexpected error occurred: {e}") print(f"An unexpected error occurred: {e}")
# Alternative way using `requests`
"""
def download_file(url, destination):
# Send a GET request to download the file in streaming mode
response = requests.get(url, stream=True)
# Get the total file size from headers, defaulting to 0 if not present
file_size = int(response.headers.get("content-length", 0))
# Check if file exists and has the same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return
# Define the block size for reading the file
block_size = 1024 # 1 Kilobyte
# Initialize the progress bar with total file size
progress_bar_description = url.split("/")[-1] # Extract filename from URL
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
# Open the destination file in binary write mode
with open(destination, "wb") as file:
# Iterate over the file data in chunks
for chunk in response.iter_content(block_size):
progress_bar.update(len(chunk)) # Update progress bar
file.write(chunk) # Write the chunk to the file
"""
def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
# Initialize parameters dictionary with empty blocks for each layer # Initialize parameters dictionary with empty blocks for each layer
params = {"blocks": [{} for _ in range(settings["n_layer"])]} params = {"blocks": [{} for _ in range(settings["n_layer"])]}

View File

@@ -11,9 +11,9 @@ import json
import os import os
import re import re
import time import time
import urllib
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import requests
import tiktoken import tiktoken
import torch import torch
from torch.utils.data import Dataset, DataLoader from torch.utils.data import Dataset, DataLoader
@@ -97,14 +97,14 @@ def custom_collate_fn(
def download_and_load_file(file_path, url): def download_and_load_file(file_path, url):
if not os.path.exists(file_path): if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response: response = requests.get(url, timeout=30)
text_data = response.read().decode("utf-8") response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file: with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data) file.write(text_data)
with open(file_path, "r") as file: with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file) data = json.load(file)
return data return data

View File

@@ -8,7 +8,7 @@
import json import json
import psutil import psutil
from tqdm import tqdm from tqdm import tqdm
import urllib.request import requests
def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"): def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"):
@@ -25,23 +25,16 @@ def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"):
} }
} }
# Convert the dictionary to a JSON formatted string and encode it to bytes # Send the POST request
payload = json.dumps(data).encode("utf-8") with requests.post(url, json=data, stream=True, timeout=30) as r:
r.raise_for_status()
# Create a request object, setting the method to POST and adding necessary headers response_data = ""
request = urllib.request.Request(url, data=payload, method="POST") for line in r.iter_lines(decode_unicode=True):
request.add_header("Content-Type", "application/json")
# Send the request and capture the response
response_data = ""
with urllib.request.urlopen(request) as response:
# Read and decode the response
while True:
line = response.readline().decode("utf-8")
if not line: if not line:
break continue
response_json = json.loads(line) response_json = json.loads(line)
response_data += response_json["message"]["content"] if "message" in response_json:
response_data += response_json["message"]["content"]
return response_data return response_data

View File

@@ -215,8 +215,8 @@
} }
], ],
"source": [ "source": [
"import urllib.request\n",
"import json\n", "import json\n",
"import requests\n",
"\n", "\n",
"\n", "\n",
"def query_model(prompt, model=\"llama3\", url=\"http://localhost:11434/api/chat\"):\n", "def query_model(prompt, model=\"llama3\", url=\"http://localhost:11434/api/chat\"):\n",
@@ -236,27 +236,19 @@
" }\n", " }\n",
" }\n", " }\n",
"\n", "\n",
" # Convert the dictionary to a JSON formatted string and encode it to bytes\n", " # Send the POST request\n",
" payload = json.dumps(data).encode(\"utf-8\")\n", " with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
"\n", " r.raise_for_status()\n",
" # Create a request object, setting the method to POST and adding necessary headers\n", " response_data = \"\"\n",
" request = urllib.request.Request(url, data=payload, method=\"POST\")\n", " for line in r.iter_lines(decode_unicode=True):\n",
" request.add_header(\"Content-Type\", \"application/json\")\n",
"\n",
" # Send the request and capture the response\n",
" response_data = \"\"\n",
" with urllib.request.urlopen(request) as response:\n",
" # Read and decode the response\n",
" while True:\n",
" line = response.readline().decode(\"utf-8\")\n",
" if not line:\n", " if not line:\n",
" break\n", " continue\n",
" response_json = json.loads(line)\n", " response_json = json.loads(line)\n",
" response_data += response_json[\"message\"][\"content\"]\n", " if \"message\" in response_json:\n",
" response_data += response_json[\"message\"][\"content\"]\n",
"\n", "\n",
" return response_data\n", " return response_data\n",
"\n", "\n",
"\n",
"result = query_model(\"What do Llamas eat?\")\n", "result = query_model(\"What do Llamas eat?\")\n",
"print(result)" "print(result)"
] ]
@@ -640,7 +632,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.6" "version": "3.10.16"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@@ -274,8 +274,8 @@
} }
], ],
"source": [ "source": [
"import urllib.request\n",
"import json\n", "import json\n",
"import requests\n",
"\n", "\n",
"\n", "\n",
"def query_model(prompt, model=\"llama3.1:70b\", url=\"http://localhost:11434/api/chat\"):\n", "def query_model(prompt, model=\"llama3.1:70b\", url=\"http://localhost:11434/api/chat\"):\n",
@@ -294,23 +294,16 @@
" }\n", " }\n",
" }\n", " }\n",
"\n", "\n",
" # Convert the dictionary to a JSON formatted string and encode it to bytes\n", " # Send the POST request\n",
" payload = json.dumps(data).encode(\"utf-8\")\n", " with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
"\n", " r.raise_for_status()\n",
" # Create a request object, setting the method to POST and adding necessary headers\n", " response_data = \"\"\n",
" request = urllib.request.Request(url, data=payload, method=\"POST\")\n", " for line in r.iter_lines(decode_unicode=True):\n",
" request.add_header(\"Content-Type\", \"application/json\")\n",
"\n",
" # Send the request and capture the response\n",
" response_data = \"\"\n",
" with urllib.request.urlopen(request) as response:\n",
" # Read and decode the response\n",
" while True:\n",
" line = response.readline().decode(\"utf-8\")\n",
" if not line:\n", " if not line:\n",
" break\n", " continue\n",
" response_json = json.loads(line)\n", " response_json = json.loads(line)\n",
" response_data += response_json[\"message\"][\"content\"]\n", " if \"message\" in response_json:\n",
" response_data += response_json[\"message\"][\"content\"]\n",
"\n", "\n",
" return response_data\n", " return response_data\n",
"\n", "\n",
@@ -587,7 +580,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.6" "version": "3.10.16"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@@ -231,23 +231,21 @@
"source": [ "source": [
"import json\n", "import json\n",
"import os\n", "import os\n",
"import urllib\n", "import requests\n",
"\n", "\n",
"\n", "\n",
"def download_and_load_file(file_path, url):\n", "def download_and_load_file(file_path, url):\n",
"\n",
" if not os.path.exists(file_path):\n", " if not os.path.exists(file_path):\n",
" with urllib.request.urlopen(url) as response:\n", " response = requests.get(url, timeout=30)\n",
" text_data = response.read().decode(\"utf-8\")\n", " response.raise_for_status()\n",
" text_data = response.text\n",
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n", " with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(text_data)\n", " file.write(text_data)\n",
" else:\n", " else:\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
" text_data = file.read()\n", " text_data = file.read()\n",
"\n", "\n",
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", " data = json.loads(text_data)\n",
" data = json.load(file)\n",
"\n",
" return data\n", " return data\n",
"\n", "\n",
"\n", "\n",

View File

@@ -194,8 +194,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import urllib.request\n",
"import json\n", "import json\n",
"import requests\n",
"\n", "\n",
"def query_model(prompt, model=\"llama3\", url=\"http://localhost:11434/api/chat\", role=\"user\"):\n", "def query_model(prompt, model=\"llama3\", url=\"http://localhost:11434/api/chat\", role=\"user\"):\n",
" # Create the data payload as a dictionary\n", " # Create the data payload as a dictionary\n",
@@ -209,25 +209,21 @@
" ]\n", " ]\n",
" }\n", " }\n",
"\n", "\n",
" # Convert the dictionary to a JSON formatted string and encode it to bytes\n", " # Send the POST request\n",
" payload = json.dumps(data).encode(\"utf-8\")\n", " with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
"\n", " r.raise_for_status()\n",
" # Create a request object, setting the method to POST and adding necessary headers\n", " response_data = \"\"\n",
" request = urllib.request.Request(url, data=payload, method=\"POST\")\n", " for line in r.iter_lines(decode_unicode=True):\n",
" request.add_header(\"Content-Type\", \"application/json\")\n",
"\n",
" # Send the request and capture the response\n",
" response_data = \"\"\n",
" with urllib.request.urlopen(request) as response:\n",
" # Read and decode the response\n",
" while True:\n",
" line = response.readline().decode(\"utf-8\")\n",
" if not line:\n", " if not line:\n",
" break\n", " continue\n",
" response_json = json.loads(line)\n", " response_json = json.loads(line)\n",
" response_data += response_json[\"message\"][\"content\"]\n", " if \"message\" in response_json:\n",
" response_data += response_json[\"message\"][\"content\"]\n",
"\n", "\n",
" return response_data" " return response_data\n",
"\n",
"result = query_model(\"What do Llamas eat?\")\n",
"print(result)"
] ]
}, },
{ {
@@ -498,7 +494,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.11.4" "version": "3.10.16"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@@ -7,11 +7,11 @@ from .ch04 import generate_text_simple
import json import json
import os import os
import urllib.request
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator from matplotlib.ticker import MaxNLocator
import requests
import torch import torch
from tqdm import tqdm from tqdm import tqdm
@@ -279,44 +279,40 @@ def download_and_load_gpt2(model_size, models_dir):
def download_file(url, destination, backup_url=None): def download_file(url, destination, backup_url=None):
def _attempt_download(download_url): def _attempt_download(download_url):
with urllib.request.urlopen(download_url) as response: response = requests.get(download_url, stream=True, timeout=60)
# Get the total file size from headers, defaulting to 0 if not present response.raise_for_status()
file_size = int(response.headers.get("Content-Length", 0))
# Check if file exists and has the same size file_size = int(response.headers.get("Content-Length", 0))
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return True # Indicate success without re-downloading
block_size = 1024 # 1 Kilobyte # Check if file exists and has same size
if os.path.exists(destination):
file_size_local = os.path.getsize(destination)
if file_size and file_size == file_size_local:
print(f"File already exists and is up-to-date: {destination}")
return True
# Initialize the progress bar with total file size block_size = 1024 # 1 KB
progress_bar_description = os.path.basename(download_url) desc = os.path.basename(download_url)
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
with open(destination, "wb") as file: with open(destination, "wb") as file:
while True: for chunk in response.iter_content(chunk_size=block_size):
chunk = response.read(block_size) if chunk:
if not chunk:
break
file.write(chunk) file.write(chunk)
progress_bar.update(len(chunk)) progress_bar.update(len(chunk))
return True return True
try: try:
if _attempt_download(url): if _attempt_download(url):
return return
except (urllib.error.HTTPError, urllib.error.URLError): except requests.exceptions.RequestException:
if backup_url is not None: if backup_url is not None:
print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}") print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
try: try:
if _attempt_download(backup_url): if _attempt_download(backup_url):
return return
except urllib.error.HTTPError: except requests.exceptions.RequestException:
pass pass
# If we reach here, both attempts have failed
error_message = ( error_message = (
f"Failed to download from both primary URL ({url})" f"Failed to download from both primary URL ({url})"
f"{' and backup URL (' + backup_url + ')' if backup_url else ''}." f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."

View File

@@ -4,11 +4,11 @@
# Code: https://github.com/rasbt/LLMs-from-scratch # Code: https://github.com/rasbt/LLMs-from-scratch
import urllib.request
import zipfile import zipfile
import os import os
from pathlib import Path from pathlib import Path
import requests
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from torch.utils.data import Dataset from torch.utils.data import Dataset
import torch import torch
@@ -21,9 +21,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
return return
# Downloading the file # Downloading the file
with urllib.request.urlopen(url) as response: response = requests.get(url, stream=True, timeout=60)
with open(zip_path, "wb") as out_file: response.raise_for_status()
out_file.write(response.read()) with open(zip_path, "wb") as out_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
out_file.write(chunk)
# Unzipping the file # Unzipping the file
with zipfile.ZipFile(zip_path, "r") as zip_ref: with zipfile.ZipFile(zip_path, "r") as zip_ref:

View File

@@ -6,7 +6,7 @@
import json import json
import os import os
import psutil import psutil
import urllib import requests
import torch import torch
from tqdm import tqdm from tqdm import tqdm
@@ -14,24 +14,46 @@ from torch.utils.data import Dataset
def download_and_load_file(file_path, url): def download_and_load_file(file_path, url):
if not os.path.exists(file_path): if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response: response = requests.get(url, timeout=30)
text_data = response.read().decode("utf-8") response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as file: with open(file_path, "w", encoding="utf-8") as file:
file.write(text_data) file.write(text_data)
# The book originally contained this unnecessary "else" clause:
# else:
# with open(file_path, "r", encoding="utf-8") as file:
# text_data = file.read()
with open(file_path, "r", encoding="utf-8") as file: with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file) data = json.load(file)
return data return data
# The book originally used the following code below
# However, urllib uses older protocol settings that
# can cause problems for some readers using a VPN.
# The `requests` version above is more robust
# in that regard.
# import urllib
# def download_and_load_file(file_path, url):
# if not os.path.exists(file_path):
# with urllib.request.urlopen(url) as response:
# text_data = response.read().decode("utf-8")
# with open(file_path, "w", encoding="utf-8") as file:
# file.write(text_data)
# else:
# with open(file_path, "r", encoding="utf-8") as file:
# text_data = file.read()
# with open(file_path, "r", encoding="utf-8") as file:
# data = json.load(file)
# return data
def format_input(entry): def format_input(entry):
instruction_text = ( instruction_text = (
f"Below is an instruction that describes a task. " f"Below is an instruction that describes a task. "
@@ -202,27 +224,16 @@ def query_model(
} }
} }
# Convert the dictionary to a JSON formatted string and encode it to bytes # Send the POST request
payload = json.dumps(data).encode("utf-8") with requests.post(url, json=data, stream=True, timeout=30) as r:
r.raise_for_status()
# Create a request object, setting the method to POST and adding necessary headers response_data = ""
request = urllib.request.Request( for line in r.iter_lines(decode_unicode=True):
url,
data=payload,
method="POST"
)
request.add_header("Content-Type", "application/json")
# Send the request and capture the response
response_data = ""
with urllib.request.urlopen(request) as response:
# Read and decode the response
while True:
line = response.readline().decode("utf-8")
if not line: if not line:
break continue
response_json = json.loads(line) response_json = json.loads(line)
response_data += response_json["message"]["content"] if "message" in response_json:
response_data += response_json["message"]["content"]
return response_data return response_data

View File

@@ -6,9 +6,9 @@
import os import os
import json import json
import re import re
import urllib.request
from pathlib import Path from pathlib import Path
import requests
import torch import torch
import torch.nn as nn import torch.nn as nn
@@ -660,7 +660,12 @@ def download_from_huggingface(repo_id, filename, local_dir, revision="main"):
print(f"File already exists: {dest_path}") print(f"File already exists: {dest_path}")
else: else:
print(f"Downloading {url} to {dest_path}...") print(f"Downloading {url} to {dest_path}...")
urllib.request.urlretrieve(url, dest_path) response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
with open(dest_path, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
return dest_path return dest_path

View File

@@ -12,9 +12,9 @@ from llms_from_scratch.ch06 import (
from llms_from_scratch.appendix_e import replace_linear_with_lora from llms_from_scratch.appendix_e import replace_linear_with_lora
from pathlib import Path from pathlib import Path
import urllib
import pandas as pd import pandas as pd
import requests
import tiktoken import tiktoken
import torch import torch
from torch.utils.data import DataLoader, Subset from torch.utils.data import DataLoader, Subset
@@ -35,7 +35,7 @@ def test_train_classifier_lora(tmp_path):
download_and_unzip_spam_data( download_and_unzip_spam_data(
url, zip_path, extracted_path, data_file_path url, zip_path, extracted_path, data_file_path
) )
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e: except (requests.exceptions.RequestException, TimeoutError) as e:
print(f"Primary URL failed: {e}. Trying backup URL...") print(f"Primary URL failed: {e}. Trying backup URL...")
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
download_and_unzip_spam_data( download_and_unzip_spam_data(

View File

@@ -6,8 +6,8 @@
from llms_from_scratch.ch02 import create_dataloader_v1 from llms_from_scratch.ch02 import create_dataloader_v1
import os import os
import urllib.request
import requests
import pytest import pytest
import torch import torch
@@ -16,11 +16,17 @@ import torch
def test_dataloader(tmp_path, file_name): def test_dataloader(tmp_path, file_name):
if not os.path.exists("the-verdict.txt"): if not os.path.exists("the-verdict.txt"):
url = ("https://raw.githubusercontent.com/rasbt/" url = (
"LLMs-from-scratch/main/ch02/01_main-chapter-code/" "https://raw.githubusercontent.com/rasbt/"
"the-verdict.txt") "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
"the-verdict.txt"
)
file_path = "the-verdict.txt" file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)
response = requests.get(url, timeout=30)
response.raise_for_status()
with open(file_path, "wb") as f:
f.write(response.content)
with open("the-verdict.txt", "r", encoding="utf-8") as f: with open("the-verdict.txt", "r", encoding="utf-8") as f:
raw_text = f.read() raw_text = f.read()

View File

@@ -8,8 +8,8 @@ from llms_from_scratch.ch04 import GPTModel, GPTModelFast
from llms_from_scratch.ch05 import train_model_simple from llms_from_scratch.ch05 import train_model_simple
import os import os
import urllib
import requests
import pytest import pytest
import tiktoken import tiktoken
import torch import torch
@@ -46,8 +46,9 @@ def test_train_simple(tmp_path, ModelClass):
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt" url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
if not os.path.exists(file_path): if not os.path.exists(file_path):
with urllib.request.urlopen(url) as response: response = requests.get(url, timeout=30)
text_data = response.read().decode("utf-8") response.raise_for_status()
text_data = response.text
with open(file_path, "w", encoding="utf-8") as f: with open(file_path, "w", encoding="utf-8") as f:
f.write(text_data) f.write(text_data)
else: else:

View File

@@ -11,8 +11,8 @@ from llms_from_scratch.ch06 import (
) )
from pathlib import Path from pathlib import Path
import urllib
import requests
import pandas as pd import pandas as pd
import tiktoken import tiktoken
import torch import torch
@@ -34,7 +34,7 @@ def test_train_classifier(tmp_path):
download_and_unzip_spam_data( download_and_unzip_spam_data(
url, zip_path, extracted_path, data_file_path url, zip_path, extracted_path, data_file_path
) )
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e: except (requests.exceptions.RequestException, TimeoutError) as e:
print(f"Primary URL failed: {e}. Trying backup URL...") print(f"Primary URL failed: {e}. Trying backup URL...")
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
download_and_unzip_spam_data( download_and_unzip_spam_data(

View File

@@ -9,10 +9,9 @@ import ast
import re import re
import types import types
from pathlib import Path from pathlib import Path
import urllib.request
import urllib.parse
import nbformat import nbformat
import requests
def _extract_imports(src: str): def _extract_imports(src: str):
@@ -125,21 +124,24 @@ def import_definitions_from_notebook(nb_dir_or_path, notebook_name=None, *, extr
exec(src, mod.__dict__) exec(src, mod.__dict__)
return mod return mod
def download_file(url, out_dir="."): def download_file(url, out_dir="."):
"""Simple file download utility for tests.""" """Simple file download utility for tests."""
from pathlib import Path
out_dir = Path(out_dir) out_dir = Path(out_dir)
out_dir.mkdir(parents=True, exist_ok=True) out_dir.mkdir(parents=True, exist_ok=True)
filename = Path(urllib.parse.urlparse(url).path).name filename = Path(url).name
dest = out_dir / filename dest = out_dir / filename
if dest.exists(): if dest.exists():
return dest return dest
try: try:
with urllib.request.urlopen(url) as response: response = requests.get(url, stream=True, timeout=30)
with open(dest, 'wb') as f: response.raise_for_status()
f.write(response.read()) with open(dest, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
return dest return dest
except Exception as e: except Exception as e:
raise RuntimeError(f"Failed to download {url}: {e}") raise RuntimeError(f"Failed to download {url}: {e}")