diff --git a/.github/workflows/basic-tests-latest-python.yml b/.github/workflows/basic-tests-latest-python.yml index 2e60844..3359fed 100644 --- a/.github/workflows/basic-tests-latest-python.yml +++ b/.github/workflows/basic-tests-latest-python.yml @@ -38,14 +38,14 @@ jobs: - name: Test Selected Python Scripts run: | source .venv/bin/activate - pytest --ruff setup/02_installing-python-libraries/tests.py - pytest --ruff ch04/01_main-chapter-code/tests.py - pytest --ruff ch05/01_main-chapter-code/tests.py - pytest --ruff ch06/01_main-chapter-code/tests.py + pytest setup/02_installing-python-libraries/tests.py + pytest ch04/01_main-chapter-code/tests.py + pytest ch05/01_main-chapter-code/tests.py + pytest ch06/01_main-chapter-code/tests.py - name: Validate Selected Jupyter Notebooks run: | source .venv/bin/activate - pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb - pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb - pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb + pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb + pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb + pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb diff --git a/.github/workflows/basic-tests-linux-uv.yml b/.github/workflows/basic-tests-linux-uv.yml index 9109176..66a740b 100644 --- a/.github/workflows/basic-tests-linux-uv.yml +++ b/.github/workflows/basic-tests-linux-uv.yml @@ -47,24 +47,24 @@ jobs: shell: bash run: | source .venv/bin/activate - pytest --ruff setup/02_installing-python-libraries/tests.py - pytest --ruff ch04/01_main-chapter-code/tests.py - pytest --ruff ch04/03_kv-cache/tests.py - pytest --ruff ch05/01_main-chapter-code/tests.py - pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py - pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py - pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py - pytest --ruff ch05/12_gemma3/tests/test_gemma3_nb.py - pytest --ruff ch05/12_gemma3/tests/test_gemma3_kv_nb.py - pytest --ruff ch06/01_main-chapter-code/tests.py + pytest setup/02_installing-python-libraries/tests.py + pytest ch04/01_main-chapter-code/tests.py + pytest ch04/03_kv-cache/tests.py + pytest ch05/01_main-chapter-code/tests.py + pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py + pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py + pytest ch05/11_qwen3/tests/test_qwen3_nb.py + pytest ch05/12_gemma3/tests/test_gemma3_nb.py + pytest ch05/12_gemma3/tests/test_gemma3_kv_nb.py + pytest ch06/01_main-chapter-code/tests.py - name: Validate Selected Jupyter Notebooks (uv) shell: bash run: | source .venv/bin/activate - pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb - pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb - pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb + pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb + pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb + pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb - name: Test Selected Bonus Materials shell: bash diff --git a/.github/workflows/basic-tests-macos-uv.yml b/.github/workflows/basic-tests-macos-uv.yml index e673de7..ec3dbd2 100644 --- a/.github/workflows/basic-tests-macos-uv.yml +++ b/.github/workflows/basic-tests-macos-uv.yml @@ -47,20 +47,20 @@ jobs: shell: bash run: | source .venv/bin/activate - pytest --ruff setup/02_installing-python-libraries/tests.py - pytest --ruff ch04/01_main-chapter-code/tests.py - pytest --ruff ch05/01_main-chapter-code/tests.py - pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py - pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py - pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py - pytest --ruff ch05/12_gemma3/tests/test_gemma3_nb.py - pytest --ruff ch05/12_gemma3/tests/test_gemma3_kv_nb.py - pytest --ruff ch06/01_main-chapter-code/tests.py + pytest setup/02_installing-python-libraries/tests.py + pytest ch04/01_main-chapter-code/tests.py + pytest ch05/01_main-chapter-code/tests.py + pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py + pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py + pytest ch05/11_qwen3/tests/test_qwen3_nb.py + pytest ch05/12_gemma3/tests/test_gemma3_nb.py + pytest ch05/12_gemma3/tests/test_gemma3_kv_nb.py + pytest ch06/01_main-chapter-code/tests.py - name: Validate Selected Jupyter Notebooks (uv) shell: bash run: | source .venv/bin/activate - pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb - pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb - pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb + pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb + pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb + pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb diff --git a/.github/workflows/basic-tests-old-pytorch.yml b/.github/workflows/basic-tests-old-pytorch.yml index 0a7c6d8..b486196 100644 --- a/.github/workflows/basic-tests-old-pytorch.yml +++ b/.github/workflows/basic-tests-old-pytorch.yml @@ -43,14 +43,14 @@ jobs: - name: Test Selected Python Scripts run: | source .venv/bin/activate - pytest --ruff setup/02_installing-python-libraries/tests.py - pytest --ruff ch04/01_main-chapter-code/tests.py - pytest --ruff ch05/01_main-chapter-code/tests.py - pytest --ruff ch06/01_main-chapter-code/tests.py + pytest setup/02_installing-python-libraries/tests.py + pytest ch04/01_main-chapter-code/tests.py + pytest ch05/01_main-chapter-code/tests.py + pytest ch06/01_main-chapter-code/tests.py - name: Validate Selected Jupyter Notebooks run: | source .venv/bin/activate - pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb - pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb - pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb + pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb + pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb + pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb diff --git a/.github/workflows/basic-tests-pip.yml b/.github/workflows/basic-tests-pip.yml index b1d74e3..04d49ec 100644 --- a/.github/workflows/basic-tests-pip.yml +++ b/.github/workflows/basic-tests-pip.yml @@ -46,14 +46,14 @@ jobs: - name: Test Selected Python Scripts run: | source .venv/bin/activate - pytest --ruff setup/02_installing-python-libraries/tests.py - pytest --ruff ch04/01_main-chapter-code/tests.py - pytest --ruff ch05/01_main-chapter-code/tests.py - pytest --ruff ch06/01_main-chapter-code/tests.py + pytest setup/02_installing-python-libraries/tests.py + pytest ch04/01_main-chapter-code/tests.py + pytest ch05/01_main-chapter-code/tests.py + pytest ch06/01_main-chapter-code/tests.py - name: Validate Selected Jupyter Notebooks run: | source .venv/bin/activate - pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb - pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb - pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb \ No newline at end of file + pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb + pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb + pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb \ No newline at end of file diff --git a/.github/workflows/basic-tests-pixi.yml b/.github/workflows/basic-tests-pixi.yml index 85eba28..2195a21 100644 --- a/.github/workflows/basic-tests-pixi.yml +++ b/.github/workflows/basic-tests-pixi.yml @@ -47,14 +47,14 @@ jobs: - name: Test Selected Python Scripts shell: pixi run --environment tests bash -e {0} run: | - pytest --ruff setup/02_installing-python-libraries/tests.py - pytest --ruff ch04/01_main-chapter-code/tests.py - pytest --ruff ch05/01_main-chapter-code/tests.py - pytest --ruff ch06/01_main-chapter-code/tests.py + pytest setup/02_installing-python-libraries/tests.py + pytest ch04/01_main-chapter-code/tests.py + pytest ch05/01_main-chapter-code/tests.py + pytest ch06/01_main-chapter-code/tests.py - name: Validate Selected Jupyter Notebooks shell: pixi run --environment tests bash -e {0} run: | - pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb - pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb - pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb + pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb + pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb + pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb diff --git a/.github/workflows/basic-tests-pytorch-rc.yml b/.github/workflows/basic-tests-pytorch-rc.yml index f563504..e2cbe43 100644 --- a/.github/workflows/basic-tests-pytorch-rc.yml +++ b/.github/workflows/basic-tests-pytorch-rc.yml @@ -39,14 +39,14 @@ jobs: - name: Test Selected Python Scripts run: | source .venv/bin/activate - pytest --ruff setup/02_installing-python-libraries/tests.py - pytest --ruff ch04/01_main-chapter-code/tests.py - pytest --ruff ch05/01_main-chapter-code/tests.py - pytest --ruff ch06/01_main-chapter-code/tests.py + pytest setup/02_installing-python-libraries/tests.py + pytest ch04/01_main-chapter-code/tests.py + pytest ch05/01_main-chapter-code/tests.py + pytest ch06/01_main-chapter-code/tests.py - name: Validate Selected Jupyter Notebooks run: | source .venv/bin/activate - pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb - pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb - pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb + pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb + pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb + pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb diff --git a/.github/workflows/basic-tests-windows-uv-pip.yml b/.github/workflows/basic-tests-windows-uv-pip.yml index 8836332..ff9eee5 100644 --- a/.github/workflows/basic-tests-windows-uv-pip.yml +++ b/.github/workflows/basic-tests-windows-uv-pip.yml @@ -49,18 +49,18 @@ jobs: shell: bash run: | source .venv/Scripts/activate - pytest --ruff setup/02_installing-python-libraries/tests.py - pytest --ruff ch04/01_main-chapter-code/tests.py - pytest --ruff ch05/01_main-chapter-code/tests.py - pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py - pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py - pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py - pytest --ruff ch06/01_main-chapter-code/tests.py + pytest setup/02_installing-python-libraries/tests.py + pytest ch04/01_main-chapter-code/tests.py + pytest ch05/01_main-chapter-code/tests.py + pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py + pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py + pytest ch05/11_qwen3/tests/test_qwen3_nb.py + pytest ch06/01_main-chapter-code/tests.py - name: Run Jupyter Notebook Tests shell: bash run: | source .venv/Scripts/activate - pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb - pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb - pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb \ No newline at end of file + pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb + pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb + pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb \ No newline at end of file diff --git a/appendix-D/01_main-chapter-code/appendix-D.ipynb b/appendix-D/01_main-chapter-code/appendix-D.ipynb index f8a5aca..69621a7 100644 --- a/appendix-D/01_main-chapter-code/appendix-D.ipynb +++ b/appendix-D/01_main-chapter-code/appendix-D.ipynb @@ -121,19 +121,40 @@ "outputs": [], "source": [ "import os\n", - "import urllib.request\n", + "import requests\n", "\n", "file_path = \"the-verdict.txt\"\n", "url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n", "\n", "if not os.path.exists(file_path):\n", + " response = requests.get(url, timeout=30)\n", + " response.raise_for_status()\n", + " text_data = response.text\n", + " with open(file_path, \"w\", encoding=\"utf-8\") as file:\n", + " file.write(text_data)\n", + "else:\n", + " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", + " text_data = file.read()\n", + "\n", + "# The book originally used the following code below\n", + "# However, urllib uses older protocol settings that\n", + "# can cause problems for some readers using a VPN.\n", + "# The `requests` version above is more robust\n", + "# in that regard.\n", + "\n", + "\"\"\"\n", + "import os\n", + "import urllib.request\n", + "\n", + "if not os.path.exists(file_path):\n", " with urllib.request.urlopen(url) as response:\n", " text_data = response.read().decode('utf-8')\n", " with open(file_path, \"w\", encoding=\"utf-8\") as file:\n", " file.write(text_data)\n", "else:\n", " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", - " text_data = file.read()" + " text_data = file.read()\n", + "\"\"\"" ] }, { diff --git a/appendix-E/01_main-chapter-code/appendix-E.ipynb b/appendix-E/01_main-chapter-code/appendix-E.ipynb index 9c0efd3..0750a17 100644 --- a/appendix-E/01_main-chapter-code/appendix-E.ipynb +++ b/appendix-E/01_main-chapter-code/appendix-E.ipynb @@ -190,7 +190,8 @@ } ], "source": [ - "import urllib\n", + "# import urllib\n", + "import requests\n", "from pathlib import Path\n", "import pandas as pd\n", "from previous_chapters import (\n", @@ -215,13 +216,20 @@ "extracted_path = \"sms_spam_collection\"\n", "data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n", "\n", + "\n", "try:\n", " download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n", - "except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n", + "except (requests.exceptions.RequestException, TimeoutError) as e:\n", " print(f\"Primary URL failed: {e}. Trying backup URL...\")\n", " url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n", " download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n", "\n", + "# The book originally used\n", + "# except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n", + "# in the code above.\n", + "# However, some VPN users reported issues with `urllib`, so the code was updated\n", + "# to use `requests` instead\n", + "\n", "df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n", "balanced_df = create_balanced_dataset(df)\n", "balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})\n", diff --git a/appendix-E/01_main-chapter-code/previous_chapters.py b/appendix-E/01_main-chapter-code/previous_chapters.py index 5bf8adc..248995b 100644 --- a/appendix-E/01_main-chapter-code/previous_chapters.py +++ b/appendix-E/01_main-chapter-code/previous_chapters.py @@ -9,12 +9,12 @@ import os from pathlib import Path -import urllib import zipfile import matplotlib.pyplot as plt import numpy as np import pandas as pd +import requests import tiktoken import torch import torch.nn as nn @@ -367,9 +367,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path): return # Downloading the file - with urllib.request.urlopen(url) as response: - with open(zip_path, "wb") as out_file: - out_file.write(response.read()) + response = requests.get(url, stream=True, timeout=60) + response.raise_for_status() + with open(zip_path, "wb") as out_file: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + out_file.write(chunk) # Unzipping the file with zipfile.ZipFile(zip_path, "r") as zip_ref: diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb index 6e7e154..272e4d7 100644 --- a/ch02/01_main-chapter-code/ch02.ipynb +++ b/ch02/01_main-chapter-code/ch02.ipynb @@ -163,6 +163,30 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", + "import requests\n", + "\n", + "if not os.path.exists(\"the-verdict.txt\"):\n", + " url = (\n", + " \"https://raw.githubusercontent.com/rasbt/\"\n", + " \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n", + " \"the-verdict.txt\"\n", + " )\n", + " file_path = \"the-verdict.txt\"\n", + "\n", + " response = requests.get(url, timeout=30)\n", + " response.raise_for_status()\n", + " with open(file_path, \"wb\") as f:\n", + " f.write(response.content)\n", + "\n", + "\n", + "# The book originally used the following code below\n", + "# However, urllib uses older protocol settings that\n", + "# can cause problems for some readers using a VPN.\n", + "# The `requests` version above is more robust\n", + "# in that regard.\n", + "\n", + "\"\"\"\n", "import os\n", "import urllib.request\n", "\n", @@ -171,7 +195,8 @@ " \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n", " \"the-verdict.txt\")\n", " file_path = \"the-verdict.txt\"\n", - " urllib.request.urlretrieve(url, file_path)" + " urllib.request.urlretrieve(url, file_path)\n", + "\"\"\"" ] }, { diff --git a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb index bb754b7..e9b07ec 100644 --- a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb +++ b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb @@ -823,7 +823,7 @@ ], "source": [ "import os\n", - "import urllib.request\n", + "import requests\n", "\n", "def download_file_if_absent(url, filename, search_dirs):\n", " for directory in search_dirs:\n", @@ -834,13 +834,19 @@ "\n", " target_path = os.path.join(search_dirs[0], filename)\n", " try:\n", - " with urllib.request.urlopen(url) as response, open(target_path, \"wb\") as out_file:\n", - " out_file.write(response.read())\n", + " response = requests.get(url, stream=True, timeout=60)\n", + " response.raise_for_status()\n", + " with open(target_path, \"wb\") as out_file:\n", + " for chunk in response.iter_content(chunk_size=8192):\n", + " if chunk:\n", + " out_file.write(chunk)\n", " print(f\"Downloaded {filename} to {target_path}\")\n", " except Exception as e:\n", " print(f\"Failed to download {filename}. Error: {e}\")\n", + "\n", " return target_path\n", "\n", + "\n", "verdict_path = download_file_if_absent(\n", " url=(\n", " \"https://raw.githubusercontent.com/rasbt/\"\n", diff --git a/ch05/01_main-chapter-code/ch05.ipynb b/ch05/01_main-chapter-code/ch05.ipynb index adbbcc2..22809c7 100644 --- a/ch05/01_main-chapter-code/ch05.ipynb +++ b/ch05/01_main-chapter-code/ch05.ipynb @@ -793,19 +793,43 @@ "outputs": [], "source": [ "import os\n", - "import urllib.request\n", + "import requests\n", "\n", "file_path = \"the-verdict.txt\"\n", "url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n", "\n", "if not os.path.exists(file_path):\n", - " with urllib.request.urlopen(url) as response:\n", - " text_data = response.read().decode('utf-8')\n", + " response = requests.get(url, timeout=30)\n", + " response.raise_for_status()\n", + " text_data = response.text\n", " with open(file_path, \"w\", encoding=\"utf-8\") as file:\n", " file.write(text_data)\n", "else:\n", " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", - " text_data = file.read()" + " text_data = file.read()\n", + "\n", + "\n", + "# The book originally used the following code below\n", + "# However, urllib uses older protocol settings that\n", + "# can cause problems for some readers using a VPN.\n", + "# The `requests` version above is more robust\n", + "# in that regard.\n", + "\n", + " \n", + "# import os\n", + "# import urllib.request\n", + "\n", + "# file_path = \"the-verdict.txt\"\n", + "# url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n", + "\n", + "# if not os.path.exists(file_path):\n", + "# with urllib.request.urlopen(url) as response:\n", + "# text_data = response.read().decode('utf-8')\n", + "# with open(file_path, \"w\", encoding=\"utf-8\") as file:\n", + "# file.write(text_data)\n", + "# else:\n", + "# with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", + "# text_data = file.read()" ] }, { diff --git a/ch05/01_main-chapter-code/exercise-solutions.ipynb b/ch05/01_main-chapter-code/exercise-solutions.ipynb index 8baad8a..8f1f0aa 100644 --- a/ch05/01_main-chapter-code/exercise-solutions.ipynb +++ b/ch05/01_main-chapter-code/exercise-solutions.ipynb @@ -491,7 +491,7 @@ "outputs": [], "source": [ "import os\n", - "import urllib.request\n", + "import requests\n", "from previous_chapters import create_dataloader_v1\n", "\n", "\n", @@ -499,6 +499,25 @@ "url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n", "\n", "if not os.path.exists(file_path):\n", + " response = requests.get(url, timeout=30)\n", + " response.raise_for_status()\n", + " text_data = response.text\n", + " with open(file_path, \"w\", encoding=\"utf-8\") as file:\n", + " file.write(text_data)\n", + "else:\n", + " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", + " text_data = file.read()\n", + "\n", + "# The book originally used the following code below\n", + "# However, urllib uses older protocol settings that\n", + "# can cause problems for some readers using a VPN.\n", + "# The `requests` version above is more robust\n", + "# in that regard.\n", + "\n", + "\"\"\"\n", + "import urllib.request\n", + "\n", + "if not os.path.exists(file_path):\n", " with urllib.request.urlopen(url) as response:\n", " text_data = response.read().decode('utf-8')\n", " with open(file_path, \"w\", encoding=\"utf-8\") as file:\n", @@ -506,6 +525,7 @@ "else:\n", " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", " text_data = file.read()\n", + "\"\"\"\n", "\n", "\n", "# Train/validation ratio\n", diff --git a/ch05/01_main-chapter-code/gpt_download.py b/ch05/01_main-chapter-code/gpt_download.py index 6e27a4f..51d9c4b 100644 --- a/ch05/01_main-chapter-code/gpt_download.py +++ b/ch05/01_main-chapter-code/gpt_download.py @@ -5,9 +5,8 @@ import os -import urllib.request -# import requests +import requests import json import numpy as np import tensorflow as tf @@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir): def download_file(url, destination, backup_url=None): def _attempt_download(download_url): - with urllib.request.urlopen(download_url) as response: - # Get the total file size from headers, defaulting to 0 if not present - file_size = int(response.headers.get("Content-Length", 0)) + response = requests.get(download_url, stream=True, timeout=60) + response.raise_for_status() - # Check if file exists and has the same size - if os.path.exists(destination): - file_size_local = os.path.getsize(destination) - if file_size == file_size_local: - print(f"File already exists and is up-to-date: {destination}") - return True # Indicate success without re-downloading + file_size = int(response.headers.get("Content-Length", 0)) - block_size = 1024 # 1 Kilobyte + # Check if file exists and has same size + if os.path.exists(destination): + file_size_local = os.path.getsize(destination) + if file_size and file_size == file_size_local: + print(f"File already exists and is up-to-date: {destination}") + return True - # Initialize the progress bar with total file size - progress_bar_description = os.path.basename(download_url) - with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: - with open(destination, "wb") as file: - while True: - chunk = response.read(block_size) - if not chunk: - break + block_size = 1024 # 1 KB + desc = os.path.basename(download_url) + with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar: + with open(destination, "wb") as file: + for chunk in response.iter_content(chunk_size=block_size): + if chunk: file.write(chunk) progress_bar.update(len(chunk)) - return True + return True try: if _attempt_download(url): return - except (urllib.error.HTTPError, urllib.error.URLError): + except requests.exceptions.RequestException: if backup_url is not None: print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}") try: if _attempt_download(backup_url): return - except urllib.error.HTTPError: + except requests.exceptions.RequestException: pass - # If we reach here, both attempts have failed error_message = ( f"Failed to download from both primary URL ({url})" f"{' and backup URL (' + backup_url + ')' if backup_url else ''}." diff --git a/ch05/01_main-chapter-code/gpt_generate.py b/ch05/01_main-chapter-code/gpt_generate.py index b68d170..3fdfd51 100644 --- a/ch05/01_main-chapter-code/gpt_generate.py +++ b/ch05/01_main-chapter-code/gpt_generate.py @@ -7,9 +7,8 @@ import argparse import json import numpy as np import os -import urllib.request -# import requests +import requests import tensorflow as tf import tiktoken import torch @@ -60,18 +59,18 @@ def download_and_load_gpt2(model_size, models_dir): return settings, params -""" def download_file(url, destination): - # Send a GET request to download the file in streaming mode - response = requests.get(url, stream=True) + # Send a GET request to download the file + response = requests.get(url, stream=True, timeout=60) + response.raise_for_status() # Get the total file size from headers, defaulting to 0 if not present - file_size = int(response.headers.get("content-length", 0)) + file_size = int(response.headers.get("Content-Length", 0)) # Check if file exists and has the same size if os.path.exists(destination): file_size_local = os.path.getsize(destination) - if file_size == file_size_local: + if file_size and file_size == file_size_local: print(f"File already exists and is up-to-date: {destination}") return @@ -79,43 +78,12 @@ def download_file(url, destination): block_size = 1024 # 1 Kilobyte # Initialize the progress bar with total file size - progress_bar_description = url.split("/")[-1] # Extract filename from URL + progress_bar_description = os.path.basename(url) with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: # Open the destination file in binary write mode with open(destination, "wb") as file: - # Iterate over the file data in chunks - for chunk in response.iter_content(block_size): - progress_bar.update(len(chunk)) # Update progress bar - file.write(chunk) # Write the chunk to the file -""" - - -def download_file(url, destination): - # Send a GET request to download the file - with urllib.request.urlopen(url) as response: - # Get the total file size from headers, defaulting to 0 if not present - file_size = int(response.headers.get("Content-Length", 0)) - - # Check if file exists and has the same size - if os.path.exists(destination): - file_size_local = os.path.getsize(destination) - if file_size == file_size_local: - print(f"File already exists and is up-to-date: {destination}") - return - - # Define the block size for reading the file - block_size = 1024 # 1 Kilobyte - - # Initialize the progress bar with total file size - progress_bar_description = os.path.basename(url) # Extract filename from URL - with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: - # Open the destination file in binary write mode - with open(destination, "wb") as file: - # Read the file in chunks and write to destination - while True: - chunk = response.read(block_size) - if not chunk: - break + for chunk in response.iter_content(chunk_size=block_size): + if chunk: file.write(chunk) progress_bar.update(len(chunk)) # Update progress bar diff --git a/ch05/01_main-chapter-code/gpt_train.py b/ch05/01_main-chapter-code/gpt_train.py index c39d979..3cc1085 100644 --- a/ch05/01_main-chapter-code/gpt_train.py +++ b/ch05/01_main-chapter-code/gpt_train.py @@ -5,8 +5,8 @@ import matplotlib.pyplot as plt import os +import requests import torch -import urllib.request import tiktoken @@ -141,14 +141,14 @@ def main(gpt_config, settings): url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt" if not os.path.exists(file_path): - with urllib.request.urlopen(url) as response: - text_data = response.read().decode('utf-8') + response = requests.get(url, timeout=30) + response.raise_for_status() + text_data = response.text with open(file_path, "w", encoding="utf-8") as file: file.write(text_data) else: with open(file_path, "r", encoding="utf-8") as file: text_data = file.read() - ############################## # Initialize model ############################## diff --git a/ch05/01_main-chapter-code/tests.py b/ch05/01_main-chapter-code/tests.py index 5a7d7c6..7fe7607 100644 --- a/ch05/01_main-chapter-code/tests.py +++ b/ch05/01_main-chapter-code/tests.py @@ -7,9 +7,7 @@ import pytest from gpt_train import main -import http.client -from urllib.parse import urlparse - +import requests @pytest.fixture def gpt_config(): @@ -43,23 +41,23 @@ def test_main(gpt_config, other_settings): def check_file_size(url, expected_size): - parsed_url = urlparse(url) - if parsed_url.scheme == "https": - conn = http.client.HTTPSConnection(parsed_url.netloc) - else: - conn = http.client.HTTPConnection(parsed_url.netloc) + try: + response = requests.head(url, allow_redirects=True, timeout=30) + if response.status_code != 200: + return False, f"{url} not accessible" - conn.request("HEAD", parsed_url.path) - response = conn.getresponse() - if response.status != 200: - return False, f"{url} not accessible" - size = response.getheader("Content-Length") - if size is None: - return False, "Content-Length header is missing" - size = int(size) - if size != expected_size: - return False, f"{url} file has expected size {expected_size}, but got {size}" - return True, f"{url} file size is correct" + size = response.headers.get("Content-Length") + if size is None: + return False, "Content-Length header is missing" + + size = int(size) + if size != expected_size: + return False, f"{url} file has expected size {expected_size}, but got {size}" + + return True, f"{url} file size is correct" + + except requests.exceptions.RequestException as e: + return False, f"Failed to access {url}: {e}" def test_model_files(): diff --git a/ch05/02_alternative_weight_loading/weight-loading-hf-safetensors.ipynb b/ch05/02_alternative_weight_loading/weight-loading-hf-safetensors.ipynb index 6a634e3..0508789 100644 --- a/ch05/02_alternative_weight_loading/weight-loading-hf-safetensors.ipynb +++ b/ch05/02_alternative_weight_loading/weight-loading-hf-safetensors.ipynb @@ -134,7 +134,7 @@ "outputs": [], "source": [ "import os\n", - "import urllib.request\n", + "import requests\n", "from safetensors.torch import load_file\n", "\n", "URL_DIR = {\n", @@ -149,7 +149,10 @@ "\n", "# Download file\n", "if not os.path.exists(output_file):\n", - " urllib.request.urlretrieve(url, output_file)\n", + " response = requests.get(url, timeout=30)\n", + " response.raise_for_status()\n", + " with open(output_file, \"wb\") as f:\n", + " f.write(response.content)\n", "\n", "# Load file\n", "state_dict = load_file(output_file)" diff --git a/ch05/02_alternative_weight_loading/weight-loading-pytorch.ipynb b/ch05/02_alternative_weight_loading/weight-loading-pytorch.ipynb index 7081b8f..fe80c1f 100644 --- a/ch05/02_alternative_weight_loading/weight-loading-pytorch.ipynb +++ b/ch05/02_alternative_weight_loading/weight-loading-pytorch.ipynb @@ -144,12 +144,15 @@ ], "source": [ "import os\n", - "import urllib.request\n", + "import requests\n", "\n", "url = f\"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}\"\n", "\n", "if not os.path.exists(file_name):\n", - " urllib.request.urlretrieve(url, file_name)\n", + " response = requests.get(url, timeout=60)\n", + " response.raise_for_status()\n", + " with open(file_name, \"wb\") as f:\n", + " f.write(response.content)\n", " print(f\"Downloaded to {file_name}\")" ] }, @@ -276,12 +279,15 @@ ], "source": [ "import os\n", - "import urllib.request\n", + "import requests\n", "\n", "url = f\"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}\"\n", "\n", "if not os.path.exists(file_name):\n", - " urllib.request.urlretrieve(url, file_name)\n", + " response = requests.get(url, timeout=60)\n", + " response.raise_for_status()\n", + " with open(file_name, \"wb\") as f:\n", + " f.write(response.content)\n", " print(f\"Downloaded to {file_name}\")" ] }, diff --git a/ch05/07_gpt_to_llama/README.md b/ch05/07_gpt_to_llama/README.md index 3fcb120..a332375 100644 --- a/ch05/07_gpt_to_llama/README.md +++ b/ch05/07_gpt_to_llama/README.md @@ -58,12 +58,17 @@ This automatically downloads the weight file based on the model choice above: ```python import os -import urllib.request +import requests url = f"https://huggingface.co/rasbt/llama-3.2-from-scratch/resolve/main/{MODEL_FILE}" if not os.path.exists(MODEL_FILE): - urllib.request.urlretrieve(url, MODEL_FILE) + response = requests.get(url, stream=True, timeout=60) + response.raise_for_status() + with open(MODEL_FILE, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) print(f"Downloaded to {MODEL_FILE}") ``` diff --git a/ch05/10_llm-training-speed/01_opt_single_gpu.py b/ch05/10_llm-training-speed/01_opt_single_gpu.py index 155b57e..b99c969 100644 --- a/ch05/10_llm-training-speed/01_opt_single_gpu.py +++ b/ch05/10_llm-training-speed/01_opt_single_gpu.py @@ -6,9 +6,9 @@ import os import time -import urllib.request import matplotlib.pyplot as plt +import requests import torch import torch.nn as nn from torch.utils.data import Dataset, DataLoader @@ -397,8 +397,9 @@ def main(gpt_config, settings): url = "https://www.gutenberg.org/cache/epub/145/pg145.txt" if not os.path.exists(file_path): - with urllib.request.urlopen(url) as response: - text_data = response.read().decode('utf-8') + response = requests.get(url, timeout=30) + response.raise_for_status() + text_data = response.text with open(file_path, "w", encoding="utf-8") as file: file.write(text_data) else: diff --git a/ch05/10_llm-training-speed/02_opt_multi_gpu_ddp.py b/ch05/10_llm-training-speed/02_opt_multi_gpu_ddp.py index 746cc7f..ec382a0 100644 --- a/ch05/10_llm-training-speed/02_opt_multi_gpu_ddp.py +++ b/ch05/10_llm-training-speed/02_opt_multi_gpu_ddp.py @@ -6,9 +6,9 @@ import os import time -import urllib.request import matplotlib.pyplot as plt +import requests import torch import torch.nn as nn from torch.utils.data import Dataset, DataLoader @@ -468,11 +468,11 @@ def main(gpt_config, settings, rank, world_size): # NEW: Only download 1 time if rank == 0: if not os.path.exists(file_path): - with urllib.request.urlopen(url) as response: - text_data = response.read().decode('utf-8') + response = requests.get(url, timeout=30) + response.raise_for_status() + text_data = response.text with open(file_path, "w", encoding="utf-8") as file: file.write(text_data) - # NEW: All processes wait until rank 0 is done, using the GPU index. torch.distributed.barrier(device_ids=[device.index]) diff --git a/ch06/01_main-chapter-code/ch06.ipynb b/ch06/01_main-chapter-code/ch06.ipynb index d7b723a..ee4090c 100644 --- a/ch06/01_main-chapter-code/ch06.ipynb +++ b/ch06/01_main-chapter-code/ch06.ipynb @@ -186,6 +186,56 @@ } ], "source": [ + "import requests\n", + "import zipfile\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n", + "zip_path = \"sms_spam_collection.zip\"\n", + "extracted_path = \"sms_spam_collection\"\n", + "data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n", + "\n", + "\n", + "def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):\n", + " if data_file_path.exists():\n", + " print(f\"{data_file_path} already exists. Skipping download and extraction.\")\n", + " return\n", + "\n", + " # Downloading the file\n", + " response = requests.get(url, stream=True, timeout=60)\n", + " response.raise_for_status()\n", + " with open(zip_path, \"wb\") as out_file:\n", + " for chunk in response.iter_content(chunk_size=8192):\n", + " if chunk:\n", + " out_file.write(chunk)\n", + "\n", + " # Unzipping the file\n", + " with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n", + " zip_ref.extractall(extracted_path)\n", + "\n", + " # Add .tsv file extension\n", + " original_file_path = Path(extracted_path) / \"SMSSpamCollection\"\n", + " os.rename(original_file_path, data_file_path)\n", + " print(f\"File downloaded and saved as {data_file_path}\")\n", + "\n", + "\n", + "try:\n", + " download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n", + "except (requests.exceptions.RequestException, TimeoutError) as e:\n", + " print(f\"Primary URL failed: {e}. Trying backup URL...\")\n", + " url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n", + " download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n", + "\n", + "\n", + "\n", + "# The book originally used the following code below\n", + "# However, urllib uses older protocol settings that\n", + "# can cause problems for some readers using a VPN.\n", + "# The `requests` version above is more robust\n", + "# in that regard.\n", + "\n", + "\"\"\"\n", "import urllib.request\n", "import zipfile\n", "import os\n", @@ -220,7 +270,8 @@ "except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n", " print(f\"Primary URL failed: {e}. Trying backup URL...\")\n", " url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n", - " download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) " + " download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n", + "\"\"\"" ] }, { diff --git a/ch06/01_main-chapter-code/gpt_class_finetune.py b/ch06/01_main-chapter-code/gpt_class_finetune.py index 239f374..523b85d 100644 --- a/ch06/01_main-chapter-code/gpt_class_finetune.py +++ b/ch06/01_main-chapter-code/gpt_class_finetune.py @@ -5,7 +5,7 @@ # This is a summary file containing the main takeaways from chapter 6. -import urllib.request +import requests import zipfile import os from pathlib import Path @@ -27,9 +27,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path): return # Downloading the file - with urllib.request.urlopen(url) as response: - with open(zip_path, "wb") as out_file: - out_file.write(response.read()) + response = requests.get(url, stream=True, timeout=60) + response.raise_for_status() + with open(zip_path, "wb") as out_file: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + out_file.write(chunk) # Unzipping the file with zipfile.ZipFile(zip_path, "r") as zip_ref: @@ -259,7 +262,7 @@ if __name__ == "__main__": try: download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) - except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e: + except (requests.exceptions.RequestException, TimeoutError) as e: print(f"Primary URL failed: {e}. Trying backup URL...") url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) diff --git a/ch06/02_bonus_additional-experiments/additional_experiments.py b/ch06/02_bonus_additional-experiments/additional_experiments.py index 72d0da7..87e660f 100644 --- a/ch06/02_bonus_additional-experiments/additional_experiments.py +++ b/ch06/02_bonus_additional-experiments/additional_experiments.py @@ -8,10 +8,10 @@ import math import os from pathlib import Path import time -import urllib.request import zipfile import pandas as pd +import requests import tiktoken import torch from torch.utils.data import DataLoader @@ -113,9 +113,12 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path): return # Downloading the file - with urllib.request.urlopen(url) as response: - with open(zip_path, "wb") as out_file: - out_file.write(response.read()) + response = requests.get(url, stream=True, timeout=60) + response.raise_for_status() + with open(zip_path, "wb") as out_file: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + out_file.write(chunk) # Unzipping the file with zipfile.ZipFile(zip_path, "r") as zip_ref: @@ -608,11 +611,11 @@ if __name__ == "__main__": base_path = Path(".") file_names = ["train.csv", "validation.csv", "test.csv"] all_exist = all((base_path / file_name).exists() for file_name in file_names) - + if not all_exist: try: download_and_unzip(url, zip_path, extract_to, new_file_path) - except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e: + except (requests.exceptions.RequestException, TimeoutError) as e: print(f"Primary URL failed: {e}. Trying backup URL...") backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" download_and_unzip(backup_url, zip_path, extract_to, new_file_path) diff --git a/ch06/03_bonus_imdb-classification/download_prepare_dataset.py b/ch06/03_bonus_imdb-classification/download_prepare_dataset.py index f5ab61c..ce82aaa 100644 --- a/ch06/03_bonus_imdb-classification/download_prepare_dataset.py +++ b/ch06/03_bonus_imdb-classification/download_prepare_dataset.py @@ -7,7 +7,7 @@ import os import sys import tarfile import time -import urllib.request +import requests import pandas as pd @@ -32,7 +32,15 @@ def download_and_extract_dataset(dataset_url, target_file, directory): if not os.path.exists(directory): if os.path.exists(target_file): os.remove(target_file) - urllib.request.urlretrieve(dataset_url, target_file, reporthook) + + response = requests.get(dataset_url, stream=True, timeout=60) + response.raise_for_status() + + with open(target_file, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + print("\nExtracting dataset ...") with tarfile.open(target_file, "r:gz") as tar: tar.extractall() diff --git a/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py b/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py index 72fbd1a..cf9d56c 100644 --- a/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py +++ b/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py @@ -7,7 +7,7 @@ import argparse import os from pathlib import Path import time -import urllib +import requests import zipfile import pandas as pd @@ -62,9 +62,12 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path): return # Downloading the file - with urllib.request.urlopen(url) as response: - with open(zip_path, "wb") as out_file: - out_file.write(response.read()) + response = requests.get(url, stream=True, timeout=60) + response.raise_for_status() + with open(zip_path, "wb") as out_file: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + out_file.write(chunk) # Unzipping the file with zipfile.ZipFile(zip_path, "r") as zip_ref: @@ -412,7 +415,7 @@ if __name__ == "__main__": if not all_exist: try: download_and_unzip(url, zip_path, extract_to, new_file_path) - except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e: + except (requests.exceptions.RequestException, TimeoutError) as e: print(f"Primary URL failed: {e}. Trying backup URL...") backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" download_and_unzip(backup_url, zip_path, extract_to, new_file_path) diff --git a/ch07/01_main-chapter-code/ch07.ipynb b/ch07/01_main-chapter-code/ch07.ipynb index 757b554..6fe9073 100644 --- a/ch07/01_main-chapter-code/ch07.ipynb +++ b/ch07/01_main-chapter-code/ch07.ipynb @@ -169,10 +169,33 @@ "source": [ "import json\n", "import os\n", - "import urllib\n", + "import requests\n", "\n", "\n", "def download_and_load_file(file_path, url):\n", + " if not os.path.exists(file_path):\n", + " response = requests.get(url, timeout=30)\n", + " response.raise_for_status()\n", + " text_data = response.text\n", + " with open(file_path, \"w\", encoding=\"utf-8\") as file:\n", + " file.write(text_data)\n", + "\n", + " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", + " data = json.load(file)\n", + "\n", + " return data\n", + "\n", + "\n", + "# The book originally used the following code below\n", + "# However, urllib uses older protocol settings that\n", + "# can cause problems for some readers using a VPN.\n", + "# The `requests` version above is more robust\n", + "# in that regard.\n", + "\n", + "\"\"\"\n", + "import urllib\n", + "\n", + "def download_and_load_file(file_path, url):\n", "\n", " if not os.path.exists(file_path):\n", " with urllib.request.urlopen(url) as response:\n", @@ -180,15 +203,15 @@ " with open(file_path, \"w\", encoding=\"utf-8\") as file:\n", " file.write(text_data)\n", "\n", - " # The book originally contained this unnecessary \"else\" clause:\n", - " #else:\n", - " # with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", - " # text_data = file.read()\n", + " else:\n", + " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", + " text_data = file.read()\n", "\n", " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", " data = json.load(file)\n", "\n", " return data\n", + "\"\"\"\n", "\n", "\n", "file_path = \"instruction-data.json\"\n", @@ -2490,7 +2513,8 @@ } ], "source": [ - "import urllib.request\n", + "import requests # noqa: F811\n", + "# import urllib.request\n", "\n", "def query_model(\n", " prompt,\n", @@ -2512,7 +2536,8 @@ " }\n", " }\n", "\n", - "\n", + " \n", + " \"\"\"\n", " # Convert the dictionary to a JSON formatted string and encode it to bytes\n", " payload = json.dumps(data).encode(\"utf-8\")\n", "\n", @@ -2536,6 +2561,26 @@ " response_data += response_json[\"message\"][\"content\"]\n", "\n", " return response_data\n", + " \"\"\"\n", + "\n", + " # The book originally used the commented-out above, which is based\n", + " # on urllib. It works generally fine, but some readers reported\n", + " # issues with using urlib when using a (company) VPN.\n", + " # The code below uses the requests library, which doesn't seem\n", + " # to have these issues.\n", + "\n", + " # Send the POST request\n", + " with requests.post(url, json=data, stream=True, timeout=30) as r:\n", + " r.raise_for_status()\n", + " response_data = \"\"\n", + " for line in r.iter_lines(decode_unicode=True):\n", + " if not line:\n", + " continue\n", + " response_json = json.loads(line)\n", + " if \"message\" in response_json:\n", + " response_data += response_json[\"message\"][\"content\"]\n", + "\n", + " return response_data\n", "\n", "\n", "model = \"llama3\"\n", diff --git a/ch07/01_main-chapter-code/exercise_experiments.py b/ch07/01_main-chapter-code/exercise_experiments.py index 88e44d6..773e1e2 100644 --- a/ch07/01_main-chapter-code/exercise_experiments.py +++ b/ch07/01_main-chapter-code/exercise_experiments.py @@ -12,10 +12,10 @@ import math import os import re import time -import urllib import matplotlib.pyplot as plt from matplotlib.ticker import MaxNLocator +import requests import tiktoken import torch from torch.utils.data import Dataset, DataLoader @@ -234,17 +234,17 @@ def custom_collate_with_masking_fn( def download_and_load_file(file_path, url): - if not os.path.exists(file_path): - with urllib.request.urlopen(url) as response: - text_data = response.read().decode("utf-8") + response = requests.get(url, timeout=30) + response.raise_for_status() + text_data = response.text with open(file_path, "w", encoding="utf-8") as file: file.write(text_data) else: with open(file_path, "r", encoding="utf-8") as file: text_data = file.read() - with open(file_path, "r") as file: + with open(file_path, "r", encoding="utf-8") as file: data = json.load(file) return data diff --git a/ch07/01_main-chapter-code/gpt_download.py b/ch07/01_main-chapter-code/gpt_download.py index 6e27a4f..b5ce887 100644 --- a/ch07/01_main-chapter-code/gpt_download.py +++ b/ch07/01_main-chapter-code/gpt_download.py @@ -5,11 +5,10 @@ import os -import urllib.request - -# import requests import json + import numpy as np +import requests import tensorflow as tf from tqdm import tqdm @@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir): def download_file(url, destination, backup_url=None): def _attempt_download(download_url): - with urllib.request.urlopen(download_url) as response: - # Get the total file size from headers, defaulting to 0 if not present - file_size = int(response.headers.get("Content-Length", 0)) + response = requests.get(download_url, stream=True, timeout=60) + response.raise_for_status() - # Check if file exists and has the same size - if os.path.exists(destination): - file_size_local = os.path.getsize(destination) - if file_size == file_size_local: - print(f"File already exists and is up-to-date: {destination}") - return True # Indicate success without re-downloading + file_size = int(response.headers.get("Content-Length", 0)) - block_size = 1024 # 1 Kilobyte + # Check if file exists and has same size + if os.path.exists(destination): + file_size_local = os.path.getsize(destination) + if file_size and file_size == file_size_local: + print(f"File already exists and is up-to-date: {destination}") + return True - # Initialize the progress bar with total file size - progress_bar_description = os.path.basename(download_url) - with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: - with open(destination, "wb") as file: - while True: - chunk = response.read(block_size) - if not chunk: - break + block_size = 1024 # 1 KB + desc = os.path.basename(download_url) + with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar: + with open(destination, "wb") as file: + for chunk in response.iter_content(chunk_size=block_size): + if chunk: file.write(chunk) progress_bar.update(len(chunk)) - return True + return True try: if _attempt_download(url): return - except (urllib.error.HTTPError, urllib.error.URLError): + except requests.exceptions.RequestException: if backup_url is not None: print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}") try: if _attempt_download(backup_url): return - except urllib.error.HTTPError: + except requests.exceptions.RequestException: pass - # If we reach here, both attempts have failed error_message = ( f"Failed to download from both primary URL ({url})" f"{' and backup URL (' + backup_url + ')' if backup_url else ''}." @@ -97,37 +92,6 @@ def download_file(url, destination, backup_url=None): print(f"An unexpected error occurred: {e}") -# Alternative way using `requests` -""" -def download_file(url, destination): - # Send a GET request to download the file in streaming mode - response = requests.get(url, stream=True) - - # Get the total file size from headers, defaulting to 0 if not present - file_size = int(response.headers.get("content-length", 0)) - - # Check if file exists and has the same size - if os.path.exists(destination): - file_size_local = os.path.getsize(destination) - if file_size == file_size_local: - print(f"File already exists and is up-to-date: {destination}") - return - - # Define the block size for reading the file - block_size = 1024 # 1 Kilobyte - - # Initialize the progress bar with total file size - progress_bar_description = url.split("/")[-1] # Extract filename from URL - with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: - # Open the destination file in binary write mode - with open(destination, "wb") as file: - # Iterate over the file data in chunks - for chunk in response.iter_content(block_size): - progress_bar.update(len(chunk)) # Update progress bar - file.write(chunk) # Write the chunk to the file -""" - - def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): # Initialize parameters dictionary with empty blocks for each layer params = {"blocks": [{} for _ in range(settings["n_layer"])]} diff --git a/ch07/01_main-chapter-code/gpt_instruction_finetuning.py b/ch07/01_main-chapter-code/gpt_instruction_finetuning.py index 541cd52..248bf6a 100644 --- a/ch07/01_main-chapter-code/gpt_instruction_finetuning.py +++ b/ch07/01_main-chapter-code/gpt_instruction_finetuning.py @@ -11,9 +11,9 @@ import json import os import re import time -import urllib import matplotlib.pyplot as plt +import requests import tiktoken import torch from torch.utils.data import Dataset, DataLoader @@ -97,14 +97,14 @@ def custom_collate_fn( def download_and_load_file(file_path, url): - if not os.path.exists(file_path): - with urllib.request.urlopen(url) as response: - text_data = response.read().decode("utf-8") + response = requests.get(url, timeout=30) + response.raise_for_status() + text_data = response.text with open(file_path, "w", encoding="utf-8") as file: file.write(text_data) - with open(file_path, "r") as file: + with open(file_path, "r", encoding="utf-8") as file: data = json.load(file) return data diff --git a/ch07/01_main-chapter-code/ollama_evaluate.py b/ch07/01_main-chapter-code/ollama_evaluate.py index e2e0204..a75f592 100644 --- a/ch07/01_main-chapter-code/ollama_evaluate.py +++ b/ch07/01_main-chapter-code/ollama_evaluate.py @@ -8,7 +8,7 @@ import json import psutil from tqdm import tqdm -import urllib.request +import requests def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"): @@ -25,23 +25,16 @@ def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"): } } - # Convert the dictionary to a JSON formatted string and encode it to bytes - payload = json.dumps(data).encode("utf-8") - - # Create a request object, setting the method to POST and adding necessary headers - request = urllib.request.Request(url, data=payload, method="POST") - request.add_header("Content-Type", "application/json") - - # Send the request and capture the response - response_data = "" - with urllib.request.urlopen(request) as response: - # Read and decode the response - while True: - line = response.readline().decode("utf-8") + # Send the POST request + with requests.post(url, json=data, stream=True, timeout=30) as r: + r.raise_for_status() + response_data = "" + for line in r.iter_lines(decode_unicode=True): if not line: - break + continue response_json = json.loads(line) - response_data += response_json["message"]["content"] + if "message" in response_json: + response_data += response_json["message"]["content"] return response_data diff --git a/ch07/03_model-evaluation/llm-instruction-eval-ollama.ipynb b/ch07/03_model-evaluation/llm-instruction-eval-ollama.ipynb index cc9673f..0f752a4 100644 --- a/ch07/03_model-evaluation/llm-instruction-eval-ollama.ipynb +++ b/ch07/03_model-evaluation/llm-instruction-eval-ollama.ipynb @@ -215,8 +215,8 @@ } ], "source": [ - "import urllib.request\n", "import json\n", + "import requests\n", "\n", "\n", "def query_model(prompt, model=\"llama3\", url=\"http://localhost:11434/api/chat\"):\n", @@ -236,27 +236,19 @@ " }\n", " }\n", "\n", - " # Convert the dictionary to a JSON formatted string and encode it to bytes\n", - " payload = json.dumps(data).encode(\"utf-8\")\n", - "\n", - " # Create a request object, setting the method to POST and adding necessary headers\n", - " request = urllib.request.Request(url, data=payload, method=\"POST\")\n", - " request.add_header(\"Content-Type\", \"application/json\")\n", - "\n", - " # Send the request and capture the response\n", - " response_data = \"\"\n", - " with urllib.request.urlopen(request) as response:\n", - " # Read and decode the response\n", - " while True:\n", - " line = response.readline().decode(\"utf-8\")\n", + " # Send the POST request\n", + " with requests.post(url, json=data, stream=True, timeout=30) as r:\n", + " r.raise_for_status()\n", + " response_data = \"\"\n", + " for line in r.iter_lines(decode_unicode=True):\n", " if not line:\n", - " break\n", + " continue\n", " response_json = json.loads(line)\n", - " response_data += response_json[\"message\"][\"content\"]\n", + " if \"message\" in response_json:\n", + " response_data += response_json[\"message\"][\"content\"]\n", "\n", " return response_data\n", "\n", - "\n", "result = query_model(\"What do Llamas eat?\")\n", "print(result)" ] @@ -640,7 +632,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.16" } }, "nbformat": 4, diff --git a/ch07/04_preference-tuning-with-dpo/create-preference-data-ollama.ipynb b/ch07/04_preference-tuning-with-dpo/create-preference-data-ollama.ipynb index 41c948e..4a2c598 100644 --- a/ch07/04_preference-tuning-with-dpo/create-preference-data-ollama.ipynb +++ b/ch07/04_preference-tuning-with-dpo/create-preference-data-ollama.ipynb @@ -274,8 +274,8 @@ } ], "source": [ - "import urllib.request\n", "import json\n", + "import requests\n", "\n", "\n", "def query_model(prompt, model=\"llama3.1:70b\", url=\"http://localhost:11434/api/chat\"):\n", @@ -294,23 +294,16 @@ " }\n", " }\n", "\n", - " # Convert the dictionary to a JSON formatted string and encode it to bytes\n", - " payload = json.dumps(data).encode(\"utf-8\")\n", - "\n", - " # Create a request object, setting the method to POST and adding necessary headers\n", - " request = urllib.request.Request(url, data=payload, method=\"POST\")\n", - " request.add_header(\"Content-Type\", \"application/json\")\n", - "\n", - " # Send the request and capture the response\n", - " response_data = \"\"\n", - " with urllib.request.urlopen(request) as response:\n", - " # Read and decode the response\n", - " while True:\n", - " line = response.readline().decode(\"utf-8\")\n", + " # Send the POST request\n", + " with requests.post(url, json=data, stream=True, timeout=30) as r:\n", + " r.raise_for_status()\n", + " response_data = \"\"\n", + " for line in r.iter_lines(decode_unicode=True):\n", " if not line:\n", - " break\n", + " continue\n", " response_json = json.loads(line)\n", - " response_data += response_json[\"message\"][\"content\"]\n", + " if \"message\" in response_json:\n", + " response_data += response_json[\"message\"][\"content\"]\n", "\n", " return response_data\n", "\n", @@ -587,7 +580,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.16" } }, "nbformat": 4, diff --git a/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb b/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb index 761fe67..8853104 100644 --- a/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb +++ b/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb @@ -231,23 +231,21 @@ "source": [ "import json\n", "import os\n", - "import urllib\n", + "import requests\n", "\n", "\n", "def download_and_load_file(file_path, url):\n", - "\n", " if not os.path.exists(file_path):\n", - " with urllib.request.urlopen(url) as response:\n", - " text_data = response.read().decode(\"utf-8\")\n", + " response = requests.get(url, timeout=30)\n", + " response.raise_for_status()\n", + " text_data = response.text\n", " with open(file_path, \"w\", encoding=\"utf-8\") as file:\n", " file.write(text_data)\n", " else:\n", " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", " text_data = file.read()\n", "\n", - " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", - " data = json.load(file)\n", - "\n", + " data = json.loads(text_data)\n", " return data\n", "\n", "\n", diff --git a/ch07/05_dataset-generation/llama3-ollama.ipynb b/ch07/05_dataset-generation/llama3-ollama.ipynb index 7fa95ff..bc24774 100644 --- a/ch07/05_dataset-generation/llama3-ollama.ipynb +++ b/ch07/05_dataset-generation/llama3-ollama.ipynb @@ -194,8 +194,8 @@ "metadata": {}, "outputs": [], "source": [ - "import urllib.request\n", "import json\n", + "import requests\n", "\n", "def query_model(prompt, model=\"llama3\", url=\"http://localhost:11434/api/chat\", role=\"user\"):\n", " # Create the data payload as a dictionary\n", @@ -209,25 +209,21 @@ " ]\n", " }\n", "\n", - " # Convert the dictionary to a JSON formatted string and encode it to bytes\n", - " payload = json.dumps(data).encode(\"utf-8\")\n", - "\n", - " # Create a request object, setting the method to POST and adding necessary headers\n", - " request = urllib.request.Request(url, data=payload, method=\"POST\")\n", - " request.add_header(\"Content-Type\", \"application/json\")\n", - "\n", - " # Send the request and capture the response\n", - " response_data = \"\"\n", - " with urllib.request.urlopen(request) as response:\n", - " # Read and decode the response\n", - " while True:\n", - " line = response.readline().decode(\"utf-8\")\n", + " # Send the POST request\n", + " with requests.post(url, json=data, stream=True, timeout=30) as r:\n", + " r.raise_for_status()\n", + " response_data = \"\"\n", + " for line in r.iter_lines(decode_unicode=True):\n", " if not line:\n", - " break\n", + " continue\n", " response_json = json.loads(line)\n", - " response_data += response_json[\"message\"][\"content\"]\n", + " if \"message\" in response_json:\n", + " response_data += response_json[\"message\"][\"content\"]\n", "\n", - " return response_data" + " return response_data\n", + "\n", + "result = query_model(\"What do Llamas eat?\")\n", + "print(result)" ] }, { @@ -498,7 +494,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.16" } }, "nbformat": 4, diff --git a/pkg/llms_from_scratch/ch05.py b/pkg/llms_from_scratch/ch05.py index 315e050..afca998 100644 --- a/pkg/llms_from_scratch/ch05.py +++ b/pkg/llms_from_scratch/ch05.py @@ -7,11 +7,11 @@ from .ch04 import generate_text_simple import json import os -import urllib.request import numpy as np import matplotlib.pyplot as plt from matplotlib.ticker import MaxNLocator +import requests import torch from tqdm import tqdm @@ -279,44 +279,40 @@ def download_and_load_gpt2(model_size, models_dir): def download_file(url, destination, backup_url=None): def _attempt_download(download_url): - with urllib.request.urlopen(download_url) as response: - # Get the total file size from headers, defaulting to 0 if not present - file_size = int(response.headers.get("Content-Length", 0)) + response = requests.get(download_url, stream=True, timeout=60) + response.raise_for_status() - # Check if file exists and has the same size - if os.path.exists(destination): - file_size_local = os.path.getsize(destination) - if file_size == file_size_local: - print(f"File already exists and is up-to-date: {destination}") - return True # Indicate success without re-downloading + file_size = int(response.headers.get("Content-Length", 0)) - block_size = 1024 # 1 Kilobyte + # Check if file exists and has same size + if os.path.exists(destination): + file_size_local = os.path.getsize(destination) + if file_size and file_size == file_size_local: + print(f"File already exists and is up-to-date: {destination}") + return True - # Initialize the progress bar with total file size - progress_bar_description = os.path.basename(download_url) - with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: - with open(destination, "wb") as file: - while True: - chunk = response.read(block_size) - if not chunk: - break + block_size = 1024 # 1 KB + desc = os.path.basename(download_url) + with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar: + with open(destination, "wb") as file: + for chunk in response.iter_content(chunk_size=block_size): + if chunk: file.write(chunk) progress_bar.update(len(chunk)) - return True + return True try: if _attempt_download(url): return - except (urllib.error.HTTPError, urllib.error.URLError): + except requests.exceptions.RequestException: if backup_url is not None: print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}") try: if _attempt_download(backup_url): return - except urllib.error.HTTPError: + except requests.exceptions.RequestException: pass - # If we reach here, both attempts have failed error_message = ( f"Failed to download from both primary URL ({url})" f"{' and backup URL (' + backup_url + ')' if backup_url else ''}." diff --git a/pkg/llms_from_scratch/ch06.py b/pkg/llms_from_scratch/ch06.py index 281017d..8537f9f 100644 --- a/pkg/llms_from_scratch/ch06.py +++ b/pkg/llms_from_scratch/ch06.py @@ -4,11 +4,11 @@ # Code: https://github.com/rasbt/LLMs-from-scratch -import urllib.request import zipfile import os from pathlib import Path +import requests import matplotlib.pyplot as plt from torch.utils.data import Dataset import torch @@ -21,9 +21,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path): return # Downloading the file - with urllib.request.urlopen(url) as response: - with open(zip_path, "wb") as out_file: - out_file.write(response.read()) + response = requests.get(url, stream=True, timeout=60) + response.raise_for_status() + with open(zip_path, "wb") as out_file: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + out_file.write(chunk) # Unzipping the file with zipfile.ZipFile(zip_path, "r") as zip_ref: diff --git a/pkg/llms_from_scratch/ch07.py b/pkg/llms_from_scratch/ch07.py index 3d50572..b155b4d 100644 --- a/pkg/llms_from_scratch/ch07.py +++ b/pkg/llms_from_scratch/ch07.py @@ -6,7 +6,7 @@ import json import os import psutil -import urllib +import requests import torch from tqdm import tqdm @@ -14,24 +14,46 @@ from torch.utils.data import Dataset def download_and_load_file(file_path, url): - if not os.path.exists(file_path): - with urllib.request.urlopen(url) as response: - text_data = response.read().decode("utf-8") + response = requests.get(url, timeout=30) + response.raise_for_status() + text_data = response.text with open(file_path, "w", encoding="utf-8") as file: file.write(text_data) - # The book originally contained this unnecessary "else" clause: - # else: - # with open(file_path, "r", encoding="utf-8") as file: - # text_data = file.read() - with open(file_path, "r", encoding="utf-8") as file: data = json.load(file) return data +# The book originally used the following code below +# However, urllib uses older protocol settings that +# can cause problems for some readers using a VPN. +# The `requests` version above is more robust +# in that regard. + + +# import urllib + +# def download_and_load_file(file_path, url): + +# if not os.path.exists(file_path): +# with urllib.request.urlopen(url) as response: +# text_data = response.read().decode("utf-8") +# with open(file_path, "w", encoding="utf-8") as file: +# file.write(text_data) + +# else: +# with open(file_path, "r", encoding="utf-8") as file: +# text_data = file.read() + +# with open(file_path, "r", encoding="utf-8") as file: +# data = json.load(file) + +# return data + + def format_input(entry): instruction_text = ( f"Below is an instruction that describes a task. " @@ -202,27 +224,16 @@ def query_model( } } - # Convert the dictionary to a JSON formatted string and encode it to bytes - payload = json.dumps(data).encode("utf-8") - - # Create a request object, setting the method to POST and adding necessary headers - request = urllib.request.Request( - url, - data=payload, - method="POST" - ) - request.add_header("Content-Type", "application/json") - - # Send the request and capture the response - response_data = "" - with urllib.request.urlopen(request) as response: - # Read and decode the response - while True: - line = response.readline().decode("utf-8") + # Send the POST request + with requests.post(url, json=data, stream=True, timeout=30) as r: + r.raise_for_status() + response_data = "" + for line in r.iter_lines(decode_unicode=True): if not line: - break + continue response_json = json.loads(line) - response_data += response_json["message"]["content"] + if "message" in response_json: + response_data += response_json["message"]["content"] return response_data diff --git a/pkg/llms_from_scratch/qwen3.py b/pkg/llms_from_scratch/qwen3.py index 214b47a..e68649d 100644 --- a/pkg/llms_from_scratch/qwen3.py +++ b/pkg/llms_from_scratch/qwen3.py @@ -6,9 +6,9 @@ import os import json import re -import urllib.request from pathlib import Path +import requests import torch import torch.nn as nn @@ -660,7 +660,12 @@ def download_from_huggingface(repo_id, filename, local_dir, revision="main"): print(f"File already exists: {dest_path}") else: print(f"Downloading {url} to {dest_path}...") - urllib.request.urlretrieve(url, dest_path) + response = requests.get(url, stream=True, timeout=60) + response.raise_for_status() + with open(dest_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) return dest_path diff --git a/pkg/llms_from_scratch/tests/test_appendix_e.py b/pkg/llms_from_scratch/tests/test_appendix_e.py index 72b1cf5..761bc19 100644 --- a/pkg/llms_from_scratch/tests/test_appendix_e.py +++ b/pkg/llms_from_scratch/tests/test_appendix_e.py @@ -12,9 +12,9 @@ from llms_from_scratch.ch06 import ( from llms_from_scratch.appendix_e import replace_linear_with_lora from pathlib import Path -import urllib import pandas as pd +import requests import tiktoken import torch from torch.utils.data import DataLoader, Subset @@ -35,7 +35,7 @@ def test_train_classifier_lora(tmp_path): download_and_unzip_spam_data( url, zip_path, extracted_path, data_file_path ) - except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e: + except (requests.exceptions.RequestException, TimeoutError) as e: print(f"Primary URL failed: {e}. Trying backup URL...") backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" download_and_unzip_spam_data( diff --git a/pkg/llms_from_scratch/tests/test_ch02.py b/pkg/llms_from_scratch/tests/test_ch02.py index 11d8a52..2b4e229 100644 --- a/pkg/llms_from_scratch/tests/test_ch02.py +++ b/pkg/llms_from_scratch/tests/test_ch02.py @@ -6,8 +6,8 @@ from llms_from_scratch.ch02 import create_dataloader_v1 import os -import urllib.request +import requests import pytest import torch @@ -16,11 +16,17 @@ import torch def test_dataloader(tmp_path, file_name): if not os.path.exists("the-verdict.txt"): - url = ("https://raw.githubusercontent.com/rasbt/" - "LLMs-from-scratch/main/ch02/01_main-chapter-code/" - "the-verdict.txt") + url = ( + "https://raw.githubusercontent.com/rasbt/" + "LLMs-from-scratch/main/ch02/01_main-chapter-code/" + "the-verdict.txt" + ) file_path = "the-verdict.txt" - urllib.request.urlretrieve(url, file_path) + + response = requests.get(url, timeout=30) + response.raise_for_status() + with open(file_path, "wb") as f: + f.write(response.content) with open("the-verdict.txt", "r", encoding="utf-8") as f: raw_text = f.read() diff --git a/pkg/llms_from_scratch/tests/test_ch05.py b/pkg/llms_from_scratch/tests/test_ch05.py index 3a9778a..366f347 100644 --- a/pkg/llms_from_scratch/tests/test_ch05.py +++ b/pkg/llms_from_scratch/tests/test_ch05.py @@ -8,8 +8,8 @@ from llms_from_scratch.ch04 import GPTModel, GPTModelFast from llms_from_scratch.ch05 import train_model_simple import os -import urllib +import requests import pytest import tiktoken import torch @@ -46,8 +46,9 @@ def test_train_simple(tmp_path, ModelClass): url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt" if not os.path.exists(file_path): - with urllib.request.urlopen(url) as response: - text_data = response.read().decode("utf-8") + response = requests.get(url, timeout=30) + response.raise_for_status() + text_data = response.text with open(file_path, "w", encoding="utf-8") as f: f.write(text_data) else: diff --git a/pkg/llms_from_scratch/tests/test_ch06.py b/pkg/llms_from_scratch/tests/test_ch06.py index f2e3249..29bbdd3 100644 --- a/pkg/llms_from_scratch/tests/test_ch06.py +++ b/pkg/llms_from_scratch/tests/test_ch06.py @@ -11,8 +11,8 @@ from llms_from_scratch.ch06 import ( ) from pathlib import Path -import urllib +import requests import pandas as pd import tiktoken import torch @@ -34,7 +34,7 @@ def test_train_classifier(tmp_path): download_and_unzip_spam_data( url, zip_path, extracted_path, data_file_path ) - except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e: + except (requests.exceptions.RequestException, TimeoutError) as e: print(f"Primary URL failed: {e}. Trying backup URL...") backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" download_and_unzip_spam_data( diff --git a/pkg/llms_from_scratch/utils.py b/pkg/llms_from_scratch/utils.py index 174f83a..84f98ed 100644 --- a/pkg/llms_from_scratch/utils.py +++ b/pkg/llms_from_scratch/utils.py @@ -9,10 +9,9 @@ import ast import re import types from pathlib import Path -import urllib.request -import urllib.parse import nbformat +import requests def _extract_imports(src: str): @@ -125,21 +124,24 @@ def import_definitions_from_notebook(nb_dir_or_path, notebook_name=None, *, extr exec(src, mod.__dict__) return mod + def download_file(url, out_dir="."): """Simple file download utility for tests.""" - from pathlib import Path out_dir = Path(out_dir) out_dir.mkdir(parents=True, exist_ok=True) - filename = Path(urllib.parse.urlparse(url).path).name + filename = Path(url).name dest = out_dir / filename - + if dest.exists(): return dest - + try: - with urllib.request.urlopen(url) as response: - with open(dest, 'wb') as f: - f.write(response.read()) + response = requests.get(url, stream=True, timeout=30) + response.raise_for_status() + with open(dest, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) return dest except Exception as e: raise RuntimeError(f"Failed to download {url}: {e}")