Switch from urllib to requests to improve reliability (#867)

Repository: https://github.com/rasbt/LLMs-from-scratch
Commit 7bd263144e (parent 8552565bda), committed via GitHub

* Switch from urllib to requests to improve reliability
* Keep ruff linter-specific
* update
* update
* update
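The same two patterns recur in every file this commit touches: small text files are fetched with a plain `requests.get` plus an explicit `timeout` and a `raise_for_status()` check, while large binaries (model weights, zip archives) additionally pass `stream=True` and are written out in chunks. A minimal sketch of both patterns, assuming only the `requests` package; the helper names `fetch_text` and `fetch_binary` are illustrative and do not appear in the commit:

```python
import os

import requests


def fetch_text(url, file_path, timeout=30):
    # Small text files: download in one shot, cache locally, return the text.
    if not os.path.exists(file_path):
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # surface 4xx/5xx instead of caching an error page
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(response.text)
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()


def fetch_binary(url, destination, timeout=60):
    # Large files: stream the body in 8 KB chunks so it never has to fit in memory.
    response = requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()
    with open(destination, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)
```

The explicit `timeout` keeps a stalled connection from hanging a CI job, and `raise_for_status()` turns HTTP errors into exceptions rather than silently writing an error page to disk.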
.github/workflows/basic-tests-latest-python.yml (vendored, 14 changes)

@@ -38,14 +38,14 @@ jobs:
       - name: Test Selected Python Scripts
         run: |
           source .venv/bin/activate
-          pytest --ruff setup/02_installing-python-libraries/tests.py
-          pytest --ruff ch04/01_main-chapter-code/tests.py
-          pytest --ruff ch05/01_main-chapter-code/tests.py
-          pytest --ruff ch06/01_main-chapter-code/tests.py
+          pytest setup/02_installing-python-libraries/tests.py
+          pytest ch04/01_main-chapter-code/tests.py
+          pytest ch05/01_main-chapter-code/tests.py
+          pytest ch06/01_main-chapter-code/tests.py

       - name: Validate Selected Jupyter Notebooks
         run: |
           source .venv/bin/activate
-          pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
-          pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
-          pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
+          pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
+          pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
+          pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

.github/workflows/basic-tests-linux-uv.yml (vendored, 26 changes)

@@ -47,24 +47,24 @@ jobs:
         shell: bash
         run: |
           source .venv/bin/activate
-          pytest --ruff setup/02_installing-python-libraries/tests.py
-          pytest --ruff ch04/01_main-chapter-code/tests.py
-          pytest --ruff ch04/03_kv-cache/tests.py
-          pytest --ruff ch05/01_main-chapter-code/tests.py
-          pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
-          pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py
-          pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py
-          pytest --ruff ch05/12_gemma3/tests/test_gemma3_nb.py
-          pytest --ruff ch05/12_gemma3/tests/test_gemma3_kv_nb.py
-          pytest --ruff ch06/01_main-chapter-code/tests.py
+          pytest setup/02_installing-python-libraries/tests.py
+          pytest ch04/01_main-chapter-code/tests.py
+          pytest ch04/03_kv-cache/tests.py
+          pytest ch05/01_main-chapter-code/tests.py
+          pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
+          pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py
+          pytest ch05/11_qwen3/tests/test_qwen3_nb.py
+          pytest ch05/12_gemma3/tests/test_gemma3_nb.py
+          pytest ch05/12_gemma3/tests/test_gemma3_kv_nb.py
+          pytest ch06/01_main-chapter-code/tests.py

       - name: Validate Selected Jupyter Notebooks (uv)
         shell: bash
         run: |
           source .venv/bin/activate
-          pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
-          pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
-          pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
+          pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
+          pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
+          pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

       - name: Test Selected Bonus Materials
         shell: bash

.github/workflows/basic-tests-macos-uv.yml (vendored, 24 changes)

@@ -47,20 +47,20 @@ jobs:
         shell: bash
         run: |
           source .venv/bin/activate
-          pytest --ruff setup/02_installing-python-libraries/tests.py
-          pytest --ruff ch04/01_main-chapter-code/tests.py
-          pytest --ruff ch05/01_main-chapter-code/tests.py
-          pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
-          pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py
-          pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py
-          pytest --ruff ch05/12_gemma3/tests/test_gemma3_nb.py
-          pytest --ruff ch05/12_gemma3/tests/test_gemma3_kv_nb.py
-          pytest --ruff ch06/01_main-chapter-code/tests.py
+          pytest setup/02_installing-python-libraries/tests.py
+          pytest ch04/01_main-chapter-code/tests.py
+          pytest ch05/01_main-chapter-code/tests.py
+          pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
+          pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py
+          pytest ch05/11_qwen3/tests/test_qwen3_nb.py
+          pytest ch05/12_gemma3/tests/test_gemma3_nb.py
+          pytest ch05/12_gemma3/tests/test_gemma3_kv_nb.py
+          pytest ch06/01_main-chapter-code/tests.py

       - name: Validate Selected Jupyter Notebooks (uv)
         shell: bash
         run: |
           source .venv/bin/activate
-          pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
-          pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
-          pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
+          pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
+          pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
+          pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

.github/workflows/basic-tests-old-pytorch.yml (vendored, 14 changes)

@@ -43,14 +43,14 @@ jobs:
       - name: Test Selected Python Scripts
         run: |
           source .venv/bin/activate
-          pytest --ruff setup/02_installing-python-libraries/tests.py
-          pytest --ruff ch04/01_main-chapter-code/tests.py
-          pytest --ruff ch05/01_main-chapter-code/tests.py
-          pytest --ruff ch06/01_main-chapter-code/tests.py
+          pytest setup/02_installing-python-libraries/tests.py
+          pytest ch04/01_main-chapter-code/tests.py
+          pytest ch05/01_main-chapter-code/tests.py
+          pytest ch06/01_main-chapter-code/tests.py

       - name: Validate Selected Jupyter Notebooks
         run: |
           source .venv/bin/activate
-          pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
-          pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
-          pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
+          pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
+          pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
+          pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

.github/workflows/basic-tests-pip.yml (vendored, 14 changes)

@@ -46,14 +46,14 @@ jobs:
       - name: Test Selected Python Scripts
         run: |
           source .venv/bin/activate
-          pytest --ruff setup/02_installing-python-libraries/tests.py
-          pytest --ruff ch04/01_main-chapter-code/tests.py
-          pytest --ruff ch05/01_main-chapter-code/tests.py
-          pytest --ruff ch06/01_main-chapter-code/tests.py
+          pytest setup/02_installing-python-libraries/tests.py
+          pytest ch04/01_main-chapter-code/tests.py
+          pytest ch05/01_main-chapter-code/tests.py
+          pytest ch06/01_main-chapter-code/tests.py

       - name: Validate Selected Jupyter Notebooks
         run: |
           source .venv/bin/activate
-          pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
-          pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
-          pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
+          pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
+          pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
+          pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

.github/workflows/basic-tests-pixi.yml (vendored, 14 changes)

@@ -47,14 +47,14 @@ jobs:
       - name: Test Selected Python Scripts
         shell: pixi run --environment tests bash -e {0}
         run: |
-          pytest --ruff setup/02_installing-python-libraries/tests.py
-          pytest --ruff ch04/01_main-chapter-code/tests.py
-          pytest --ruff ch05/01_main-chapter-code/tests.py
-          pytest --ruff ch06/01_main-chapter-code/tests.py
+          pytest setup/02_installing-python-libraries/tests.py
+          pytest ch04/01_main-chapter-code/tests.py
+          pytest ch05/01_main-chapter-code/tests.py
+          pytest ch06/01_main-chapter-code/tests.py

       - name: Validate Selected Jupyter Notebooks
         shell: pixi run --environment tests bash -e {0}
         run: |
-          pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
-          pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
-          pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
+          pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
+          pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
+          pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

.github/workflows/basic-tests-pytorch-rc.yml (vendored, 14 changes)

@@ -39,14 +39,14 @@ jobs:
       - name: Test Selected Python Scripts
         run: |
           source .venv/bin/activate
-          pytest --ruff setup/02_installing-python-libraries/tests.py
-          pytest --ruff ch04/01_main-chapter-code/tests.py
-          pytest --ruff ch05/01_main-chapter-code/tests.py
-          pytest --ruff ch06/01_main-chapter-code/tests.py
+          pytest setup/02_installing-python-libraries/tests.py
+          pytest ch04/01_main-chapter-code/tests.py
+          pytest ch05/01_main-chapter-code/tests.py
+          pytest ch06/01_main-chapter-code/tests.py

       - name: Validate Selected Jupyter Notebooks
         run: |
           source .venv/bin/activate
-          pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
-          pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
-          pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
+          pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
+          pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
+          pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

.github/workflows/basic-tests-windows-uv-pip.yml (vendored, 20 changes)

@@ -49,18 +49,18 @@ jobs:
         shell: bash
         run: |
           source .venv/Scripts/activate
-          pytest --ruff setup/02_installing-python-libraries/tests.py
-          pytest --ruff ch04/01_main-chapter-code/tests.py
-          pytest --ruff ch05/01_main-chapter-code/tests.py
-          pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
-          pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py
-          pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py
-          pytest --ruff ch06/01_main-chapter-code/tests.py
+          pytest setup/02_installing-python-libraries/tests.py
+          pytest ch04/01_main-chapter-code/tests.py
+          pytest ch05/01_main-chapter-code/tests.py
+          pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
+          pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py
+          pytest ch05/11_qwen3/tests/test_qwen3_nb.py
+          pytest ch06/01_main-chapter-code/tests.py

       - name: Run Jupyter Notebook Tests
         shell: bash
         run: |
           source .venv/Scripts/activate
-          pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
-          pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
-          pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
+          pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
+          pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
+          pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

@@ -121,19 +121,40 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "import urllib.request\n",
+    "import requests\n",
     "\n",
     "file_path = \"the-verdict.txt\"\n",
     "url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
     "\n",
     "if not os.path.exists(file_path):\n",
+    "    response = requests.get(url, timeout=30)\n",
+    "    response.raise_for_status()\n",
+    "    text_data = response.text\n",
+    "    with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
+    "        file.write(text_data)\n",
+    "else:\n",
+    "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+    "        text_data = file.read()\n",
+    "\n",
+    "# The book originally used the following code below\n",
+    "# However, urllib uses older protocol settings that\n",
+    "# can cause problems for some readers using a VPN.\n",
+    "# The `requests` version above is more robust\n",
+    "# in that regard.\n",
+    "\n",
+    "\"\"\"\n",
+    "import os\n",
+    "import urllib.request\n",
+    "\n",
+    "if not os.path.exists(file_path):\n",
     "    with urllib.request.urlopen(url) as response:\n",
     "        text_data = response.read().decode('utf-8')\n",
     "    with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
     "        file.write(text_data)\n",
     "else:\n",
     "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
-    "        text_data = file.read()"
+    "        text_data = file.read()\n",
+    "\"\"\""
    ]
   },
   {

@@ -190,7 +190,8 @@
     }
    ],
    "source": [
-    "import urllib\n",
+    "# import urllib\n",
+    "import requests\n",
     "from pathlib import Path\n",
     "import pandas as pd\n",
     "from previous_chapters import (\n",
@@ -215,13 +216,20 @@
     "extracted_path = \"sms_spam_collection\"\n",
     "data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
     "\n",
+    "\n",
     "try:\n",
     "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
-    "except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
+    "except (requests.exceptions.RequestException, TimeoutError) as e:\n",
     "    print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
     "    url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
     "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
     "\n",
+    "# The book originally used\n",
+    "# except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
+    "# in the code above.\n",
+    "# However, some VPN users reported issues with `urllib`, so the code was updated\n",
+    "# to use `requests` instead\n",
+    "\n",
     "df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
     "balanced_df = create_balanced_dataset(df)\n",
     "balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})\n",
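Several of the scripts and notebooks in this commit wrap their download in a primary/backup-URL fallback, catching `requests.exceptions.RequestException` where the old code caught the `urllib` error classes. A sketch of that shape; the `download_with_fallback` helper is illustrative, since the actual files inline this logic:

```python
import requests


def download_with_fallback(download, primary_url, backup_url):
    # `download` is any callable that raises requests.exceptions.RequestException
    # (or TimeoutError) when the given URL fails.
    try:
        download(primary_url)
    except (requests.exceptions.RequestException, TimeoutError) as e:
        print(f"Primary URL failed: {e}. Trying backup URL...")
        download(backup_url)
```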

@@ -9,12 +9,12 @@

 import os
 from pathlib import Path
-import urllib
 import zipfile

 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import requests
 import tiktoken
 import torch
 import torch.nn as nn
@@ -367,9 +367,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
         return

     # Downloading the file
-    with urllib.request.urlopen(url) as response:
-        with open(zip_path, "wb") as out_file:
-            out_file.write(response.read())
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()
+    with open(zip_path, "wb") as out_file:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                out_file.write(chunk)

     # Unzipping the file
     with zipfile.ZipFile(zip_path, "r") as zip_ref:

@@ -163,6 +163,30 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import os\n",
+    "import requests\n",
+    "\n",
+    "if not os.path.exists(\"the-verdict.txt\"):\n",
+    "    url = (\n",
+    "        \"https://raw.githubusercontent.com/rasbt/\"\n",
+    "        \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
+    "        \"the-verdict.txt\"\n",
+    "    )\n",
+    "    file_path = \"the-verdict.txt\"\n",
+    "\n",
+    "    response = requests.get(url, timeout=30)\n",
+    "    response.raise_for_status()\n",
+    "    with open(file_path, \"wb\") as f:\n",
+    "        f.write(response.content)\n",
+    "\n",
+    "\n",
+    "# The book originally used the following code below\n",
+    "# However, urllib uses older protocol settings that\n",
+    "# can cause problems for some readers using a VPN.\n",
+    "# The `requests` version above is more robust\n",
+    "# in that regard.\n",
+    "\n",
+    "\"\"\"\n",
     "import os\n",
     "import urllib.request\n",
     "\n",
@@ -171,7 +195,8 @@
     "        \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
     "        \"the-verdict.txt\")\n",
     "    file_path = \"the-verdict.txt\"\n",
-    "    urllib.request.urlretrieve(url, file_path)"
+    "    urllib.request.urlretrieve(url, file_path)\n",
+    "\"\"\""
    ]
   },
   {

@@ -823,7 +823,7 @@
    ],
    "source": [
     "import os\n",
-    "import urllib.request\n",
+    "import requests\n",
     "\n",
     "def download_file_if_absent(url, filename, search_dirs):\n",
     "    for directory in search_dirs:\n",
@@ -834,13 +834,19 @@
     "\n",
     "    target_path = os.path.join(search_dirs[0], filename)\n",
     "    try:\n",
-    "        with urllib.request.urlopen(url) as response, open(target_path, \"wb\") as out_file:\n",
-    "            out_file.write(response.read())\n",
+    "        response = requests.get(url, stream=True, timeout=60)\n",
+    "        response.raise_for_status()\n",
+    "        with open(target_path, \"wb\") as out_file:\n",
+    "            for chunk in response.iter_content(chunk_size=8192):\n",
+    "                if chunk:\n",
+    "                    out_file.write(chunk)\n",
     "        print(f\"Downloaded {filename} to {target_path}\")\n",
     "    except Exception as e:\n",
     "        print(f\"Failed to download {filename}. Error: {e}\")\n",
+    "\n",
     "    return target_path\n",
     "\n",
+    "\n",
     "verdict_path = download_file_if_absent(\n",
     "    url=(\n",
     "        \"https://raw.githubusercontent.com/rasbt/\"\n",

@@ -793,19 +793,43 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "import urllib.request\n",
+    "import requests\n",
     "\n",
     "file_path = \"the-verdict.txt\"\n",
     "url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
     "\n",
     "if not os.path.exists(file_path):\n",
-    "    with urllib.request.urlopen(url) as response:\n",
-    "        text_data = response.read().decode('utf-8')\n",
+    "    response = requests.get(url, timeout=30)\n",
+    "    response.raise_for_status()\n",
+    "    text_data = response.text\n",
     "    with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
     "        file.write(text_data)\n",
     "else:\n",
     "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
-    "        text_data = file.read()"
+    "        text_data = file.read()\n",
+    "\n",
+    "\n",
+    "# The book originally used the following code below\n",
+    "# However, urllib uses older protocol settings that\n",
+    "# can cause problems for some readers using a VPN.\n",
+    "# The `requests` version above is more robust\n",
+    "# in that regard.\n",
+    "\n",
+    " \n",
+    "# import os\n",
+    "# import urllib.request\n",
+    "\n",
+    "# file_path = \"the-verdict.txt\"\n",
+    "# url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
+    "\n",
+    "# if not os.path.exists(file_path):\n",
+    "#     with urllib.request.urlopen(url) as response:\n",
+    "#         text_data = response.read().decode('utf-8')\n",
+    "#     with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
+    "#         file.write(text_data)\n",
+    "# else:\n",
+    "#     with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+    "#         text_data = file.read()"
    ]
   },
   {

@@ -491,7 +491,7 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "import urllib.request\n",
+    "import requests\n",
     "from previous_chapters import create_dataloader_v1\n",
     "\n",
     "\n",
@@ -499,6 +499,25 @@
     "url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
     "\n",
     "if not os.path.exists(file_path):\n",
+    "    response = requests.get(url, timeout=30)\n",
+    "    response.raise_for_status()\n",
+    "    text_data = response.text\n",
+    "    with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
+    "        file.write(text_data)\n",
+    "else:\n",
+    "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+    "        text_data = file.read()\n",
+    "\n",
+    "# The book originally used the following code below\n",
+    "# However, urllib uses older protocol settings that\n",
+    "# can cause problems for some readers using a VPN.\n",
+    "# The `requests` version above is more robust\n",
+    "# in that regard.\n",
+    "\n",
+    "\"\"\"\n",
+    "import urllib.request\n",
+    "\n",
+    "if not os.path.exists(file_path):\n",
     "    with urllib.request.urlopen(url) as response:\n",
     "        text_data = response.read().decode('utf-8')\n",
     "    with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
@@ -506,6 +525,7 @@
     "else:\n",
     "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
     "        text_data = file.read()\n",
+    "\"\"\"\n",
     "\n",
     "\n",
     "# Train/validation ratio\n",

@@ -5,9 +5,8 @@


 import os
-import urllib.request

-# import requests
+import requests
 import json
 import numpy as np
 import tensorflow as tf
@@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir):

 def download_file(url, destination, backup_url=None):
     def _attempt_download(download_url):
-        with urllib.request.urlopen(download_url) as response:
-            # Get the total file size from headers, defaulting to 0 if not present
-            file_size = int(response.headers.get("Content-Length", 0))
-
-            # Check if file exists and has the same size
-            if os.path.exists(destination):
-                file_size_local = os.path.getsize(destination)
-                if file_size == file_size_local:
-                    print(f"File already exists and is up-to-date: {destination}")
-                    return True  # Indicate success without re-downloading
-
-            block_size = 1024  # 1 Kilobyte
-
-            # Initialize the progress bar with total file size
-            progress_bar_description = os.path.basename(download_url)
-            with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
-                with open(destination, "wb") as file:
-                    while True:
-                        chunk = response.read(block_size)
-                        if not chunk:
-                            break
-                        file.write(chunk)
-                        progress_bar.update(len(chunk))
-            return True
+        response = requests.get(download_url, stream=True, timeout=60)
+        response.raise_for_status()
+
+        file_size = int(response.headers.get("Content-Length", 0))
+
+        # Check if file exists and has same size
+        if os.path.exists(destination):
+            file_size_local = os.path.getsize(destination)
+            if file_size and file_size == file_size_local:
+                print(f"File already exists and is up-to-date: {destination}")
+                return True
+
+        block_size = 1024  # 1 KB
+        desc = os.path.basename(download_url)
+        with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
+            with open(destination, "wb") as file:
+                for chunk in response.iter_content(chunk_size=block_size):
+                    if chunk:
+                        file.write(chunk)
+                        progress_bar.update(len(chunk))
+        return True

     try:
         if _attempt_download(url):
             return
-    except (urllib.error.HTTPError, urllib.error.URLError):
+    except requests.exceptions.RequestException:
        if backup_url is not None:
            print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
            try:
                if _attempt_download(backup_url):
                    return
-            except urllib.error.HTTPError:
+            except requests.exceptions.RequestException:
                pass

-    # If we reach here, both attempts have failed
     error_message = (
         f"Failed to download from both primary URL ({url})"
         f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."

@@ -7,9 +7,8 @@ import argparse
 import json
 import numpy as np
 import os
-import urllib.request

-# import requests
+import requests
 import tensorflow as tf
 import tiktoken
 import torch
@@ -60,18 +59,18 @@ def download_and_load_gpt2(model_size, models_dir):
     return settings, params


-"""
 def download_file(url, destination):
-    # Send a GET request to download the file in streaming mode
-    response = requests.get(url, stream=True)
+    # Send a GET request to download the file
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()

     # Get the total file size from headers, defaulting to 0 if not present
-    file_size = int(response.headers.get("content-length", 0))
+    file_size = int(response.headers.get("Content-Length", 0))

     # Check if file exists and has the same size
     if os.path.exists(destination):
         file_size_local = os.path.getsize(destination)
-        if file_size == file_size_local:
+        if file_size and file_size == file_size_local:
             print(f"File already exists and is up-to-date: {destination}")
             return

@@ -79,43 +78,12 @@ def download_file(url, destination):
     block_size = 1024  # 1 Kilobyte

     # Initialize the progress bar with total file size
-    progress_bar_description = url.split("/")[-1]  # Extract filename from URL
+    progress_bar_description = os.path.basename(url)
     with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
         # Open the destination file in binary write mode
         with open(destination, "wb") as file:
-            # Iterate over the file data in chunks
-            for chunk in response.iter_content(block_size):
-                progress_bar.update(len(chunk))  # Update progress bar
-                file.write(chunk)  # Write the chunk to the file
-"""
-
-
-def download_file(url, destination):
-    # Send a GET request to download the file
-    with urllib.request.urlopen(url) as response:
-        # Get the total file size from headers, defaulting to 0 if not present
-        file_size = int(response.headers.get("Content-Length", 0))
-
-        # Check if file exists and has the same size
-        if os.path.exists(destination):
-            file_size_local = os.path.getsize(destination)
-            if file_size == file_size_local:
-                print(f"File already exists and is up-to-date: {destination}")
-                return
-
-        # Define the block size for reading the file
-        block_size = 1024  # 1 Kilobyte
-
-        # Initialize the progress bar with total file size
-        progress_bar_description = os.path.basename(url)  # Extract filename from URL
-        with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
-            # Open the destination file in binary write mode
-            with open(destination, "wb") as file:
-                # Read the file in chunks and write to destination
-                while True:
-                    chunk = response.read(block_size)
-                    if not chunk:
-                        break
-                    file.write(chunk)
-                    progress_bar.update(len(chunk))  # Update progress bar
+            for chunk in response.iter_content(chunk_size=block_size):
+                if chunk:
+                    file.write(chunk)
+                    progress_bar.update(len(chunk))  # Update progress bar

@@ -5,8 +5,8 @@

 import matplotlib.pyplot as plt
 import os
+import requests
 import torch
-import urllib.request
 import tiktoken


@@ -141,14 +141,14 @@ def main(gpt_config, settings):
     url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

     if not os.path.exists(file_path):
-        with urllib.request.urlopen(url) as response:
-            text_data = response.read().decode('utf-8')
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        text_data = response.text
         with open(file_path, "w", encoding="utf-8") as file:
             file.write(text_data)
     else:
         with open(file_path, "r", encoding="utf-8") as file:
             text_data = file.read()

     ##############################
     # Initialize model
     ##############################

@@ -7,9 +7,7 @@

 import pytest
 from gpt_train import main
-import http.client
-from urllib.parse import urlparse
+import requests


 @pytest.fixture
 def gpt_config():
@@ -43,23 +41,23 @@ def test_main(gpt_config, other_settings):


 def check_file_size(url, expected_size):
-    parsed_url = urlparse(url)
-    if parsed_url.scheme == "https":
-        conn = http.client.HTTPSConnection(parsed_url.netloc)
-    else:
-        conn = http.client.HTTPConnection(parsed_url.netloc)
-
-    conn.request("HEAD", parsed_url.path)
-    response = conn.getresponse()
-    if response.status != 200:
-        return False, f"{url} not accessible"
-    size = response.getheader("Content-Length")
-    if size is None:
-        return False, "Content-Length header is missing"
-    size = int(size)
-    if size != expected_size:
-        return False, f"{url} file has expected size {expected_size}, but got {size}"
-    return True, f"{url} file size is correct"
+    try:
+        response = requests.head(url, allow_redirects=True, timeout=30)
+        if response.status_code != 200:
+            return False, f"{url} not accessible"
+
+        size = response.headers.get("Content-Length")
+        if size is None:
+            return False, "Content-Length header is missing"
+
+        size = int(size)
+        if size != expected_size:
+            return False, f"{url} file has expected size {expected_size}, but got {size}"
+
+        return True, f"{url} file size is correct"
+    except requests.exceptions.RequestException as e:
+        return False, f"Failed to access {url}: {e}"


 def test_model_files():
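For the rewritten `check_file_size` helper above, a hypothetical call might look like this (the URL and byte count are placeholders, not values from the repository):

```python
# Hypothetical usage of check_file_size as rewritten above.
ok, message = check_file_size(
    "https://example.com/model.bin",  # placeholder URL
    expected_size=1_000_000,          # placeholder size in bytes
)
assert ok, message
```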

@@ -134,7 +134,7 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "import urllib.request\n",
+    "import requests\n",
     "from safetensors.torch import load_file\n",
     "\n",
     "URL_DIR = {\n",
@@ -149,7 +149,10 @@
     "\n",
     "# Download file\n",
     "if not os.path.exists(output_file):\n",
-    "    urllib.request.urlretrieve(url, output_file)\n",
+    "    response = requests.get(url, timeout=30)\n",
+    "    response.raise_for_status()\n",
+    "    with open(output_file, \"wb\") as f:\n",
+    "        f.write(response.content)\n",
     "\n",
     "# Load file\n",
     "state_dict = load_file(output_file)"

@@ -144,12 +144,15 @@
    ],
    "source": [
     "import os\n",
-    "import urllib.request\n",
+    "import requests\n",
     "\n",
     "url = f\"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}\"\n",
     "\n",
     "if not os.path.exists(file_name):\n",
-    "    urllib.request.urlretrieve(url, file_name)\n",
+    "    response = requests.get(url, timeout=60)\n",
+    "    response.raise_for_status()\n",
+    "    with open(file_name, \"wb\") as f:\n",
+    "        f.write(response.content)\n",
     "    print(f\"Downloaded to {file_name}\")"
    ]
   },
@@ -276,12 +279,15 @@
    ],
    "source": [
     "import os\n",
-    "import urllib.request\n",
+    "import requests\n",
     "\n",
     "url = f\"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}\"\n",
     "\n",
     "if not os.path.exists(file_name):\n",
-    "    urllib.request.urlretrieve(url, file_name)\n",
+    "    response = requests.get(url, timeout=60)\n",
+    "    response.raise_for_status()\n",
+    "    with open(file_name, \"wb\") as f:\n",
+    "        f.write(response.content)\n",
     "    print(f\"Downloaded to {file_name}\")"
    ]
   },

@@ -58,12 +58,17 @@ This automatically downloads the weight file based on the model choice above:

 ```python
 import os
-import urllib.request
+import requests

 url = f"https://huggingface.co/rasbt/llama-3.2-from-scratch/resolve/main/{MODEL_FILE}"

 if not os.path.exists(MODEL_FILE):
-    urllib.request.urlretrieve(url, MODEL_FILE)
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()
+    with open(MODEL_FILE, "wb") as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                f.write(chunk)
     print(f"Downloaded to {MODEL_FILE}")
 ```


@@ -6,9 +6,9 @@

 import os
 import time
-import urllib.request

 import matplotlib.pyplot as plt
+import requests
 import torch
 import torch.nn as nn
 from torch.utils.data import Dataset, DataLoader
@@ -397,8 +397,9 @@ def main(gpt_config, settings):
     url = "https://www.gutenberg.org/cache/epub/145/pg145.txt"

     if not os.path.exists(file_path):
-        with urllib.request.urlopen(url) as response:
-            text_data = response.read().decode('utf-8')
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        text_data = response.text
         with open(file_path, "w", encoding="utf-8") as file:
             file.write(text_data)
     else:

@@ -6,9 +6,9 @@

 import os
 import time
-import urllib.request

 import matplotlib.pyplot as plt
+import requests
 import torch
 import torch.nn as nn
 from torch.utils.data import Dataset, DataLoader
@@ -468,11 +468,11 @@ def main(gpt_config, settings, rank, world_size):
     # NEW: Only download 1 time
     if rank == 0:
         if not os.path.exists(file_path):
-            with urllib.request.urlopen(url) as response:
-                text_data = response.read().decode('utf-8')
+            response = requests.get(url, timeout=30)
+            response.raise_for_status()
+            text_data = response.text
             with open(file_path, "w", encoding="utf-8") as file:
                 file.write(text_data)

     # NEW: All processes wait until rank 0 is done, using the GPU index.
     torch.distributed.barrier(device_ids=[device.index])


@@ -186,6 +186,56 @@
    }
   ],
   "source": [
+    "import requests\n",
+    "import zipfile\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "\n",
+    "url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
+    "zip_path = \"sms_spam_collection.zip\"\n",
+    "extracted_path = \"sms_spam_collection\"\n",
+    "data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
+    "\n",
+    "\n",
+    "def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):\n",
+    "    if data_file_path.exists():\n",
+    "        print(f\"{data_file_path} already exists. Skipping download and extraction.\")\n",
+    "        return\n",
+    "\n",
+    "    # Downloading the file\n",
+    "    response = requests.get(url, stream=True, timeout=60)\n",
+    "    response.raise_for_status()\n",
+    "    with open(zip_path, \"wb\") as out_file:\n",
+    "        for chunk in response.iter_content(chunk_size=8192):\n",
+    "            if chunk:\n",
+    "                out_file.write(chunk)\n",
+    "\n",
+    "    # Unzipping the file\n",
+    "    with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n",
+    "        zip_ref.extractall(extracted_path)\n",
+    "\n",
+    "    # Add .tsv file extension\n",
+    "    original_file_path = Path(extracted_path) / \"SMSSpamCollection\"\n",
+    "    os.rename(original_file_path, data_file_path)\n",
+    "    print(f\"File downloaded and saved as {data_file_path}\")\n",
+    "\n",
+    "\n",
+    "try:\n",
+    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
+    "except (requests.exceptions.RequestException, TimeoutError) as e:\n",
+    "    print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
+    "    url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
+    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
+    "\n",
+    "\n",
+    "\n",
+    "# The book originally used the following code below\n",
+    "# However, urllib uses older protocol settings that\n",
+    "# can cause problems for some readers using a VPN.\n",
+    "# The `requests` version above is more robust\n",
+    "# in that regard.\n",
+    "\n",
+    "\"\"\"\n",
     "import urllib.request\n",
     "import zipfile\n",
     "import os\n",
@@ -220,7 +270,8 @@
     "except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
     "    print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
     "    url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
-    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) "
+    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
+    "\"\"\""
   ]
  },
  {

@@ -5,7 +5,7 @@

 # This is a summary file containing the main takeaways from chapter 6.

-import urllib.request
+import requests
 import zipfile
 import os
 from pathlib import Path
@@ -27,9 +27,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
         return

     # Downloading the file
-    with urllib.request.urlopen(url) as response:
-        with open(zip_path, "wb") as out_file:
-            out_file.write(response.read())
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()
+    with open(zip_path, "wb") as out_file:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                out_file.write(chunk)

     # Unzipping the file
     with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -259,7 +262,7 @@ if __name__ == "__main__":

     try:
         download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
-    except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
+    except (requests.exceptions.RequestException, TimeoutError) as e:
         print(f"Primary URL failed: {e}. Trying backup URL...")
         url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
         download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

@@ -8,10 +8,10 @@ import math
 import os
 from pathlib import Path
 import time
-import urllib.request
 import zipfile

 import pandas as pd
+import requests
 import tiktoken
 import torch
 from torch.utils.data import DataLoader
@@ -113,9 +113,12 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path):
         return

     # Downloading the file
-    with urllib.request.urlopen(url) as response:
-        with open(zip_path, "wb") as out_file:
-            out_file.write(response.read())
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()
+    with open(zip_path, "wb") as out_file:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                out_file.write(chunk)

     # Unzipping the file
     with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -608,11 +611,11 @@ if __name__ == "__main__":
     base_path = Path(".")
     file_names = ["train.csv", "validation.csv", "test.csv"]
     all_exist = all((base_path / file_name).exists() for file_name in file_names)

     if not all_exist:
         try:
             download_and_unzip(url, zip_path, extract_to, new_file_path)
-        except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
+        except (requests.exceptions.RequestException, TimeoutError) as e:
             print(f"Primary URL failed: {e}. Trying backup URL...")
             backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
             download_and_unzip(backup_url, zip_path, extract_to, new_file_path)

@@ -7,7 +7,7 @@ import os
 import sys
 import tarfile
 import time
-import urllib.request
+import requests
 import pandas as pd


@@ -32,7 +32,15 @@ def download_and_extract_dataset(dataset_url, target_file, directory):
     if not os.path.exists(directory):
         if os.path.exists(target_file):
             os.remove(target_file)
-        urllib.request.urlretrieve(dataset_url, target_file, reporthook)
+
+        response = requests.get(dataset_url, stream=True, timeout=60)
+        response.raise_for_status()
+
+        with open(target_file, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:
+                    f.write(chunk)
+
         print("\nExtracting dataset ...")
         with tarfile.open(target_file, "r:gz") as tar:
             tar.extractall()

@@ -7,7 +7,7 @@ import argparse
 import os
 from pathlib import Path
 import time
-import urllib
+import requests
 import zipfile

 import pandas as pd
@@ -62,9 +62,12 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path):
         return

     # Downloading the file
-    with urllib.request.urlopen(url) as response:
-        with open(zip_path, "wb") as out_file:
-            out_file.write(response.read())
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()
+    with open(zip_path, "wb") as out_file:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                out_file.write(chunk)

     # Unzipping the file
     with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -412,7 +415,7 @@ if __name__ == "__main__":
     if not all_exist:
         try:
             download_and_unzip(url, zip_path, extract_to, new_file_path)
-        except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
+        except (requests.exceptions.RequestException, TimeoutError) as e:
             print(f"Primary URL failed: {e}. Trying backup URL...")
             backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
             download_and_unzip(backup_url, zip_path, extract_to, new_file_path)
@@ -169,10 +169,33 @@
 "source": [
 "import json\n",
 "import os\n",
-"import urllib\n",
+"import requests\n",
 "\n",
 "\n",
 "def download_and_load_file(file_path, url):\n",
+"    if not os.path.exists(file_path):\n",
+"        response = requests.get(url, timeout=30)\n",
+"        response.raise_for_status()\n",
+"        text_data = response.text\n",
+"        with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
+"            file.write(text_data)\n",
+"\n",
+"    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+"        data = json.load(file)\n",
+"\n",
+"    return data\n",
+"\n",
+"\n",
+"# The book originally used the following code below\n",
+"# However, urllib uses older protocol settings that\n",
+"# can cause problems for some readers using a VPN.\n",
+"# The `requests` version above is more robust\n",
+"# in that regard.\n",
+"\n",
+"\"\"\"\n",
+"import urllib\n",
+"\n",
+"def download_and_load_file(file_path, url):\n",
 "\n",
 "    if not os.path.exists(file_path):\n",
 "        with urllib.request.urlopen(url) as response:\n",
@@ -180,15 +203,15 @@
 "    with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
 "        file.write(text_data)\n",
 "\n",
-"    # The book originally contained this unnecessary \"else\" clause:\n",
-"    #else:\n",
-"    #    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
-"    #        text_data = file.read()\n",
+"    else:\n",
+"        with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+"            text_data = file.read()\n",
 "\n",
 "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
 "        data = json.load(file)\n",
 "\n",
 "    return data\n",
+"\"\"\"\n",
 "\n",
 "\n",
 "file_path = \"instruction-data.json\"\n",
@@ -2490,7 +2513,8 @@
 }
 ],
 "source": [
-"import urllib.request\n",
+"import requests  # noqa: F811\n",
+"# import urllib.request\n",
 "\n",
 "def query_model(\n",
 "    prompt,\n",
@@ -2512,7 +2536,8 @@
 "    }\n",
 "    }\n",
 "\n",
-"\n",
+"    \n",
+"    \"\"\"\n",
 "    # Convert the dictionary to a JSON formatted string and encode it to bytes\n",
 "    payload = json.dumps(data).encode(\"utf-8\")\n",
 "\n",
@@ -2536,6 +2561,26 @@
 "            response_data += response_json[\"message\"][\"content\"]\n",
 "\n",
 "    return response_data\n",
+"    \"\"\"\n",
+"\n",
+"    # The book originally used the commented-out above, which is based\n",
+"    # on urllib. It works generally fine, but some readers reported\n",
+"    # issues with using urlib when using a (company) VPN.\n",
+"    # The code below uses the requests library, which doesn't seem\n",
+"    # to have these issues.\n",
+"\n",
+"    # Send the POST request\n",
+"    with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
+"        r.raise_for_status()\n",
+"        response_data = \"\"\n",
+"        for line in r.iter_lines(decode_unicode=True):\n",
+"            if not line:\n",
+"                continue\n",
+"            response_json = json.loads(line)\n",
+"            if \"message\" in response_json:\n",
+"                response_data += response_json[\"message\"][\"content\"]\n",
+"\n",
+"    return response_data\n",
 "\n",
 "\n",
 "model = \"llama3\"\n",
@@ -12,10 +12,10 @@ import math
 import os
 import re
 import time
-import urllib

 import matplotlib.pyplot as plt
 from matplotlib.ticker import MaxNLocator
+import requests
 import tiktoken
 import torch
 from torch.utils.data import Dataset, DataLoader
@@ -234,17 +234,17 @@ def custom_collate_with_masking_fn(


 def download_and_load_file(file_path, url):
-
     if not os.path.exists(file_path):
-        with urllib.request.urlopen(url) as response:
-            text_data = response.read().decode("utf-8")
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        text_data = response.text
         with open(file_path, "w", encoding="utf-8") as file:
             file.write(text_data)
     else:
         with open(file_path, "r", encoding="utf-8") as file:
             text_data = file.read()

-    with open(file_path, "r") as file:
+    with open(file_path, "r", encoding="utf-8") as file:
         data = json.load(file)

     return data
@@ -5,11 +5,10 @@

 import os
-import urllib.request

-# import requests
 import json

 import numpy as np
+import requests
 import tensorflow as tf
 from tqdm import tqdm

@@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir):

 def download_file(url, destination, backup_url=None):
     def _attempt_download(download_url):
-        with urllib.request.urlopen(download_url) as response:
-            # Get the total file size from headers, defaulting to 0 if not present
-            file_size = int(response.headers.get("Content-Length", 0))
-
-            # Check if file exists and has the same size
-            if os.path.exists(destination):
-                file_size_local = os.path.getsize(destination)
-                if file_size == file_size_local:
-                    print(f"File already exists and is up-to-date: {destination}")
-                    return True  # Indicate success without re-downloading
-
-            block_size = 1024  # 1 Kilobyte
-
-            # Initialize the progress bar with total file size
-            progress_bar_description = os.path.basename(download_url)
-            with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
-                with open(destination, "wb") as file:
-                    while True:
-                        chunk = response.read(block_size)
-                        if not chunk:
-                            break
-                        file.write(chunk)
-                        progress_bar.update(len(chunk))
-            return True
+        response = requests.get(download_url, stream=True, timeout=60)
+        response.raise_for_status()
+
+        file_size = int(response.headers.get("Content-Length", 0))
+
+        # Check if file exists and has same size
+        if os.path.exists(destination):
+            file_size_local = os.path.getsize(destination)
+            if file_size and file_size == file_size_local:
+                print(f"File already exists and is up-to-date: {destination}")
+                return True
+
+        block_size = 1024  # 1 KB
+        desc = os.path.basename(download_url)
+        with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
+            with open(destination, "wb") as file:
+                for chunk in response.iter_content(chunk_size=block_size):
+                    if chunk:
+                        file.write(chunk)
+                        progress_bar.update(len(chunk))
+        return True

     try:
         if _attempt_download(url):
             return
-    except (urllib.error.HTTPError, urllib.error.URLError):
+    except requests.exceptions.RequestException:
         if backup_url is not None:
             print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
             try:
                 if _attempt_download(backup_url):
                     return
-            except urllib.error.HTTPError:
+            except requests.exceptions.RequestException:
                 pass

-    # If we reach here, both attempts have failed
     error_message = (
         f"Failed to download from both primary URL ({url})"
         f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."
@@ -97,37 +92,6 @@ def download_file(url, destination, backup_url=None):
         print(f"An unexpected error occurred: {e}")


-# Alternative way using `requests`
-"""
-def download_file(url, destination):
-    # Send a GET request to download the file in streaming mode
-    response = requests.get(url, stream=True)
-
-    # Get the total file size from headers, defaulting to 0 if not present
-    file_size = int(response.headers.get("content-length", 0))
-
-    # Check if file exists and has the same size
-    if os.path.exists(destination):
-        file_size_local = os.path.getsize(destination)
-        if file_size == file_size_local:
-            print(f"File already exists and is up-to-date: {destination}")
-            return
-
-    # Define the block size for reading the file
-    block_size = 1024  # 1 Kilobyte
-
-    # Initialize the progress bar with total file size
-    progress_bar_description = url.split("/")[-1]  # Extract filename from URL
-    with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
-        # Open the destination file in binary write mode
-        with open(destination, "wb") as file:
-            # Iterate over the file data in chunks
-            for chunk in response.iter_content(block_size):
-                progress_bar.update(len(chunk))  # Update progress bar
-                file.write(chunk)  # Write the chunk to the file
-"""
-
-
 def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
     # Initialize parameters dictionary with empty blocks for each layer
     params = {"blocks": [{} for _ in range(settings["n_layer"])]}
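Note: besides the transport swap, the rewritten `_attempt_download` above guards the size comparison with `if file_size and ...`. When a server omits the `Content-Length` header (as streamed or chunked responses may), `file_size` defaults to 0, and the old unguarded comparison could match a zero-byte local file and wrongly skip a needed re-download. A small illustration with made-up values:

    file_size = 0        # no Content-Length header in the response
    file_size_local = 0  # a failed earlier download left an empty file behind

    file_size == file_size_local                # True -> old check would skip the download
    file_size and file_size == file_size_local  # 0 (falsy) -> new check falls through and re-downloads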
@@ -11,9 +11,9 @@ import json
 import os
 import re
 import time
-import urllib

 import matplotlib.pyplot as plt
+import requests
 import tiktoken
 import torch
 from torch.utils.data import Dataset, DataLoader
@@ -97,14 +97,14 @@ def custom_collate_fn(


 def download_and_load_file(file_path, url):
-
     if not os.path.exists(file_path):
-        with urllib.request.urlopen(url) as response:
-            text_data = response.read().decode("utf-8")
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        text_data = response.text
         with open(file_path, "w", encoding="utf-8") as file:
             file.write(text_data)

-    with open(file_path, "r") as file:
+    with open(file_path, "r", encoding="utf-8") as file:
         data = json.load(file)

     return data
@@ -8,7 +8,7 @@
 import json
 import psutil
 from tqdm import tqdm
-import urllib.request
+import requests


 def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"):
@@ -25,23 +25,16 @@ def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"):
         }
     }

-    # Convert the dictionary to a JSON formatted string and encode it to bytes
-    payload = json.dumps(data).encode("utf-8")
-
-    # Create a request object, setting the method to POST and adding necessary headers
-    request = urllib.request.Request(url, data=payload, method="POST")
-    request.add_header("Content-Type", "application/json")
-
-    # Send the request and capture the response
-    response_data = ""
-    with urllib.request.urlopen(request) as response:
-        # Read and decode the response
-        while True:
-            line = response.readline().decode("utf-8")
+    # Send the POST request
+    with requests.post(url, json=data, stream=True, timeout=30) as r:
+        r.raise_for_status()
+        response_data = ""
+        for line in r.iter_lines(decode_unicode=True):
             if not line:
-                break
+                continue
             response_json = json.loads(line)
-            response_data += response_json["message"]["content"]
+            if "message" in response_json:
+                response_data += response_json["message"]["content"]

     return response_data
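Note: Ollama's `/api/chat` endpoint streams its reply as newline-delimited JSON objects, which is why the rewrite reads the response with `iter_lines(decode_unicode=True)` and concatenates the `message.content` field from each line; the added `if "message" in response_json` guard simply skips any line that carries no message payload. A usage sketch, assuming a local Ollama server is running on the default port and the model has been pulled (e.g. with `ollama pull llama3`):

    result = query_model("What do Llamas eat?")
    print(result)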
@@ -215,8 +215,8 @@
 }
 ],
 "source": [
-"import urllib.request\n",
 "import json\n",
+"import requests\n",
 "\n",
 "\n",
 "def query_model(prompt, model=\"llama3\", url=\"http://localhost:11434/api/chat\"):\n",
@@ -236,27 +236,19 @@
 "    }\n",
 "    }\n",
 "\n",
-"    # Convert the dictionary to a JSON formatted string and encode it to bytes\n",
-"    payload = json.dumps(data).encode(\"utf-8\")\n",
-"\n",
-"    # Create a request object, setting the method to POST and adding necessary headers\n",
-"    request = urllib.request.Request(url, data=payload, method=\"POST\")\n",
-"    request.add_header(\"Content-Type\", \"application/json\")\n",
-"\n",
-"    # Send the request and capture the response\n",
-"    response_data = \"\"\n",
-"    with urllib.request.urlopen(request) as response:\n",
-"        # Read and decode the response\n",
-"        while True:\n",
-"            line = response.readline().decode(\"utf-8\")\n",
+"    # Send the POST request\n",
+"    with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
+"        r.raise_for_status()\n",
+"        response_data = \"\"\n",
+"        for line in r.iter_lines(decode_unicode=True):\n",
 "            if not line:\n",
-"                break\n",
+"                continue\n",
 "            response_json = json.loads(line)\n",
-"            response_data += response_json[\"message\"][\"content\"]\n",
+"            if \"message\" in response_json:\n",
+"                response_data += response_json[\"message\"][\"content\"]\n",
 "\n",
 "    return response_data\n",
 "\n",
-"\n",
 "result = query_model(\"What do Llamas eat?\")\n",
 "print(result)"
 ]
@@ -640,7 +632,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.6"
+"version": "3.10.16"
 }
 },
 "nbformat": 4,
@@ -274,8 +274,8 @@
 }
 ],
 "source": [
-"import urllib.request\n",
 "import json\n",
+"import requests\n",
 "\n",
 "\n",
 "def query_model(prompt, model=\"llama3.1:70b\", url=\"http://localhost:11434/api/chat\"):\n",
@@ -294,23 +294,16 @@
 "    }\n",
 "    }\n",
 "\n",
-"    # Convert the dictionary to a JSON formatted string and encode it to bytes\n",
-"    payload = json.dumps(data).encode(\"utf-8\")\n",
-"\n",
-"    # Create a request object, setting the method to POST and adding necessary headers\n",
-"    request = urllib.request.Request(url, data=payload, method=\"POST\")\n",
-"    request.add_header(\"Content-Type\", \"application/json\")\n",
-"\n",
-"    # Send the request and capture the response\n",
-"    response_data = \"\"\n",
-"    with urllib.request.urlopen(request) as response:\n",
-"        # Read and decode the response\n",
-"        while True:\n",
-"            line = response.readline().decode(\"utf-8\")\n",
+"    # Send the POST request\n",
+"    with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
+"        r.raise_for_status()\n",
+"        response_data = \"\"\n",
+"        for line in r.iter_lines(decode_unicode=True):\n",
 "            if not line:\n",
-"                break\n",
+"                continue\n",
 "            response_json = json.loads(line)\n",
-"            response_data += response_json[\"message\"][\"content\"]\n",
+"            if \"message\" in response_json:\n",
+"                response_data += response_json[\"message\"][\"content\"]\n",
 "\n",
 "    return response_data\n",
 "\n",
@@ -587,7 +580,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.6"
+"version": "3.10.16"
 }
 },
 "nbformat": 4,
@@ -231,23 +231,21 @@
 "source": [
 "import json\n",
 "import os\n",
-"import urllib\n",
+"import requests\n",
 "\n",
 "\n",
 "def download_and_load_file(file_path, url):\n",
-"\n",
 "    if not os.path.exists(file_path):\n",
-"        with urllib.request.urlopen(url) as response:\n",
-"            text_data = response.read().decode(\"utf-8\")\n",
+"        response = requests.get(url, timeout=30)\n",
+"        response.raise_for_status()\n",
+"        text_data = response.text\n",
 "        with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
 "            file.write(text_data)\n",
 "    else:\n",
 "        with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
 "            text_data = file.read()\n",
 "\n",
-"    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
-"        data = json.load(file)\n",
-"\n",
+"    data = json.loads(text_data)\n",
 "    return data\n",
 "\n",
 "\n",
@@ -194,8 +194,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"import urllib.request\n",
 "import json\n",
+"import requests\n",
 "\n",
 "def query_model(prompt, model=\"llama3\", url=\"http://localhost:11434/api/chat\", role=\"user\"):\n",
 "    # Create the data payload as a dictionary\n",
@@ -209,25 +209,21 @@
 "    ]\n",
 "    }\n",
 "\n",
-"    # Convert the dictionary to a JSON formatted string and encode it to bytes\n",
-"    payload = json.dumps(data).encode(\"utf-8\")\n",
-"\n",
-"    # Create a request object, setting the method to POST and adding necessary headers\n",
-"    request = urllib.request.Request(url, data=payload, method=\"POST\")\n",
-"    request.add_header(\"Content-Type\", \"application/json\")\n",
-"\n",
-"    # Send the request and capture the response\n",
-"    response_data = \"\"\n",
-"    with urllib.request.urlopen(request) as response:\n",
-"        # Read and decode the response\n",
-"        while True:\n",
-"            line = response.readline().decode(\"utf-8\")\n",
+"    # Send the POST request\n",
+"    with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
+"        r.raise_for_status()\n",
+"        response_data = \"\"\n",
+"        for line in r.iter_lines(decode_unicode=True):\n",
 "            if not line:\n",
-"                break\n",
+"                continue\n",
 "            response_json = json.loads(line)\n",
-"            response_data += response_json[\"message\"][\"content\"]\n",
+"            if \"message\" in response_json:\n",
+"                response_data += response_json[\"message\"][\"content\"]\n",
 "\n",
-"    return response_data"
+"    return response_data\n",
+"\n",
+"result = query_model(\"What do Llamas eat?\")\n",
+"print(result)"
 ]
 },
 {
@@ -498,7 +494,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.4"
+"version": "3.10.16"
 }
 },
 "nbformat": 4,
@@ -7,11 +7,11 @@ from .ch04 import generate_text_simple

 import json
 import os
-import urllib.request

 import numpy as np
 import matplotlib.pyplot as plt
 from matplotlib.ticker import MaxNLocator
+import requests
 import torch
 from tqdm import tqdm

@@ -279,44 +279,40 @@ def download_and_load_gpt2(model_size, models_dir):

 def download_file(url, destination, backup_url=None):
     def _attempt_download(download_url):
-        with urllib.request.urlopen(download_url) as response:
-            # Get the total file size from headers, defaulting to 0 if not present
-            file_size = int(response.headers.get("Content-Length", 0))
-
-            # Check if file exists and has the same size
-            if os.path.exists(destination):
-                file_size_local = os.path.getsize(destination)
-                if file_size == file_size_local:
-                    print(f"File already exists and is up-to-date: {destination}")
-                    return True  # Indicate success without re-downloading
-
-            block_size = 1024  # 1 Kilobyte
-
-            # Initialize the progress bar with total file size
-            progress_bar_description = os.path.basename(download_url)
-            with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
-                with open(destination, "wb") as file:
-                    while True:
-                        chunk = response.read(block_size)
-                        if not chunk:
-                            break
-                        file.write(chunk)
-                        progress_bar.update(len(chunk))
-            return True
+        response = requests.get(download_url, stream=True, timeout=60)
+        response.raise_for_status()
+
+        file_size = int(response.headers.get("Content-Length", 0))
+
+        # Check if file exists and has same size
+        if os.path.exists(destination):
+            file_size_local = os.path.getsize(destination)
+            if file_size and file_size == file_size_local:
+                print(f"File already exists and is up-to-date: {destination}")
+                return True
+
+        block_size = 1024  # 1 KB
+        desc = os.path.basename(download_url)
+        with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
+            with open(destination, "wb") as file:
+                for chunk in response.iter_content(chunk_size=block_size):
+                    if chunk:
+                        file.write(chunk)
+                        progress_bar.update(len(chunk))
+        return True

     try:
         if _attempt_download(url):
             return
-    except (urllib.error.HTTPError, urllib.error.URLError):
+    except requests.exceptions.RequestException:
         if backup_url is not None:
             print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
             try:
                 if _attempt_download(backup_url):
                     return
-            except urllib.error.HTTPError:
+            except requests.exceptions.RequestException:
                 pass

-    # If we reach here, both attempts have failed
     error_message = (
         f"Failed to download from both primary URL ({url})"
         f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."
@@ -4,11 +4,11 @@
 # Code: https://github.com/rasbt/LLMs-from-scratch


-import urllib.request
 import zipfile
 import os
 from pathlib import Path

+import requests
 import matplotlib.pyplot as plt
 from torch.utils.data import Dataset
 import torch
@@ -21,9 +21,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
         return

     # Downloading the file
-    with urllib.request.urlopen(url) as response:
-        with open(zip_path, "wb") as out_file:
-            out_file.write(response.read())
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()
+    with open(zip_path, "wb") as out_file:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                out_file.write(chunk)

     # Unzipping the file
     with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -6,7 +6,7 @@
 import json
 import os
 import psutil
-import urllib
+import requests

 import torch
 from tqdm import tqdm
@@ -14,24 +14,46 @@ from torch.utils.data import Dataset


 def download_and_load_file(file_path, url):
-
     if not os.path.exists(file_path):
-        with urllib.request.urlopen(url) as response:
-            text_data = response.read().decode("utf-8")
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        text_data = response.text
         with open(file_path, "w", encoding="utf-8") as file:
             file.write(text_data)

-    # The book originally contained this unnecessary "else" clause:
-    # else:
-    #     with open(file_path, "r", encoding="utf-8") as file:
-    #         text_data = file.read()
-
     with open(file_path, "r", encoding="utf-8") as file:
         data = json.load(file)

     return data


+# The book originally used the following code below
+# However, urllib uses older protocol settings that
+# can cause problems for some readers using a VPN.
+# The `requests` version above is more robust
+# in that regard.
+
+
+# import urllib
+
+# def download_and_load_file(file_path, url):
+
+#     if not os.path.exists(file_path):
+#         with urllib.request.urlopen(url) as response:
+#             text_data = response.read().decode("utf-8")
+#         with open(file_path, "w", encoding="utf-8") as file:
+#             file.write(text_data)
+
+#     else:
+#         with open(file_path, "r", encoding="utf-8") as file:
+#             text_data = file.read()
+
+#     with open(file_path, "r", encoding="utf-8") as file:
+#         data = json.load(file)
+
+#     return data
+
+
 def format_input(entry):
     instruction_text = (
         f"Below is an instruction that describes a task. "
@@ -202,27 +224,16 @@ def query_model(
         }
     }

-    # Convert the dictionary to a JSON formatted string and encode it to bytes
-    payload = json.dumps(data).encode("utf-8")
-
-    # Create a request object, setting the method to POST and adding necessary headers
-    request = urllib.request.Request(
-        url,
-        data=payload,
-        method="POST"
-    )
-    request.add_header("Content-Type", "application/json")
-
-    # Send the request and capture the response
-    response_data = ""
-    with urllib.request.urlopen(request) as response:
-        # Read and decode the response
-        while True:
-            line = response.readline().decode("utf-8")
+    # Send the POST request
+    with requests.post(url, json=data, stream=True, timeout=30) as r:
+        r.raise_for_status()
+        response_data = ""
+        for line in r.iter_lines(decode_unicode=True):
             if not line:
-                break
+                continue
             response_json = json.loads(line)
-            response_data += response_json["message"]["content"]
+            if "message" in response_json:
+                response_data += response_json["message"]["content"]

     return response_data
@@ -6,9 +6,9 @@
 import os
 import json
 import re
-import urllib.request
 from pathlib import Path

+import requests
 import torch
 import torch.nn as nn

@@ -660,7 +660,12 @@ def download_from_huggingface(repo_id, filename, local_dir, revision="main"):
         print(f"File already exists: {dest_path}")
     else:
         print(f"Downloading {url} to {dest_path}...")
-        urllib.request.urlretrieve(url, dest_path)
+        response = requests.get(url, stream=True, timeout=60)
+        response.raise_for_status()
+        with open(dest_path, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:
+                    f.write(chunk)

     return dest_path

@@ -12,9 +12,9 @@ from llms_from_scratch.ch06 import (
 from llms_from_scratch.appendix_e import replace_linear_with_lora

 from pathlib import Path
-import urllib

 import pandas as pd
+import requests
 import tiktoken
 import torch
 from torch.utils.data import DataLoader, Subset
@@ -35,7 +35,7 @@ def test_train_classifier_lora(tmp_path):
         download_and_unzip_spam_data(
             url, zip_path, extracted_path, data_file_path
         )
-    except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
+    except (requests.exceptions.RequestException, TimeoutError) as e:
         print(f"Primary URL failed: {e}. Trying backup URL...")
         backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
         download_and_unzip_spam_data(
@@ -6,8 +6,8 @@
 from llms_from_scratch.ch02 import create_dataloader_v1

 import os
-import urllib.request

+import requests
 import pytest
 import torch

@@ -16,11 +16,17 @@ import torch
 def test_dataloader(tmp_path, file_name):

     if not os.path.exists("the-verdict.txt"):
-        url = ("https://raw.githubusercontent.com/rasbt/"
-               "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
-               "the-verdict.txt")
+        url = (
+            "https://raw.githubusercontent.com/rasbt/"
+            "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
+            "the-verdict.txt"
+        )
         file_path = "the-verdict.txt"
-        urllib.request.urlretrieve(url, file_path)
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        with open(file_path, "wb") as f:
+            f.write(response.content)

     with open("the-verdict.txt", "r", encoding="utf-8") as f:
         raw_text = f.read()
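Note: the test above writes `response.content` (raw bytes) into a file opened in binary mode, while the other rewritten call sites write `response.text` (a decoded str) into files opened in text mode; mixing the two raises a `TypeError`. A minimal illustration, assuming any reachable `url`:

    response = requests.get(url, timeout=30)
    response.raise_for_status()

    with open("as-text.txt", "w", encoding="utf-8") as f:
        f.write(response.text)      # str, decoded using the response charset

    with open("as-bytes.bin", "wb") as f:
        f.write(response.content)   # bytes, exactly as received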
@@ -8,8 +8,8 @@ from llms_from_scratch.ch04 import GPTModel, GPTModelFast
 from llms_from_scratch.ch05 import train_model_simple

 import os
-import urllib

+import requests
 import pytest
 import tiktoken
 import torch
@@ -46,8 +46,9 @@ def test_train_simple(tmp_path, ModelClass):
     url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

     if not os.path.exists(file_path):
-        with urllib.request.urlopen(url) as response:
-            text_data = response.read().decode("utf-8")
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        text_data = response.text
         with open(file_path, "w", encoding="utf-8") as f:
             f.write(text_data)
     else:
@@ -11,8 +11,8 @@ from llms_from_scratch.ch06 import (
 )

 from pathlib import Path
-import urllib

+import requests
 import pandas as pd
 import tiktoken
 import torch
@@ -34,7 +34,7 @@ def test_train_classifier(tmp_path):
         download_and_unzip_spam_data(
             url, zip_path, extracted_path, data_file_path
         )
-    except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
+    except (requests.exceptions.RequestException, TimeoutError) as e:
         print(f"Primary URL failed: {e}. Trying backup URL...")
         backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
         download_and_unzip_spam_data(
@@ -9,10 +9,9 @@ import ast
 import re
 import types
 from pathlib import Path
-import urllib.request
-import urllib.parse

 import nbformat
+import requests


 def _extract_imports(src: str):
@@ -125,21 +124,24 @@ def import_definitions_from_notebook(nb_dir_or_path, notebook_name=None, *, extr
     exec(src, mod.__dict__)
     return mod


 def download_file(url, out_dir="."):
     """Simple file download utility for tests."""
-    from pathlib import Path
     out_dir = Path(out_dir)
     out_dir.mkdir(parents=True, exist_ok=True)
-    filename = Path(urllib.parse.urlparse(url).path).name
+    filename = Path(url).name
     dest = out_dir / filename

     if dest.exists():
         return dest

     try:
-        with urllib.request.urlopen(url) as response:
-            with open(dest, 'wb') as f:
-                f.write(response.read())
+        response = requests.get(url, stream=True, timeout=30)
+        response.raise_for_status()
+        with open(dest, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:
+                    f.write(chunk)
         return dest
     except Exception as e:
         raise RuntimeError(f"Failed to download {url}: {e}")