mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
Switch from urllib to requests to improve reliability (#867)
* Switch from urllib to requests to improve reliability
* Keep ruff linter-specific
* update
* update
* update
This commit is contained in:
committed by
GitHub
parent
8552565bda
commit
7bd263144e
14
.github/workflows/basic-tests-latest-python.yml
vendored
14
.github/workflows/basic-tests-latest-python.yml
vendored
@@ -38,14 +38,14 @@ jobs:
|
||||
- name: Test Selected Python Scripts
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
pytest --ruff setup/02_installing-python-libraries/tests.py
|
||||
pytest --ruff ch04/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch05/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch06/01_main-chapter-code/tests.py
|
||||
pytest setup/02_installing-python-libraries/tests.py
|
||||
pytest ch04/01_main-chapter-code/tests.py
|
||||
pytest ch05/01_main-chapter-code/tests.py
|
||||
pytest ch06/01_main-chapter-code/tests.py
|
||||
|
||||
- name: Validate Selected Jupyter Notebooks
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
|
||||
26
.github/workflows/basic-tests-linux-uv.yml
vendored
26
.github/workflows/basic-tests-linux-uv.yml
vendored
@@ -47,24 +47,24 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
pytest --ruff setup/02_installing-python-libraries/tests.py
|
||||
pytest --ruff ch04/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch04/03_kv-cache/tests.py
|
||||
pytest --ruff ch05/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
|
||||
pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py
|
||||
pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py
|
||||
pytest --ruff ch05/12_gemma3/tests/test_gemma3_nb.py
|
||||
pytest --ruff ch05/12_gemma3/tests/test_gemma3_kv_nb.py
|
||||
pytest --ruff ch06/01_main-chapter-code/tests.py
|
||||
pytest setup/02_installing-python-libraries/tests.py
|
||||
pytest ch04/01_main-chapter-code/tests.py
|
||||
pytest ch04/03_kv-cache/tests.py
|
||||
pytest ch05/01_main-chapter-code/tests.py
|
||||
pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
|
||||
pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py
|
||||
pytest ch05/11_qwen3/tests/test_qwen3_nb.py
|
||||
pytest ch05/12_gemma3/tests/test_gemma3_nb.py
|
||||
pytest ch05/12_gemma3/tests/test_gemma3_kv_nb.py
|
||||
pytest ch06/01_main-chapter-code/tests.py
|
||||
|
||||
- name: Validate Selected Jupyter Notebooks (uv)
|
||||
shell: bash
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
|
||||
- name: Test Selected Bonus Materials
|
||||
shell: bash
|
||||
|
||||
24
.github/workflows/basic-tests-macos-uv.yml
vendored
24
.github/workflows/basic-tests-macos-uv.yml
vendored
@@ -47,20 +47,20 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
pytest --ruff setup/02_installing-python-libraries/tests.py
|
||||
pytest --ruff ch04/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch05/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
|
||||
pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py
|
||||
pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py
|
||||
pytest --ruff ch05/12_gemma3/tests/test_gemma3_nb.py
|
||||
pytest --ruff ch05/12_gemma3/tests/test_gemma3_kv_nb.py
|
||||
pytest --ruff ch06/01_main-chapter-code/tests.py
|
||||
pytest setup/02_installing-python-libraries/tests.py
|
||||
pytest ch04/01_main-chapter-code/tests.py
|
||||
pytest ch05/01_main-chapter-code/tests.py
|
||||
pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
|
||||
pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py
|
||||
pytest ch05/11_qwen3/tests/test_qwen3_nb.py
|
||||
pytest ch05/12_gemma3/tests/test_gemma3_nb.py
|
||||
pytest ch05/12_gemma3/tests/test_gemma3_kv_nb.py
|
||||
pytest ch06/01_main-chapter-code/tests.py
|
||||
|
||||
- name: Validate Selected Jupyter Notebooks (uv)
|
||||
shell: bash
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
|
||||
14
.github/workflows/basic-tests-old-pytorch.yml
vendored
14
.github/workflows/basic-tests-old-pytorch.yml
vendored
@@ -43,14 +43,14 @@ jobs:
|
||||
- name: Test Selected Python Scripts
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
pytest --ruff setup/02_installing-python-libraries/tests.py
|
||||
pytest --ruff ch04/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch05/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch06/01_main-chapter-code/tests.py
|
||||
pytest setup/02_installing-python-libraries/tests.py
|
||||
pytest ch04/01_main-chapter-code/tests.py
|
||||
pytest ch05/01_main-chapter-code/tests.py
|
||||
pytest ch06/01_main-chapter-code/tests.py
|
||||
|
||||
- name: Validate Selected Jupyter Notebooks
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
|
||||
14
.github/workflows/basic-tests-pip.yml
vendored
14
.github/workflows/basic-tests-pip.yml
vendored
@@ -46,14 +46,14 @@ jobs:
|
||||
- name: Test Selected Python Scripts
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
pytest --ruff setup/02_installing-python-libraries/tests.py
|
||||
pytest --ruff ch04/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch05/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch06/01_main-chapter-code/tests.py
|
||||
pytest setup/02_installing-python-libraries/tests.py
|
||||
pytest ch04/01_main-chapter-code/tests.py
|
||||
pytest ch05/01_main-chapter-code/tests.py
|
||||
pytest ch06/01_main-chapter-code/tests.py
|
||||
|
||||
- name: Validate Selected Jupyter Notebooks
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
14
.github/workflows/basic-tests-pixi.yml
vendored
14
.github/workflows/basic-tests-pixi.yml
vendored
@@ -47,14 +47,14 @@ jobs:
|
||||
- name: Test Selected Python Scripts
|
||||
shell: pixi run --environment tests bash -e {0}
|
||||
run: |
|
||||
pytest --ruff setup/02_installing-python-libraries/tests.py
|
||||
pytest --ruff ch04/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch05/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch06/01_main-chapter-code/tests.py
|
||||
pytest setup/02_installing-python-libraries/tests.py
|
||||
pytest ch04/01_main-chapter-code/tests.py
|
||||
pytest ch05/01_main-chapter-code/tests.py
|
||||
pytest ch06/01_main-chapter-code/tests.py
|
||||
|
||||
- name: Validate Selected Jupyter Notebooks
|
||||
shell: pixi run --environment tests bash -e {0}
|
||||
run: |
|
||||
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
|
||||
14
.github/workflows/basic-tests-pytorch-rc.yml
vendored
14
.github/workflows/basic-tests-pytorch-rc.yml
vendored
@@ -39,14 +39,14 @@ jobs:
|
||||
- name: Test Selected Python Scripts
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
pytest --ruff setup/02_installing-python-libraries/tests.py
|
||||
pytest --ruff ch04/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch05/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch06/01_main-chapter-code/tests.py
|
||||
pytest setup/02_installing-python-libraries/tests.py
|
||||
pytest ch04/01_main-chapter-code/tests.py
|
||||
pytest ch05/01_main-chapter-code/tests.py
|
||||
pytest ch06/01_main-chapter-code/tests.py
|
||||
|
||||
- name: Validate Selected Jupyter Notebooks
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
|
||||
20
.github/workflows/basic-tests-windows-uv-pip.yml
vendored
20
.github/workflows/basic-tests-windows-uv-pip.yml
vendored
@@ -49,18 +49,18 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
source .venv/Scripts/activate
|
||||
pytest --ruff setup/02_installing-python-libraries/tests.py
|
||||
pytest --ruff ch04/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch05/01_main-chapter-code/tests.py
|
||||
pytest --ruff ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
|
||||
pytest --ruff ch05/07_gpt_to_llama/tests/test_llama32_nb.py
|
||||
pytest --ruff ch05/11_qwen3/tests/test_qwen3_nb.py
|
||||
pytest --ruff ch06/01_main-chapter-code/tests.py
|
||||
pytest setup/02_installing-python-libraries/tests.py
|
||||
pytest ch04/01_main-chapter-code/tests.py
|
||||
pytest ch05/01_main-chapter-code/tests.py
|
||||
pytest ch05/07_gpt_to_llama/tests/tests_rope_and_parts.py
|
||||
pytest ch05/07_gpt_to_llama/tests/test_llama32_nb.py
|
||||
pytest ch05/11_qwen3/tests/test_qwen3_nb.py
|
||||
pytest ch06/01_main-chapter-code/tests.py
|
||||
|
||||
- name: Run Jupyter Notebook Tests
|
||||
shell: bash
|
||||
run: |
|
||||
source .venv/Scripts/activate
|
||||
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
pytest --nbval ch02/01_main-chapter-code/dataloader.ipynb
|
||||
pytest --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
|
||||
pytest --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
|
||||
@@ -121,19 +121,40 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import urllib.request\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"file_path = \"the-verdict.txt\"\n",
|
||||
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
|
||||
"\n",
|
||||
"if not os.path.exists(file_path):\n",
|
||||
" response = requests.get(url, timeout=30)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" text_data = response.text\n",
|
||||
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
|
||||
" file.write(text_data)\n",
|
||||
"else:\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" text_data = file.read()\n",
|
||||
"\n",
|
||||
"# The book originally used the code below.\n",
|
||||
"# However, urllib uses older protocol settings that\n",
|
||||
"# can cause problems for some readers using a VPN.\n",
|
||||
"# The `requests` version above is more robust\n",
|
||||
"# in that regard.\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
"import os\n",
|
||||
"import urllib.request\n",
|
||||
"\n",
|
||||
"if not os.path.exists(file_path):\n",
|
||||
" with urllib.request.urlopen(url) as response:\n",
|
||||
" text_data = response.read().decode('utf-8')\n",
|
||||
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
|
||||
" file.write(text_data)\n",
|
||||
"else:\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" text_data = file.read()"
|
||||
" text_data = file.read()\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -190,7 +190,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import urllib\n",
|
||||
"# import urllib\n",
|
||||
"import requests\n",
|
||||
"from pathlib import Path\n",
|
||||
"import pandas as pd\n",
|
||||
"from previous_chapters import (\n",
|
||||
@@ -215,13 +216,20 @@
|
||||
"extracted_path = \"sms_spam_collection\"\n",
|
||||
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||
"except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
|
||||
"except (requests.exceptions.RequestException, TimeoutError) as e:\n",
|
||||
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
|
||||
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||
"\n",
|
||||
"# The book originally used\n",
|
||||
"# except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
|
||||
"# in the code above.\n",
|
||||
"# However, some VPN users reported issues with `urllib`, so the code was updated\n",
|
||||
"# to use `requests` instead\n",
|
||||
"\n",
|
||||
"df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
|
||||
"balanced_df = create_balanced_dataset(df)\n",
|
||||
"balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})\n",
|
||||
|
||||
@@ -9,12 +9,12 @@
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import urllib
|
||||
import zipfile
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import requests
|
||||
import tiktoken
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@@ -367,9 +367,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
|
||||
return
|
||||
|
||||
# Downloading the file
|
||||
with urllib.request.urlopen(url) as response:
|
||||
with open(zip_path, "wb") as out_file:
|
||||
out_file.write(response.read())
|
||||
response = requests.get(url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
with open(zip_path, "wb") as out_file:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
out_file.write(chunk)
|
||||
|
||||
# Unzipping the file
|
||||
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
||||
|
||||
@@ -163,6 +163,30 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"if not os.path.exists(\"the-verdict.txt\"):\n",
|
||||
" url = (\n",
|
||||
" \"https://raw.githubusercontent.com/rasbt/\"\n",
|
||||
" \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
|
||||
" \"the-verdict.txt\"\n",
|
||||
" )\n",
|
||||
" file_path = \"the-verdict.txt\"\n",
|
||||
"\n",
|
||||
" response = requests.get(url, timeout=30)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" with open(file_path, \"wb\") as f:\n",
|
||||
" f.write(response.content)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# The book originally used the code below.\n",
|
||||
"# However, urllib uses older protocol settings that\n",
|
||||
"# can cause problems for some readers using a VPN.\n",
|
||||
"# The `requests` version above is more robust\n",
|
||||
"# in that regard.\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
"import os\n",
|
||||
"import urllib.request\n",
|
||||
"\n",
|
||||
@@ -171,7 +195,8 @@
|
||||
" \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
|
||||
" \"the-verdict.txt\")\n",
|
||||
" file_path = \"the-verdict.txt\"\n",
|
||||
" urllib.request.urlretrieve(url, file_path)"
|
||||
" urllib.request.urlretrieve(url, file_path)\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -823,7 +823,7 @@
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import urllib.request\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"def download_file_if_absent(url, filename, search_dirs):\n",
|
||||
" for directory in search_dirs:\n",
|
||||
@@ -834,13 +834,19 @@
|
||||
"\n",
|
||||
" target_path = os.path.join(search_dirs[0], filename)\n",
|
||||
" try:\n",
|
||||
" with urllib.request.urlopen(url) as response, open(target_path, \"wb\") as out_file:\n",
|
||||
" out_file.write(response.read())\n",
|
||||
" response = requests.get(url, stream=True, timeout=60)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" with open(target_path, \"wb\") as out_file:\n",
|
||||
" for chunk in response.iter_content(chunk_size=8192):\n",
|
||||
" if chunk:\n",
|
||||
" out_file.write(chunk)\n",
|
||||
" print(f\"Downloaded {filename} to {target_path}\")\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Failed to download {filename}. Error: {e}\")\n",
|
||||
"\n",
|
||||
" return target_path\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"verdict_path = download_file_if_absent(\n",
|
||||
" url=(\n",
|
||||
" \"https://raw.githubusercontent.com/rasbt/\"\n",
|
||||
|
||||
@@ -793,19 +793,43 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import urllib.request\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"file_path = \"the-verdict.txt\"\n",
|
||||
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
|
||||
"\n",
|
||||
"if not os.path.exists(file_path):\n",
|
||||
" with urllib.request.urlopen(url) as response:\n",
|
||||
" text_data = response.read().decode('utf-8')\n",
|
||||
" response = requests.get(url, timeout=30)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" text_data = response.text\n",
|
||||
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
|
||||
" file.write(text_data)\n",
|
||||
"else:\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" text_data = file.read()"
|
||||
" text_data = file.read()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# The book originally used the code below.\n",
|
||||
"# However, urllib uses older protocol settings that\n",
|
||||
"# can cause problems for some readers using a VPN.\n",
|
||||
"# The `requests` version above is more robust\n",
|
||||
"# in that regard.\n",
|
||||
"\n",
|
||||
" \n",
|
||||
"# import os\n",
|
||||
"# import urllib.request\n",
|
||||
"\n",
|
||||
"# file_path = \"the-verdict.txt\"\n",
|
||||
"# url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
|
||||
"\n",
|
||||
"# if not os.path.exists(file_path):\n",
|
||||
"# with urllib.request.urlopen(url) as response:\n",
|
||||
"# text_data = response.read().decode('utf-8')\n",
|
||||
"# with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
|
||||
"# file.write(text_data)\n",
|
||||
"# else:\n",
|
||||
"# with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
"# text_data = file.read()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -491,7 +491,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import urllib.request\n",
|
||||
"import requests\n",
|
||||
"from previous_chapters import create_dataloader_v1\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -499,6 +499,25 @@
|
||||
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
|
||||
"\n",
|
||||
"if not os.path.exists(file_path):\n",
|
||||
" response = requests.get(url, timeout=30)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" text_data = response.text\n",
|
||||
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
|
||||
" file.write(text_data)\n",
|
||||
"else:\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" text_data = file.read()\n",
|
||||
"\n",
|
||||
"# The book originally used the code below.\n",
|
||||
"# However, urllib uses older protocol settings that\n",
|
||||
"# can cause problems for some readers using a VPN.\n",
|
||||
"# The `requests` version above is more robust\n",
|
||||
"# in that regard.\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
"import urllib.request\n",
|
||||
"\n",
|
||||
"if not os.path.exists(file_path):\n",
|
||||
" with urllib.request.urlopen(url) as response:\n",
|
||||
" text_data = response.read().decode('utf-8')\n",
|
||||
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
|
||||
@@ -506,6 +525,7 @@
|
||||
"else:\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" text_data = file.read()\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Train/validation ratio\n",
|
||||
|
||||
@@ -5,9 +5,8 @@
|
||||
|
||||
|
||||
import os
|
||||
import urllib.request
|
||||
|
||||
# import requests
|
||||
import requests
|
||||
import json
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
@@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir):
|
||||
|
||||
def download_file(url, destination, backup_url=None):
|
||||
def _attempt_download(download_url):
|
||||
with urllib.request.urlopen(download_url) as response:
|
||||
# Get the total file size from headers, defaulting to 0 if not present
|
||||
file_size = int(response.headers.get("Content-Length", 0))
|
||||
response = requests.get(download_url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
|
||||
# Check if file exists and has the same size
|
||||
if os.path.exists(destination):
|
||||
file_size_local = os.path.getsize(destination)
|
||||
if file_size == file_size_local:
|
||||
print(f"File already exists and is up-to-date: {destination}")
|
||||
return True # Indicate success without re-downloading
|
||||
file_size = int(response.headers.get("Content-Length", 0))
|
||||
|
||||
block_size = 1024 # 1 Kilobyte
|
||||
# Check if file exists and has same size
|
||||
if os.path.exists(destination):
|
||||
file_size_local = os.path.getsize(destination)
|
||||
if file_size and file_size == file_size_local:
|
||||
print(f"File already exists and is up-to-date: {destination}")
|
||||
return True
|
||||
|
||||
# Initialize the progress bar with total file size
|
||||
progress_bar_description = os.path.basename(download_url)
|
||||
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
|
||||
with open(destination, "wb") as file:
|
||||
while True:
|
||||
chunk = response.read(block_size)
|
||||
if not chunk:
|
||||
break
|
||||
block_size = 1024 # 1 KB
|
||||
desc = os.path.basename(download_url)
|
||||
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
|
||||
with open(destination, "wb") as file:
|
||||
for chunk in response.iter_content(chunk_size=block_size):
|
||||
if chunk:
|
||||
file.write(chunk)
|
||||
progress_bar.update(len(chunk))
|
||||
return True
|
||||
return True
|
||||
|
||||
try:
|
||||
if _attempt_download(url):
|
||||
return
|
||||
except (urllib.error.HTTPError, urllib.error.URLError):
|
||||
except requests.exceptions.RequestException:
|
||||
if backup_url is not None:
|
||||
print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
|
||||
try:
|
||||
if _attempt_download(backup_url):
|
||||
return
|
||||
except urllib.error.HTTPError:
|
||||
except requests.exceptions.RequestException:
|
||||
pass
|
||||
|
||||
# If we reach here, both attempts have failed
|
||||
error_message = (
|
||||
f"Failed to download from both primary URL ({url})"
|
||||
f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."
|
||||
|
||||
@@ -7,9 +7,8 @@ import argparse
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import urllib.request
|
||||
|
||||
# import requests
|
||||
import requests
|
||||
import tensorflow as tf
|
||||
import tiktoken
|
||||
import torch
|
||||
@@ -60,18 +59,18 @@ def download_and_load_gpt2(model_size, models_dir):
|
||||
return settings, params
|
||||
|
||||
|
||||
"""
|
||||
def download_file(url, destination):
|
||||
# Send a GET request to download the file in streaming mode
|
||||
response = requests.get(url, stream=True)
|
||||
# Send a GET request to download the file
|
||||
response = requests.get(url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
|
||||
# Get the total file size from headers, defaulting to 0 if not present
|
||||
file_size = int(response.headers.get("content-length", 0))
|
||||
file_size = int(response.headers.get("Content-Length", 0))
|
||||
|
||||
# Check if file exists and has the same size
|
||||
if os.path.exists(destination):
|
||||
file_size_local = os.path.getsize(destination)
|
||||
if file_size == file_size_local:
|
||||
if file_size and file_size == file_size_local:
|
||||
print(f"File already exists and is up-to-date: {destination}")
|
||||
return
|
||||
|
||||
@@ -79,43 +78,12 @@ def download_file(url, destination):
|
||||
block_size = 1024 # 1 Kilobyte
|
||||
|
||||
# Initialize the progress bar with total file size
|
||||
progress_bar_description = url.split("/")[-1] # Extract filename from URL
|
||||
progress_bar_description = os.path.basename(url)
|
||||
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
|
||||
# Open the destination file in binary write mode
|
||||
with open(destination, "wb") as file:
|
||||
# Iterate over the file data in chunks
|
||||
for chunk in response.iter_content(block_size):
|
||||
progress_bar.update(len(chunk)) # Update progress bar
|
||||
file.write(chunk) # Write the chunk to the file
|
||||
"""
|
||||
|
||||
|
||||
def download_file(url, destination):
|
||||
# Send a GET request to download the file
|
||||
with urllib.request.urlopen(url) as response:
|
||||
# Get the total file size from headers, defaulting to 0 if not present
|
||||
file_size = int(response.headers.get("Content-Length", 0))
|
||||
|
||||
# Check if file exists and has the same size
|
||||
if os.path.exists(destination):
|
||||
file_size_local = os.path.getsize(destination)
|
||||
if file_size == file_size_local:
|
||||
print(f"File already exists and is up-to-date: {destination}")
|
||||
return
|
||||
|
||||
# Define the block size for reading the file
|
||||
block_size = 1024 # 1 Kilobyte
|
||||
|
||||
# Initialize the progress bar with total file size
|
||||
progress_bar_description = os.path.basename(url) # Extract filename from URL
|
||||
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
|
||||
# Open the destination file in binary write mode
|
||||
with open(destination, "wb") as file:
|
||||
# Read the file in chunks and write to destination
|
||||
while True:
|
||||
chunk = response.read(block_size)
|
||||
if not chunk:
|
||||
break
|
||||
for chunk in response.iter_content(chunk_size=block_size):
|
||||
if chunk:
|
||||
file.write(chunk)
|
||||
progress_bar.update(len(chunk)) # Update progress bar
|
||||
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import os
|
||||
import requests
|
||||
import torch
|
||||
import urllib.request
|
||||
import tiktoken
|
||||
|
||||
|
||||
@@ -141,14 +141,14 @@ def main(gpt_config, settings):
|
||||
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
|
||||
|
||||
if not os.path.exists(file_path):
|
||||
with urllib.request.urlopen(url) as response:
|
||||
text_data = response.read().decode('utf-8')
|
||||
response = requests.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
text_data = response.text
|
||||
with open(file_path, "w", encoding="utf-8") as file:
|
||||
file.write(text_data)
|
||||
else:
|
||||
with open(file_path, "r", encoding="utf-8") as file:
|
||||
text_data = file.read()
|
||||
|
||||
##############################
|
||||
# Initialize model
|
||||
##############################
|
||||
|
||||
@@ -7,9 +7,7 @@
|
||||
|
||||
import pytest
|
||||
from gpt_train import main
|
||||
import http.client
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
@pytest.fixture
|
||||
def gpt_config():
|
||||
@@ -43,23 +41,23 @@ def test_main(gpt_config, other_settings):
|
||||
|
||||
|
||||
def check_file_size(url, expected_size):
|
||||
parsed_url = urlparse(url)
|
||||
if parsed_url.scheme == "https":
|
||||
conn = http.client.HTTPSConnection(parsed_url.netloc)
|
||||
else:
|
||||
conn = http.client.HTTPConnection(parsed_url.netloc)
|
||||
try:
|
||||
response = requests.head(url, allow_redirects=True, timeout=30)
|
||||
if response.status_code != 200:
|
||||
return False, f"{url} not accessible"
|
||||
|
||||
conn.request("HEAD", parsed_url.path)
|
||||
response = conn.getresponse()
|
||||
if response.status != 200:
|
||||
return False, f"{url} not accessible"
|
||||
size = response.getheader("Content-Length")
|
||||
if size is None:
|
||||
return False, "Content-Length header is missing"
|
||||
size = int(size)
|
||||
if size != expected_size:
|
||||
return False, f"{url} file has expected size {expected_size}, but got {size}"
|
||||
return True, f"{url} file size is correct"
|
||||
size = response.headers.get("Content-Length")
|
||||
if size is None:
|
||||
return False, "Content-Length header is missing"
|
||||
|
||||
size = int(size)
|
||||
if size != expected_size:
|
||||
return False, f"{url} file has expected size {expected_size}, but got {size}"
|
||||
|
||||
return True, f"{url} file size is correct"
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
return False, f"Failed to access {url}: {e}"
|
||||
|
||||
|
||||
def test_model_files():
|
||||
|
||||
@@ -134,7 +134,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import urllib.request\n",
|
||||
"import requests\n",
|
||||
"from safetensors.torch import load_file\n",
|
||||
"\n",
|
||||
"URL_DIR = {\n",
|
||||
@@ -149,7 +149,10 @@
|
||||
"\n",
|
||||
"# Download file\n",
|
||||
"if not os.path.exists(output_file):\n",
|
||||
" urllib.request.urlretrieve(url, output_file)\n",
|
||||
" response = requests.get(url, timeout=30)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" with open(output_file, \"wb\") as f:\n",
|
||||
" f.write(response.content)\n",
|
||||
"\n",
|
||||
"# Load file\n",
|
||||
"state_dict = load_file(output_file)"
|
||||
|
||||
@@ -144,12 +144,15 @@
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import urllib.request\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"url = f\"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}\"\n",
|
||||
"\n",
|
||||
"if not os.path.exists(file_name):\n",
|
||||
" urllib.request.urlretrieve(url, file_name)\n",
|
||||
" response = requests.get(url, timeout=60)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" with open(file_name, \"wb\") as f:\n",
|
||||
" f.write(response.content)\n",
|
||||
" print(f\"Downloaded to {file_name}\")"
|
||||
]
|
||||
},
|
||||
@@ -276,12 +279,15 @@
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import urllib.request\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"url = f\"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}\"\n",
|
||||
"\n",
|
||||
"if not os.path.exists(file_name):\n",
|
||||
" urllib.request.urlretrieve(url, file_name)\n",
|
||||
" response = requests.get(url, timeout=60)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" with open(file_name, \"wb\") as f:\n",
|
||||
" f.write(response.content)\n",
|
||||
" print(f\"Downloaded to {file_name}\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -58,12 +58,17 @@ This automatically downloads the weight file based on the model choice above:
|
||||
|
||||
```python
|
||||
import os
|
||||
import urllib.request
|
||||
import requests
|
||||
|
||||
url = f"https://huggingface.co/rasbt/llama-3.2-from-scratch/resolve/main/{MODEL_FILE}"
|
||||
|
||||
if not os.path.exists(MODEL_FILE):
|
||||
urllib.request.urlretrieve(url, MODEL_FILE)
|
||||
response = requests.get(url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
with open(MODEL_FILE, "wb") as f:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
print(f"Downloaded to {MODEL_FILE}")
|
||||
```
|
||||
|
||||
|
||||
@@ -6,9 +6,9 @@
|
||||
|
||||
import os
|
||||
import time
|
||||
import urllib.request
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import requests
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
@@ -397,8 +397,9 @@ def main(gpt_config, settings):
|
||||
url = "https://www.gutenberg.org/cache/epub/145/pg145.txt"
|
||||
|
||||
if not os.path.exists(file_path):
|
||||
with urllib.request.urlopen(url) as response:
|
||||
text_data = response.read().decode('utf-8')
|
||||
response = requests.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
text_data = response.text
|
||||
with open(file_path, "w", encoding="utf-8") as file:
|
||||
file.write(text_data)
|
||||
else:
|
||||
|
||||
@@ -6,9 +6,9 @@
|
||||
|
||||
import os
|
||||
import time
|
||||
import urllib.request
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import requests
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
@@ -468,11 +468,11 @@ def main(gpt_config, settings, rank, world_size):
|
||||
# NEW: Only download 1 time
|
||||
if rank == 0:
|
||||
if not os.path.exists(file_path):
|
||||
with urllib.request.urlopen(url) as response:
|
||||
text_data = response.read().decode('utf-8')
|
||||
response = requests.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
text_data = response.text
|
||||
with open(file_path, "w", encoding="utf-8") as file:
|
||||
file.write(text_data)
|
||||
|
||||
# NEW: All processes wait until rank 0 is done, using the GPU index.
|
||||
torch.distributed.barrier(device_ids=[device.index])
|
||||
|
||||
|
||||
@@ -186,6 +186,56 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"import zipfile\n",
|
||||
"import os\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
|
||||
"zip_path = \"sms_spam_collection.zip\"\n",
|
||||
"extracted_path = \"sms_spam_collection\"\n",
|
||||
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):\n",
|
||||
" if data_file_path.exists():\n",
|
||||
" print(f\"{data_file_path} already exists. Skipping download and extraction.\")\n",
|
||||
" return\n",
|
||||
"\n",
|
||||
" # Downloading the file\n",
|
||||
" response = requests.get(url, stream=True, timeout=60)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" with open(zip_path, \"wb\") as out_file:\n",
|
||||
" for chunk in response.iter_content(chunk_size=8192):\n",
|
||||
" if chunk:\n",
|
||||
" out_file.write(chunk)\n",
|
||||
"\n",
|
||||
" # Unzipping the file\n",
|
||||
" with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n",
|
||||
" zip_ref.extractall(extracted_path)\n",
|
||||
"\n",
|
||||
" # Add .tsv file extension\n",
|
||||
" original_file_path = Path(extracted_path) / \"SMSSpamCollection\"\n",
|
||||
" os.rename(original_file_path, data_file_path)\n",
|
||||
" print(f\"File downloaded and saved as {data_file_path}\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||
"except (requests.exceptions.RequestException, TimeoutError) as e:\n",
|
||||
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
|
||||
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# The book originally used the following code below\n",
|
||||
"# However, urllib uses older protocol settings that\n",
|
||||
"# can cause problems for some readers using a VPN.\n",
|
||||
"# The `requests` version above is more robust\n",
|
||||
"# in that regard.\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
"import urllib.request\n",
|
||||
"import zipfile\n",
|
||||
"import os\n",
|
||||
@@ -220,7 +270,8 @@
|
||||
"except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
|
||||
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
|
||||
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) "
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
# This is a summary file containing the main takeaways from chapter 6.
|
||||
|
||||
import urllib.request
|
||||
import requests
|
||||
import zipfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
@@ -27,9 +27,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
|
||||
return
|
||||
|
||||
# Downloading the file
|
||||
with urllib.request.urlopen(url) as response:
|
||||
with open(zip_path, "wb") as out_file:
|
||||
out_file.write(response.read())
|
||||
response = requests.get(url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
with open(zip_path, "wb") as out_file:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
out_file.write(chunk)
|
||||
|
||||
# Unzipping the file
|
||||
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
||||
@@ -259,7 +262,7 @@ if __name__ == "__main__":
|
||||
|
||||
try:
|
||||
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
|
||||
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
|
||||
except (requests.exceptions.RequestException, TimeoutError) as e:
|
||||
print(f"Primary URL failed: {e}. Trying backup URL...")
|
||||
url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
|
||||
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
|
||||
|
||||
@@ -8,10 +8,10 @@ import math
|
||||
import os
|
||||
from pathlib import Path
|
||||
import time
|
||||
import urllib.request
|
||||
import zipfile
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
import tiktoken
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
@@ -113,9 +113,12 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path):
|
||||
return
|
||||
|
||||
# Downloading the file
|
||||
with urllib.request.urlopen(url) as response:
|
||||
with open(zip_path, "wb") as out_file:
|
||||
out_file.write(response.read())
|
||||
response = requests.get(url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
with open(zip_path, "wb") as out_file:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
out_file.write(chunk)
|
||||
|
||||
# Unzipping the file
|
||||
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
||||
@@ -608,11 +611,11 @@ if __name__ == "__main__":
|
||||
base_path = Path(".")
|
||||
file_names = ["train.csv", "validation.csv", "test.csv"]
|
||||
all_exist = all((base_path / file_name).exists() for file_name in file_names)
|
||||
|
||||
|
||||
if not all_exist:
|
||||
try:
|
||||
download_and_unzip(url, zip_path, extract_to, new_file_path)
|
||||
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
|
||||
except (requests.exceptions.RequestException, TimeoutError) as e:
|
||||
print(f"Primary URL failed: {e}. Trying backup URL...")
|
||||
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
|
||||
download_and_unzip(backup_url, zip_path, extract_to, new_file_path)
|
||||
|
||||
@@ -7,7 +7,7 @@ import os
|
||||
import sys
|
||||
import tarfile
|
||||
import time
|
||||
import urllib.request
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
|
||||
@@ -32,7 +32,15 @@ def download_and_extract_dataset(dataset_url, target_file, directory):
|
||||
if not os.path.exists(directory):
|
||||
if os.path.exists(target_file):
|
||||
os.remove(target_file)
|
||||
urllib.request.urlretrieve(dataset_url, target_file, reporthook)
|
||||
|
||||
response = requests.get(dataset_url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
|
||||
with open(target_file, "wb") as f:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
|
||||
print("\nExtracting dataset ...")
|
||||
with tarfile.open(target_file, "r:gz") as tar:
|
||||
tar.extractall()
|
||||
|
||||
@@ -7,7 +7,7 @@ import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import time
|
||||
import urllib
|
||||
import requests
|
||||
import zipfile
|
||||
|
||||
import pandas as pd
|
||||
@@ -62,9 +62,12 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path):
|
||||
return
|
||||
|
||||
# Downloading the file
|
||||
with urllib.request.urlopen(url) as response:
|
||||
with open(zip_path, "wb") as out_file:
|
||||
out_file.write(response.read())
|
||||
response = requests.get(url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
with open(zip_path, "wb") as out_file:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
out_file.write(chunk)
|
||||
|
||||
# Unzipping the file
|
||||
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
||||
@@ -412,7 +415,7 @@ if __name__ == "__main__":
|
||||
if not all_exist:
|
||||
try:
|
||||
download_and_unzip(url, zip_path, extract_to, new_file_path)
|
||||
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
|
||||
except (requests.exceptions.RequestException, TimeoutError) as e:
|
||||
print(f"Primary URL failed: {e}. Trying backup URL...")
|
||||
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
|
||||
download_and_unzip(backup_url, zip_path, extract_to, new_file_path)
|
||||
|
||||
@@ -169,10 +169,33 @@
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
"import urllib\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def download_and_load_file(file_path, url):\n",
|
||||
" if not os.path.exists(file_path):\n",
|
||||
" response = requests.get(url, timeout=30)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" text_data = response.text\n",
|
||||
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
|
||||
" file.write(text_data)\n",
|
||||
"\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" data = json.load(file)\n",
|
||||
"\n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# The book originally used the following code below\n",
|
||||
"# However, urllib uses older protocol settings that\n",
|
||||
"# can cause problems for some readers using a VPN.\n",
|
||||
"# The `requests` version above is more robust\n",
|
||||
"# in that regard.\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
"import urllib\n",
|
||||
"\n",
|
||||
"def download_and_load_file(file_path, url):\n",
|
||||
"\n",
|
||||
" if not os.path.exists(file_path):\n",
|
||||
" with urllib.request.urlopen(url) as response:\n",
|
||||
@@ -180,15 +203,15 @@
|
||||
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
|
||||
" file.write(text_data)\n",
|
||||
"\n",
|
||||
" # The book originally contained this unnecessary \"else\" clause:\n",
|
||||
" #else:\n",
|
||||
" # with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" # text_data = file.read()\n",
|
||||
" else:\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" text_data = file.read()\n",
|
||||
"\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" data = json.load(file)\n",
|
||||
"\n",
|
||||
" return data\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"file_path = \"instruction-data.json\"\n",
|
||||
@@ -2490,7 +2513,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import urllib.request\n",
|
||||
"import requests # noqa: F811\n",
|
||||
"# import urllib.request\n",
|
||||
"\n",
|
||||
"def query_model(\n",
|
||||
" prompt,\n",
|
||||
@@ -2512,7 +2536,8 @@
|
||||
" }\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" \"\"\"\n",
|
||||
" # Convert the dictionary to a JSON formatted string and encode it to bytes\n",
|
||||
" payload = json.dumps(data).encode(\"utf-8\")\n",
|
||||
"\n",
|
||||
@@ -2536,6 +2561,26 @@
|
||||
" response_data += response_json[\"message\"][\"content\"]\n",
|
||||
"\n",
|
||||
" return response_data\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" # The book originally used the commented-out code above, which is based\n",
|
||||
" # on urllib. It works generally fine, but some readers reported\n",
|
||||
" # issues with using urllib when using a (company) VPN.\n",
|
||||
" # The code below uses the requests library, which doesn't seem\n",
|
||||
" # to have these issues.\n",
|
||||
"\n",
|
||||
" # Send the POST request\n",
|
||||
" with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
|
||||
" r.raise_for_status()\n",
|
||||
" response_data = \"\"\n",
|
||||
" for line in r.iter_lines(decode_unicode=True):\n",
|
||||
" if not line:\n",
|
||||
" continue\n",
|
||||
" response_json = json.loads(line)\n",
|
||||
" if \"message\" in response_json:\n",
|
||||
" response_data += response_json[\"message\"][\"content\"]\n",
|
||||
"\n",
|
||||
" return response_data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"model = \"llama3\"\n",
|
||||
|
||||
@@ -12,10 +12,10 @@ import math
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import urllib
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.ticker import MaxNLocator
|
||||
import requests
|
||||
import tiktoken
|
||||
import torch
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
@@ -234,17 +234,17 @@ def custom_collate_with_masking_fn(
|
||||
|
||||
|
||||
def download_and_load_file(file_path, url):
|
||||
|
||||
if not os.path.exists(file_path):
|
||||
with urllib.request.urlopen(url) as response:
|
||||
text_data = response.read().decode("utf-8")
|
||||
response = requests.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
text_data = response.text
|
||||
with open(file_path, "w", encoding="utf-8") as file:
|
||||
file.write(text_data)
|
||||
else:
|
||||
with open(file_path, "r", encoding="utf-8") as file:
|
||||
text_data = file.read()
|
||||
|
||||
with open(file_path, "r") as file:
|
||||
with open(file_path, "r", encoding="utf-8") as file:
|
||||
data = json.load(file)
|
||||
|
||||
return data
|
||||
|
||||
@@ -5,11 +5,10 @@
|
||||
|
||||
|
||||
import os
|
||||
import urllib.request
|
||||
|
||||
# import requests
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
import tensorflow as tf
|
||||
from tqdm import tqdm
|
||||
|
||||
@@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir):
|
||||
|
||||
def download_file(url, destination, backup_url=None):
|
||||
def _attempt_download(download_url):
|
||||
with urllib.request.urlopen(download_url) as response:
|
||||
# Get the total file size from headers, defaulting to 0 if not present
|
||||
file_size = int(response.headers.get("Content-Length", 0))
|
||||
response = requests.get(download_url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
|
||||
# Check if file exists and has the same size
|
||||
if os.path.exists(destination):
|
||||
file_size_local = os.path.getsize(destination)
|
||||
if file_size == file_size_local:
|
||||
print(f"File already exists and is up-to-date: {destination}")
|
||||
return True # Indicate success without re-downloading
|
||||
file_size = int(response.headers.get("Content-Length", 0))
|
||||
|
||||
block_size = 1024 # 1 Kilobyte
|
||||
# Check if file exists and has same size
|
||||
if os.path.exists(destination):
|
||||
file_size_local = os.path.getsize(destination)
|
||||
if file_size and file_size == file_size_local:
|
||||
print(f"File already exists and is up-to-date: {destination}")
|
||||
return True
|
||||
|
||||
# Initialize the progress bar with total file size
|
||||
progress_bar_description = os.path.basename(download_url)
|
||||
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
|
||||
with open(destination, "wb") as file:
|
||||
while True:
|
||||
chunk = response.read(block_size)
|
||||
if not chunk:
|
||||
break
|
||||
block_size = 1024 # 1 KB
|
||||
desc = os.path.basename(download_url)
|
||||
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
|
||||
with open(destination, "wb") as file:
|
||||
for chunk in response.iter_content(chunk_size=block_size):
|
||||
if chunk:
|
||||
file.write(chunk)
|
||||
progress_bar.update(len(chunk))
|
||||
return True
|
||||
return True
|
||||
|
||||
try:
|
||||
if _attempt_download(url):
|
||||
return
|
||||
except (urllib.error.HTTPError, urllib.error.URLError):
|
||||
except requests.exceptions.RequestException:
|
||||
if backup_url is not None:
|
||||
print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
|
||||
try:
|
||||
if _attempt_download(backup_url):
|
||||
return
|
||||
except urllib.error.HTTPError:
|
||||
except requests.exceptions.RequestException:
|
||||
pass
|
||||
|
||||
# If we reach here, both attempts have failed
|
||||
error_message = (
|
||||
f"Failed to download from both primary URL ({url})"
|
||||
f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."
|
||||
@@ -97,37 +92,6 @@ def download_file(url, destination, backup_url=None):
|
||||
print(f"An unexpected error occurred: {e}")
|
||||
|
||||
|
||||
# Alternative way using `requests`
|
||||
"""
|
||||
def download_file(url, destination):
|
||||
# Send a GET request to download the file in streaming mode
|
||||
response = requests.get(url, stream=True)
|
||||
|
||||
# Get the total file size from headers, defaulting to 0 if not present
|
||||
file_size = int(response.headers.get("content-length", 0))
|
||||
|
||||
# Check if file exists and has the same size
|
||||
if os.path.exists(destination):
|
||||
file_size_local = os.path.getsize(destination)
|
||||
if file_size == file_size_local:
|
||||
print(f"File already exists and is up-to-date: {destination}")
|
||||
return
|
||||
|
||||
# Define the block size for reading the file
|
||||
block_size = 1024 # 1 Kilobyte
|
||||
|
||||
# Initialize the progress bar with total file size
|
||||
progress_bar_description = url.split("/")[-1] # Extract filename from URL
|
||||
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
|
||||
# Open the destination file in binary write mode
|
||||
with open(destination, "wb") as file:
|
||||
# Iterate over the file data in chunks
|
||||
for chunk in response.iter_content(block_size):
|
||||
progress_bar.update(len(chunk)) # Update progress bar
|
||||
file.write(chunk) # Write the chunk to the file
|
||||
"""
|
||||
|
||||
|
||||
def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
|
||||
# Initialize parameters dictionary with empty blocks for each layer
|
||||
params = {"blocks": [{} for _ in range(settings["n_layer"])]}
|
||||
|
||||
@@ -11,9 +11,9 @@ import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import urllib
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import requests
|
||||
import tiktoken
|
||||
import torch
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
@@ -97,14 +97,14 @@ def custom_collate_fn(
|
||||
|
||||
|
||||
def download_and_load_file(file_path, url):
|
||||
|
||||
if not os.path.exists(file_path):
|
||||
with urllib.request.urlopen(url) as response:
|
||||
text_data = response.read().decode("utf-8")
|
||||
response = requests.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
text_data = response.text
|
||||
with open(file_path, "w", encoding="utf-8") as file:
|
||||
file.write(text_data)
|
||||
|
||||
with open(file_path, "r") as file:
|
||||
with open(file_path, "r", encoding="utf-8") as file:
|
||||
data = json.load(file)
|
||||
|
||||
return data
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
import json
|
||||
import psutil
|
||||
from tqdm import tqdm
|
||||
import urllib.request
|
||||
import requests
|
||||
|
||||
|
||||
def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"):
|
||||
@@ -25,23 +25,16 @@ def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"):
|
||||
}
|
||||
}
|
||||
|
||||
# Convert the dictionary to a JSON formatted string and encode it to bytes
|
||||
payload = json.dumps(data).encode("utf-8")
|
||||
|
||||
# Create a request object, setting the method to POST and adding necessary headers
|
||||
request = urllib.request.Request(url, data=payload, method="POST")
|
||||
request.add_header("Content-Type", "application/json")
|
||||
|
||||
# Send the request and capture the response
|
||||
response_data = ""
|
||||
with urllib.request.urlopen(request) as response:
|
||||
# Read and decode the response
|
||||
while True:
|
||||
line = response.readline().decode("utf-8")
|
||||
# Send the POST request
|
||||
with requests.post(url, json=data, stream=True, timeout=30) as r:
|
||||
r.raise_for_status()
|
||||
response_data = ""
|
||||
for line in r.iter_lines(decode_unicode=True):
|
||||
if not line:
|
||||
break
|
||||
continue
|
||||
response_json = json.loads(line)
|
||||
response_data += response_json["message"]["content"]
|
||||
if "message" in response_json:
|
||||
response_data += response_json["message"]["content"]
|
||||
|
||||
return response_data
|
||||
|
||||
|
||||
@@ -215,8 +215,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import urllib.request\n",
|
||||
"import json\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def query_model(prompt, model=\"llama3\", url=\"http://localhost:11434/api/chat\"):\n",
|
||||
@@ -236,27 +236,19 @@
|
||||
" }\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" # Convert the dictionary to a JSON formatted string and encode it to bytes\n",
|
||||
" payload = json.dumps(data).encode(\"utf-8\")\n",
|
||||
"\n",
|
||||
" # Create a request object, setting the method to POST and adding necessary headers\n",
|
||||
" request = urllib.request.Request(url, data=payload, method=\"POST\")\n",
|
||||
" request.add_header(\"Content-Type\", \"application/json\")\n",
|
||||
"\n",
|
||||
" # Send the request and capture the response\n",
|
||||
" response_data = \"\"\n",
|
||||
" with urllib.request.urlopen(request) as response:\n",
|
||||
" # Read and decode the response\n",
|
||||
" while True:\n",
|
||||
" line = response.readline().decode(\"utf-8\")\n",
|
||||
" # Send the POST request\n",
|
||||
" with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
|
||||
" r.raise_for_status()\n",
|
||||
" response_data = \"\"\n",
|
||||
" for line in r.iter_lines(decode_unicode=True):\n",
|
||||
" if not line:\n",
|
||||
" break\n",
|
||||
" continue\n",
|
||||
" response_json = json.loads(line)\n",
|
||||
" response_data += response_json[\"message\"][\"content\"]\n",
|
||||
" if \"message\" in response_json:\n",
|
||||
" response_data += response_json[\"message\"][\"content\"]\n",
|
||||
"\n",
|
||||
" return response_data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"result = query_model(\"What do Llamas eat?\")\n",
|
||||
"print(result)"
|
||||
]
|
||||
@@ -640,7 +632,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
"version": "3.10.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -274,8 +274,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import urllib.request\n",
|
||||
"import json\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def query_model(prompt, model=\"llama3.1:70b\", url=\"http://localhost:11434/api/chat\"):\n",
|
||||
@@ -294,23 +294,16 @@
|
||||
" }\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" # Convert the dictionary to a JSON formatted string and encode it to bytes\n",
|
||||
" payload = json.dumps(data).encode(\"utf-8\")\n",
|
||||
"\n",
|
||||
" # Create a request object, setting the method to POST and adding necessary headers\n",
|
||||
" request = urllib.request.Request(url, data=payload, method=\"POST\")\n",
|
||||
" request.add_header(\"Content-Type\", \"application/json\")\n",
|
||||
"\n",
|
||||
" # Send the request and capture the response\n",
|
||||
" response_data = \"\"\n",
|
||||
" with urllib.request.urlopen(request) as response:\n",
|
||||
" # Read and decode the response\n",
|
||||
" while True:\n",
|
||||
" line = response.readline().decode(\"utf-8\")\n",
|
||||
" # Send the POST request\n",
|
||||
" with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
|
||||
" r.raise_for_status()\n",
|
||||
" response_data = \"\"\n",
|
||||
" for line in r.iter_lines(decode_unicode=True):\n",
|
||||
" if not line:\n",
|
||||
" break\n",
|
||||
" continue\n",
|
||||
" response_json = json.loads(line)\n",
|
||||
" response_data += response_json[\"message\"][\"content\"]\n",
|
||||
" if \"message\" in response_json:\n",
|
||||
" response_data += response_json[\"message\"][\"content\"]\n",
|
||||
"\n",
|
||||
" return response_data\n",
|
||||
"\n",
|
||||
@@ -587,7 +580,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
"version": "3.10.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -231,23 +231,21 @@
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
"import urllib\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def download_and_load_file(file_path, url):\n",
|
||||
"\n",
|
||||
" if not os.path.exists(file_path):\n",
|
||||
" with urllib.request.urlopen(url) as response:\n",
|
||||
" text_data = response.read().decode(\"utf-8\")\n",
|
||||
" response = requests.get(url, timeout=30)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" text_data = response.text\n",
|
||||
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
|
||||
" file.write(text_data)\n",
|
||||
" else:\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" text_data = file.read()\n",
|
||||
"\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" data = json.load(file)\n",
|
||||
"\n",
|
||||
" data = json.loads(text_data)\n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
|
||||
@@ -194,8 +194,8 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import urllib.request\n",
|
||||
"import json\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"def query_model(prompt, model=\"llama3\", url=\"http://localhost:11434/api/chat\", role=\"user\"):\n",
|
||||
" # Create the data payload as a dictionary\n",
|
||||
@@ -209,25 +209,21 @@
|
||||
" ]\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" # Convert the dictionary to a JSON formatted string and encode it to bytes\n",
|
||||
" payload = json.dumps(data).encode(\"utf-8\")\n",
|
||||
"\n",
|
||||
" # Create a request object, setting the method to POST and adding necessary headers\n",
|
||||
" request = urllib.request.Request(url, data=payload, method=\"POST\")\n",
|
||||
" request.add_header(\"Content-Type\", \"application/json\")\n",
|
||||
"\n",
|
||||
" # Send the request and capture the response\n",
|
||||
" response_data = \"\"\n",
|
||||
" with urllib.request.urlopen(request) as response:\n",
|
||||
" # Read and decode the response\n",
|
||||
" while True:\n",
|
||||
" line = response.readline().decode(\"utf-8\")\n",
|
||||
" # Send the POST request\n",
|
||||
" with requests.post(url, json=data, stream=True, timeout=30) as r:\n",
|
||||
" r.raise_for_status()\n",
|
||||
" response_data = \"\"\n",
|
||||
" for line in r.iter_lines(decode_unicode=True):\n",
|
||||
" if not line:\n",
|
||||
" break\n",
|
||||
" continue\n",
|
||||
" response_json = json.loads(line)\n",
|
||||
" response_data += response_json[\"message\"][\"content\"]\n",
|
||||
" if \"message\" in response_json:\n",
|
||||
" response_data += response_json[\"message\"][\"content\"]\n",
|
||||
"\n",
|
||||
" return response_data"
|
||||
" return response_data\n",
|
||||
"\n",
|
||||
"result = query_model(\"What do Llamas eat?\")\n",
|
||||
"print(result)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -498,7 +494,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.4"
|
||||
"version": "3.10.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -7,11 +7,11 @@ from .ch04 import generate_text_simple
|
||||
|
||||
import json
|
||||
import os
|
||||
import urllib.request
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.ticker import MaxNLocator
|
||||
import requests
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
@@ -279,44 +279,40 @@ def download_and_load_gpt2(model_size, models_dir):
|
||||
|
||||
def download_file(url, destination, backup_url=None):
|
||||
def _attempt_download(download_url):
|
||||
with urllib.request.urlopen(download_url) as response:
|
||||
# Get the total file size from headers, defaulting to 0 if not present
|
||||
file_size = int(response.headers.get("Content-Length", 0))
|
||||
response = requests.get(download_url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
|
||||
# Check if file exists and has the same size
|
||||
if os.path.exists(destination):
|
||||
file_size_local = os.path.getsize(destination)
|
||||
if file_size == file_size_local:
|
||||
print(f"File already exists and is up-to-date: {destination}")
|
||||
return True # Indicate success without re-downloading
|
||||
file_size = int(response.headers.get("Content-Length", 0))
|
||||
|
||||
block_size = 1024 # 1 Kilobyte
|
||||
# Check if file exists and has same size
|
||||
if os.path.exists(destination):
|
||||
file_size_local = os.path.getsize(destination)
|
||||
if file_size and file_size == file_size_local:
|
||||
print(f"File already exists and is up-to-date: {destination}")
|
||||
return True
|
||||
|
||||
# Initialize the progress bar with total file size
|
||||
progress_bar_description = os.path.basename(download_url)
|
||||
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
|
||||
with open(destination, "wb") as file:
|
||||
while True:
|
||||
chunk = response.read(block_size)
|
||||
if not chunk:
|
||||
break
|
||||
block_size = 1024 # 1 KB
|
||||
desc = os.path.basename(download_url)
|
||||
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
|
||||
with open(destination, "wb") as file:
|
||||
for chunk in response.iter_content(chunk_size=block_size):
|
||||
if chunk:
|
||||
file.write(chunk)
|
||||
progress_bar.update(len(chunk))
|
||||
return True
|
||||
return True
|
||||
|
||||
try:
|
||||
if _attempt_download(url):
|
||||
return
|
||||
except (urllib.error.HTTPError, urllib.error.URLError):
|
||||
except requests.exceptions.RequestException:
|
||||
if backup_url is not None:
|
||||
print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
|
||||
try:
|
||||
if _attempt_download(backup_url):
|
||||
return
|
||||
except urllib.error.HTTPError:
|
||||
except requests.exceptions.RequestException:
|
||||
pass
|
||||
|
||||
# If we reach here, both attempts have failed
|
||||
error_message = (
|
||||
f"Failed to download from both primary URL ({url})"
|
||||
f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."
|
||||
|
||||
@@ -4,11 +4,11 @@
|
||||
# Code: https://github.com/rasbt/LLMs-from-scratch
|
||||
|
||||
|
||||
import urllib.request
|
||||
import zipfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
import matplotlib.pyplot as plt
|
||||
from torch.utils.data import Dataset
|
||||
import torch
|
||||
@@ -21,9 +21,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
|
||||
return
|
||||
|
||||
# Downloading the file
|
||||
with urllib.request.urlopen(url) as response:
|
||||
with open(zip_path, "wb") as out_file:
|
||||
out_file.write(response.read())
|
||||
response = requests.get(url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
with open(zip_path, "wb") as out_file:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
out_file.write(chunk)
|
||||
|
||||
# Unzipping the file
|
||||
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
import json
|
||||
import os
|
||||
import psutil
|
||||
import urllib
|
||||
import requests
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
@@ -14,24 +14,46 @@ from torch.utils.data import Dataset
|
||||
|
||||
|
||||
def download_and_load_file(file_path, url):
|
||||
|
||||
if not os.path.exists(file_path):
|
||||
with urllib.request.urlopen(url) as response:
|
||||
text_data = response.read().decode("utf-8")
|
||||
response = requests.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
text_data = response.text
|
||||
with open(file_path, "w", encoding="utf-8") as file:
|
||||
file.write(text_data)
|
||||
|
||||
# The book originally contained this unnecessary "else" clause:
|
||||
# else:
|
||||
# with open(file_path, "r", encoding="utf-8") as file:
|
||||
# text_data = file.read()
|
||||
|
||||
with open(file_path, "r", encoding="utf-8") as file:
|
||||
data = json.load(file)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
# The book originally used the following code below
|
||||
# However, urllib uses older protocol settings that
|
||||
# can cause problems for some readers using a VPN.
|
||||
# The `requests` version above is more robust
|
||||
# in that regard.
|
||||
|
||||
|
||||
# import urllib
|
||||
|
||||
# def download_and_load_file(file_path, url):
|
||||
|
||||
# if not os.path.exists(file_path):
|
||||
# with urllib.request.urlopen(url) as response:
|
||||
# text_data = response.read().decode("utf-8")
|
||||
# with open(file_path, "w", encoding="utf-8") as file:
|
||||
# file.write(text_data)
|
||||
|
||||
# else:
|
||||
# with open(file_path, "r", encoding="utf-8") as file:
|
||||
# text_data = file.read()
|
||||
|
||||
# with open(file_path, "r", encoding="utf-8") as file:
|
||||
# data = json.load(file)
|
||||
|
||||
# return data
|
||||
|
||||
|
||||
def format_input(entry):
|
||||
instruction_text = (
|
||||
f"Below is an instruction that describes a task. "
|
||||
@@ -202,27 +224,16 @@ def query_model(
|
||||
}
|
||||
}
|
||||
|
||||
# Convert the dictionary to a JSON formatted string and encode it to bytes
|
||||
payload = json.dumps(data).encode("utf-8")
|
||||
|
||||
# Create a request object, setting the method to POST and adding necessary headers
|
||||
request = urllib.request.Request(
|
||||
url,
|
||||
data=payload,
|
||||
method="POST"
|
||||
)
|
||||
request.add_header("Content-Type", "application/json")
|
||||
|
||||
# Send the request and capture the response
|
||||
response_data = ""
|
||||
with urllib.request.urlopen(request) as response:
|
||||
# Read and decode the response
|
||||
while True:
|
||||
line = response.readline().decode("utf-8")
|
||||
# Send the POST request
|
||||
with requests.post(url, json=data, stream=True, timeout=30) as r:
|
||||
r.raise_for_status()
|
||||
response_data = ""
|
||||
for line in r.iter_lines(decode_unicode=True):
|
||||
if not line:
|
||||
break
|
||||
continue
|
||||
response_json = json.loads(line)
|
||||
response_data += response_json["message"]["content"]
|
||||
if "message" in response_json:
|
||||
response_data += response_json["message"]["content"]
|
||||
|
||||
return response_data
|
||||
|
||||
|
||||
@@ -6,9 +6,9 @@
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
@@ -660,7 +660,12 @@ def download_from_huggingface(repo_id, filename, local_dir, revision="main"):
|
||||
print(f"File already exists: {dest_path}")
|
||||
else:
|
||||
print(f"Downloading {url} to {dest_path}...")
|
||||
urllib.request.urlretrieve(url, dest_path)
|
||||
response = requests.get(url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
with open(dest_path, "wb") as f:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
|
||||
return dest_path
|
||||
|
||||
|
||||
@@ -12,9 +12,9 @@ from llms_from_scratch.ch06 import (
|
||||
from llms_from_scratch.appendix_e import replace_linear_with_lora
|
||||
|
||||
from pathlib import Path
|
||||
import urllib
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
import tiktoken
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, Subset
|
||||
@@ -35,7 +35,7 @@ def test_train_classifier_lora(tmp_path):
|
||||
download_and_unzip_spam_data(
|
||||
url, zip_path, extracted_path, data_file_path
|
||||
)
|
||||
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
|
||||
except (requests.exceptions.RequestException, TimeoutError) as e:
|
||||
print(f"Primary URL failed: {e}. Trying backup URL...")
|
||||
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
|
||||
download_and_unzip_spam_data(
|
||||
|
||||
@@ -6,8 +6,8 @@
|
||||
from llms_from_scratch.ch02 import create_dataloader_v1
|
||||
|
||||
import os
|
||||
import urllib.request
|
||||
|
||||
import requests
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
@@ -16,11 +16,17 @@ import torch
|
||||
def test_dataloader(tmp_path, file_name):
|
||||
|
||||
if not os.path.exists("the-verdict.txt"):
|
||||
url = ("https://raw.githubusercontent.com/rasbt/"
|
||||
"LLMs-from-scratch/main/ch02/01_main-chapter-code/"
|
||||
"the-verdict.txt")
|
||||
url = (
|
||||
"https://raw.githubusercontent.com/rasbt/"
|
||||
"LLMs-from-scratch/main/ch02/01_main-chapter-code/"
|
||||
"the-verdict.txt"
|
||||
)
|
||||
file_path = "the-verdict.txt"
|
||||
urllib.request.urlretrieve(url, file_path)
|
||||
|
||||
response = requests.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
with open(file_path, "wb") as f:
|
||||
f.write(response.content)
|
||||
|
||||
with open("the-verdict.txt", "r", encoding="utf-8") as f:
|
||||
raw_text = f.read()
|
||||
|
||||
@@ -8,8 +8,8 @@ from llms_from_scratch.ch04 import GPTModel, GPTModelFast
|
||||
from llms_from_scratch.ch05 import train_model_simple
|
||||
|
||||
import os
|
||||
import urllib
|
||||
|
||||
import requests
|
||||
import pytest
|
||||
import tiktoken
|
||||
import torch
|
||||
@@ -46,8 +46,9 @@ def test_train_simple(tmp_path, ModelClass):
|
||||
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
|
||||
|
||||
if not os.path.exists(file_path):
|
||||
with urllib.request.urlopen(url) as response:
|
||||
text_data = response.read().decode("utf-8")
|
||||
response = requests.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
text_data = response.text
|
||||
with open(file_path, "w", encoding="utf-8") as f:
|
||||
f.write(text_data)
|
||||
else:
|
||||
|
||||
@@ -11,8 +11,8 @@ from llms_from_scratch.ch06 import (
|
||||
)
|
||||
|
||||
from pathlib import Path
|
||||
import urllib
|
||||
|
||||
import requests
|
||||
import pandas as pd
|
||||
import tiktoken
|
||||
import torch
|
||||
@@ -34,7 +34,7 @@ def test_train_classifier(tmp_path):
|
||||
download_and_unzip_spam_data(
|
||||
url, zip_path, extracted_path, data_file_path
|
||||
)
|
||||
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
|
||||
except (requests.exceptions.RequestException, TimeoutError) as e:
|
||||
print(f"Primary URL failed: {e}. Trying backup URL...")
|
||||
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
|
||||
download_and_unzip_spam_data(
|
||||
|
||||
@@ -9,10 +9,9 @@ import ast
|
||||
import re
|
||||
import types
|
||||
from pathlib import Path
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
|
||||
import nbformat
|
||||
import requests
|
||||
|
||||
|
||||
def _extract_imports(src: str):
|
||||
@@ -125,21 +124,24 @@ def import_definitions_from_notebook(nb_dir_or_path, notebook_name=None, *, extr
|
||||
exec(src, mod.__dict__)
|
||||
return mod
|
||||
|
||||
|
||||
def download_file(url, out_dir="."):
|
||||
"""Simple file download utility for tests."""
|
||||
from pathlib import Path
|
||||
out_dir = Path(out_dir)
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
filename = Path(urllib.parse.urlparse(url).path).name
|
||||
filename = Path(url).name
|
||||
dest = out_dir / filename
|
||||
|
||||
|
||||
if dest.exists():
|
||||
return dest
|
||||
|
||||
|
||||
try:
|
||||
with urllib.request.urlopen(url) as response:
|
||||
with open(dest, 'wb') as f:
|
||||
f.write(response.read())
|
||||
response = requests.get(url, stream=True, timeout=30)
|
||||
response.raise_for_status()
|
||||
with open(dest, "wb") as f:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
return dest
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to download {url}: {e}")
|
||||
|
||||
Reference in New Issue
Block a user