Switch from urllib to requests to improve reliability (#867)

* Switch from urllib to requests to improve reliability

* Keep ruff linter-specific

* update

* update

* update
This commit is contained in:
Sebastian Raschka
2025-10-07 15:22:59 -05:00
committed by GitHub
parent 8552565bda
commit 7bd263144e
47 changed files with 592 additions and 436 deletions

View File

@@ -190,7 +190,8 @@
}
],
"source": [
"import urllib\n",
"# import urllib\n",
"import requests\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"from previous_chapters import (\n",
@@ -215,13 +216,20 @@
"extracted_path = \"sms_spam_collection\"\n",
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
"\n",
"\n",
"try:\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
"except (requests.exceptions.RequestException, TimeoutError) as e:\n",
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
"\n",
"# The book originally used\n",
"# except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
"# in the code above.\n",
"# However, some VPN users reported issues with `urllib`, so the code was updated\n",
"# to use `requests` instead\n",
"\n",
"df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
"balanced_df = create_balanced_dataset(df)\n",
"balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})\n",

View File

@@ -9,12 +9,12 @@
import os
from pathlib import Path
import urllib
import zipfile
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import tiktoken
import torch
import torch.nn as nn
@@ -367,9 +367,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
return
# Downloading the file
with urllib.request.urlopen(url) as response:
with open(zip_path, "wb") as out_file:
out_file.write(response.read())
response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
with open(zip_path, "wb") as out_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
out_file.write(chunk)
# Unzipping the file
with zipfile.ZipFile(zip_path, "r") as zip_ref: