mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
Switch from urllib to requests to improve reliability (#867)
* Switch from urllib to requests to improve reliability
* Keep ruff linter-specific
* update
* update
* update
This commit is contained in:
committed by
GitHub
parent
8552565bda
commit
7bd263144e
@@ -186,6 +186,56 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"import zipfile\n",
|
||||
"import os\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
|
||||
"zip_path = \"sms_spam_collection.zip\"\n",
|
||||
"extracted_path = \"sms_spam_collection\"\n",
|
||||
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):\n",
|
||||
" if data_file_path.exists():\n",
|
||||
" print(f\"{data_file_path} already exists. Skipping download and extraction.\")\n",
|
||||
" return\n",
|
||||
"\n",
|
||||
" # Downloading the file\n",
|
||||
" response = requests.get(url, stream=True, timeout=60)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" with open(zip_path, \"wb\") as out_file:\n",
|
||||
" for chunk in response.iter_content(chunk_size=8192):\n",
|
||||
" if chunk:\n",
|
||||
" out_file.write(chunk)\n",
|
||||
"\n",
|
||||
" # Unzipping the file\n",
|
||||
" with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n",
|
||||
" zip_ref.extractall(extracted_path)\n",
|
||||
"\n",
|
||||
" # Add .tsv file extension\n",
|
||||
" original_file_path = Path(extracted_path) / \"SMSSpamCollection\"\n",
|
||||
" os.rename(original_file_path, data_file_path)\n",
|
||||
" print(f\"File downloaded and saved as {data_file_path}\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||
"except (requests.exceptions.RequestException, TimeoutError) as e:\n",
|
||||
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
|
||||
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# The book originally used the code below.\n",
|
||||
"# However, urllib uses older protocol settings that\n",
|
||||
"# can cause problems for some readers using a VPN.\n",
|
||||
"# The `requests` version above is more robust\n",
|
||||
"# in that regard.\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
"import urllib.request\n",
|
||||
"import zipfile\n",
|
||||
"import os\n",
|
||||
@@ -220,7 +270,8 @@
|
||||
"except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
|
||||
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
|
||||
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) "
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
# This is a summary file containing the main takeaways from chapter 6.
|
||||
|
||||
import urllib.request
|
||||
import requests
|
||||
import zipfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
@@ -27,9 +27,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
|
||||
return
|
||||
|
||||
# Downloading the file
|
||||
with urllib.request.urlopen(url) as response:
|
||||
with open(zip_path, "wb") as out_file:
|
||||
out_file.write(response.read())
|
||||
response = requests.get(url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
with open(zip_path, "wb") as out_file:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
out_file.write(chunk)
|
||||
|
||||
# Unzipping the file
|
||||
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
||||
@@ -259,7 +262,7 @@ if __name__ == "__main__":
|
||||
|
||||
try:
|
||||
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
|
||||
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
|
||||
except (requests.exceptions.RequestException, TimeoutError) as e:
|
||||
print(f"Primary URL failed: {e}. Trying backup URL...")
|
||||
url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
|
||||
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
|
||||
|
||||
Reference in New Issue
Block a user