mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
Switch from urllib to requests to improve reliability (#867)
* Switch from urllib to requests to improve reliability * Keep ruff linter-specific * update * update * update
This commit is contained in:
committed by
GitHub
parent
8552565bda
commit
7bd263144e
@@ -186,6 +186,56 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"import zipfile\n",
|
||||
"import os\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
|
||||
"zip_path = \"sms_spam_collection.zip\"\n",
|
||||
"extracted_path = \"sms_spam_collection\"\n",
|
||||
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):\n",
|
||||
" if data_file_path.exists():\n",
|
||||
" print(f\"{data_file_path} already exists. Skipping download and extraction.\")\n",
|
||||
" return\n",
|
||||
"\n",
|
||||
" # Downloading the file\n",
|
||||
" response = requests.get(url, stream=True, timeout=60)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" with open(zip_path, \"wb\") as out_file:\n",
|
||||
" for chunk in response.iter_content(chunk_size=8192):\n",
|
||||
" if chunk:\n",
|
||||
" out_file.write(chunk)\n",
|
||||
"\n",
|
||||
" # Unzipping the file\n",
|
||||
" with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n",
|
||||
" zip_ref.extractall(extracted_path)\n",
|
||||
"\n",
|
||||
" # Add .tsv file extension\n",
|
||||
" original_file_path = Path(extracted_path) / \"SMSSpamCollection\"\n",
|
||||
" os.rename(original_file_path, data_file_path)\n",
|
||||
" print(f\"File downloaded and saved as {data_file_path}\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||
"except (requests.exceptions.RequestException, TimeoutError) as e:\n",
|
||||
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
|
||||
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# The book originally used the code shown below.\n",
|
||||
"# However, urllib uses older protocol settings that\n",
|
||||
"# can cause problems for some readers using a VPN.\n",
|
||||
"# The `requests` version above is more robust\n",
|
||||
"# in that regard.\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
"import urllib.request\n",
|
||||
"import zipfile\n",
|
||||
"import os\n",
|
||||
@@ -220,7 +270,8 @@
|
||||
"except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
|
||||
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
|
||||
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) "
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user