mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
Switch from urllib to requests to improve reliability (#867)
* Switch from urllib to requests to improve reliability
* Keep ruff linter-specific
* update
* update
* update
This commit is contained in:
committed by
GitHub
parent
8552565bda
commit
7bd263144e
@@ -190,7 +190,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import urllib\n",
|
||||
"# import urllib\n",
|
||||
"import requests\n",
|
||||
"from pathlib import Path\n",
|
||||
"import pandas as pd\n",
|
||||
"from previous_chapters import (\n",
|
||||
@@ -215,13 +216,20 @@
|
||||
"extracted_path = \"sms_spam_collection\"\n",
|
||||
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||
"except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
|
||||
"except (requests.exceptions.RequestException, TimeoutError) as e:\n",
|
||||
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
|
||||
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
|
||||
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||
"\n",
|
||||
"# The book originally used\n",
|
||||
"# except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
|
||||
"# in the code above.\n",
|
||||
"# However, some VPN users reported issues with `urllib`, so the code was updated\n",
|
||||
"# to use `requests` instead\n",
|
||||
"\n",
|
||||
"df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
|
||||
"balanced_df = create_balanced_dataset(df)\n",
|
||||
"balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})\n",
|
||||
|
||||
@@ -9,12 +9,12 @@
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import urllib
|
||||
import zipfile
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import requests
|
||||
import tiktoken
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@@ -367,9 +367,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
|
||||
return
|
||||
|
||||
# Downloading the file
|
||||
with urllib.request.urlopen(url) as response:
|
||||
with open(zip_path, "wb") as out_file:
|
||||
out_file.write(response.read())
|
||||
response = requests.get(url, stream=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
with open(zip_path, "wb") as out_file:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
out_file.write(chunk)
|
||||
|
||||
# Unzipping the file
|
||||
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
||||
|
||||
Reference in New Issue
Block a user