mirror of
https://github.com/frankwxu/AI4DigitalForensics.git
synced 2026-04-10 11:23:42 +00:00
add lecture 9
This commit is contained in:
Binary file not shown.
@@ -0,0 +1,484 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "31ee256c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Breast cancer prediction"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "53af081c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.datasets import load_breast_cancer\n",
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"from sklearn.model_selection import train_test_split"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "536078f0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Load and preprocess breast cancer dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "06746e3c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\"\"\"Load and preprocess breast cancer dataset.\"\"\"\n",
|
||||
"# Load dataset\n",
|
||||
"data = load_breast_cancer()\n",
|
||||
"X, y = data.data, data.target"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3477485c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Understand inputs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "76d4d576",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(569, 30)"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "fddcc037",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,\n",
|
||||
" 3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,\n",
|
||||
" 8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,\n",
|
||||
" 3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,\n",
|
||||
" 1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01])"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X[0, :]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "070dcd69",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(569,)"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"y.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "c4632c29",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"np.int64(0)"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"y[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b74373cb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Split the dataset into training and test sets"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "0675a8c7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
||||
" X, y, test_size=0.2, random_state=1234\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "bfe70bd9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(455, 30)"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "a4df0052",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(114, 30)"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d597a997",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Scale features\n",
|
||||
"Scaling features, as done in the code with StandardScaler, transforms the input data so that each feature has a mean of 0 and a standard deviation of 1. This is also known as standardization. The purpose of scaling features in this context is to:\n",
|
||||
"\n",
|
||||
"- Improve Model Convergence: Many machine learning algorithms, including neural networks optimized with gradient-based methods like SGD, converge faster when features are on a similar scale. Unscaled features with different ranges can cause gradients to vary widely, slowing down or destabilizing training.\n",
|
||||
"- Ensure Fair Feature Influence: Features with larger numerical ranges could disproportionately influence the model compared to features with smaller ranges. Standardization puts all features on a comparable scale, so no feature dominates simply because of its units or magnitude.\n",
|
||||
"- Enhance Numerical Stability: Large or highly variable feature values can lead to numerical instability in computations, especially in deep learning frameworks like PyTorch. Scaling mitigates this risk."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "3aeb88da",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Scale features\n",
|
||||
"scaler = StandardScaler()\n",
|
||||
"X_train = scaler.fit_transform(X_train)\n",
|
||||
"X_test = scaler.transform(X_test)\n",
|
||||
"\n",
|
||||
"# Convert to PyTorch tensors\n",
|
||||
"X_train = torch.from_numpy(X_train.astype(np.float32))\n",
|
||||
"X_test = torch.from_numpy(X_test.astype(np.float32))\n",
|
||||
"y_train = torch.from_numpy(y_train.astype(np.float32)).view(-1, 1)\n",
|
||||
"y_test = torch.from_numpy(y_test.astype(np.float32)).view(-1, 1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "3b10079f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"torch.Size([455, 30])"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "13f4059c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"tensor([-0.3618, -0.2652, -0.3172, -0.4671, 1.8038, 1.1817, -0.5169, 0.1065,\n",
|
||||
" -0.3901, 1.3914, 0.1437, -0.1208, 0.1601, -0.1326, -0.5863, -0.1248,\n",
|
||||
" -0.5787, 0.1091, -0.2819, -0.1889, -0.2571, -0.2403, -0.2442, -0.3669,\n",
|
||||
" 0.5449, 0.2481, -0.7109, -0.0797, -0.5280, 0.2506])"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train[0,:]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b0b15d2f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Binary Classifier model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "e1b50a04",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class BinaryClassifier(nn.Module):\n",
|
||||
" \"\"\"Simple neural network for binary classification.\"\"\"\n",
|
||||
" def __init__(self, input_features):\n",
|
||||
" super(BinaryClassifier, self).__init__()\n",
|
||||
" self.linear = nn.Linear(input_features, 1)\n",
|
||||
" \n",
|
||||
" def forward(self, x):\n",
|
||||
" return torch.sigmoid(self.linear(x))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "49694959",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"torch.Size([455, 30])"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "14873622",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Show the binary classification model\n",
|
||||
"- the number of input features\n",
|
||||
"- the number of output features"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "466f6c41",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"BinaryClassifier(\n",
|
||||
" (linear): Linear(in_features=30, out_features=1, bias=True)\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"n_features = X_train.shape[1]\n",
|
||||
"model = BinaryClassifier(n_features)\n",
|
||||
"model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c66978b5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Train the model with given parameters.\n",
|
||||
"\n",
|
||||
"- forward pass: prediction\n",
|
||||
"- loss: error\n",
|
||||
"- autograd: weight change direction\n",
|
||||
"- stochastic gradient descent (optimizer): update weights\n",
|
||||
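"- optimizer.zero_grad(): clear the gradients left over from the previous iteration so they do not accumulate (called before `loss.backward()`)"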
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "1d1d7868",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Epoch [10/100], Loss: 0.4627\n",
|
||||
"Epoch [20/100], Loss: 0.4105\n",
|
||||
"Epoch [30/100], Loss: 0.3721\n",
|
||||
"Epoch [40/100], Loss: 0.3424\n",
|
||||
"Epoch [50/100], Loss: 0.3186\n",
|
||||
"Epoch [60/100], Loss: 0.2990\n",
|
||||
"Epoch [70/100], Loss: 0.2825\n",
|
||||
"Epoch [80/100], Loss: 0.2683\n",
|
||||
"Epoch [90/100], Loss: 0.2560\n",
|
||||
"Epoch [100/100], Loss: 0.2452\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"num_epochs=100\n",
|
||||
"learning_rate=0.01\n",
|
||||
"\n",
|
||||
"\"\"\"Train the model with given parameters.\"\"\"\n",
|
||||
"criterion = nn.BCELoss()\n",
|
||||
"optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)\n",
|
||||
"\n",
|
||||
"for epoch in range(num_epochs):\n",
|
||||
" # Forward pass\n",
|
||||
" y_pred = model(X_train)\n",
|
||||
" loss = criterion(y_pred, y_train)\n",
|
||||
" \n",
|
||||
" # Backward pass and optimization\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" \n",
|
||||
" # Log progress\n",
|
||||
" if (epoch + 1) % 10 == 0:\n",
|
||||
" print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1a59248d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Evaluate model performance on test set"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "eeddd812",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Test Accuracy: 0.8947\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with torch.no_grad():\n",
|
||||
" y_pred = model(X_test)\n",
|
||||
" y_pred_classes = y_pred.round() # Values > 0.5 round to 1, values < 0.5 round to 0 (torch.round is half-to-even, so exactly 0.5 rounds to 0)\n",
|
||||
" accuracy = y_pred_classes.eq(y_test).sum() / float(y_test.shape[0])\n",
|
||||
" print(f'\\nTest Accuracy: {accuracy:.4f}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1dc4fcd3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,146 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "53af081c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Training model...\n",
|
||||
"Epoch [10/100], Loss: 0.6247\n",
|
||||
"Epoch [20/100], Loss: 0.4940\n",
|
||||
"Epoch [30/100], Loss: 0.4156\n",
|
||||
"Epoch [40/100], Loss: 0.3641\n",
|
||||
"Epoch [50/100], Loss: 0.3277\n",
|
||||
"Epoch [60/100], Loss: 0.3005\n",
|
||||
"Epoch [70/100], Loss: 0.2794\n",
|
||||
"Epoch [80/100], Loss: 0.2624\n",
|
||||
"Epoch [90/100], Loss: 0.2483\n",
|
||||
"Epoch [100/100], Loss: 0.2364\n",
|
||||
"\n",
|
||||
"Test Accuracy: 0.9211\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.datasets import load_breast_cancer\n",
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"def prepare_data():\n",
|
||||
" \"\"\"Load and preprocess breast cancer dataset.\"\"\"\n",
|
||||
" # Load dataset\n",
|
||||
" data = load_breast_cancer()\n",
|
||||
" X, y = data.data, data.target\n",
|
||||
" \n",
|
||||
" # Split dataset\n",
|
||||
" X_train, X_test, y_train, y_test = train_test_split(\n",
|
||||
" X, y, test_size=0.2, random_state=1234\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" # Scale features\n",
|
||||
" scaler = StandardScaler()\n",
|
||||
" X_train = scaler.fit_transform(X_train)\n",
|
||||
" X_test = scaler.transform(X_test)\n",
|
||||
" \n",
|
||||
" # Convert to PyTorch tensors\n",
|
||||
" X_train = torch.from_numpy(X_train.astype(np.float32))\n",
|
||||
" X_test = torch.from_numpy(X_test.astype(np.float32))\n",
|
||||
" y_train = torch.from_numpy(y_train.astype(np.float32)).view(-1, 1)\n",
|
||||
" y_test = torch.from_numpy(y_test.astype(np.float32)).view(-1, 1)\n",
|
||||
" \n",
|
||||
" return X_train, X_test, y_train, y_test\n",
|
||||
"\n",
|
||||
"class BinaryClassifier(nn.Module):\n",
|
||||
" \"\"\"Simple neural network for binary classification.\"\"\"\n",
|
||||
" def __init__(self, input_features):\n",
|
||||
" super(BinaryClassifier, self).__init__()\n",
|
||||
" self.linear = nn.Linear(input_features, 1)\n",
|
||||
" \n",
|
||||
" def forward(self, x):\n",
|
||||
" return torch.sigmoid(self.linear(x))\n",
|
||||
"\n",
|
||||
"def train_model(model, X_train, y_train, num_epochs=100, learning_rate=0.01):\n",
|
||||
" \"\"\"Train the model with given parameters.\"\"\"\n",
|
||||
" criterion = nn.BCELoss()\n",
|
||||
" optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)\n",
|
||||
" \n",
|
||||
" for epoch in range(num_epochs):\n",
|
||||
" # Forward pass\n",
|
||||
" y_pred = model(X_train)\n",
|
||||
" loss = criterion(y_pred, y_train)\n",
|
||||
" \n",
|
||||
" # Backward pass and optimization\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" \n",
|
||||
" # Log progress\n",
|
||||
" if (epoch + 1) % 10 == 0:\n",
|
||||
" print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')\n",
|
||||
"\n",
|
||||
"def evaluate_model(model, X_test, y_test):\n",
|
||||
" \"\"\"Evaluate model performance on test set.\"\"\"\n",
|
||||
" with torch.no_grad():\n",
|
||||
" y_pred = model(X_test)\n",
|
||||
" y_pred_classes = y_pred.round()\n",
|
||||
" accuracy = y_pred_classes.eq(y_test).sum() / float(y_test.shape[0])\n",
|
||||
" return accuracy.item()\n",
|
||||
"\n",
|
||||
"def main():\n",
|
||||
" # Prepare data\n",
|
||||
" X_train, X_test, y_train, y_test = prepare_data()\n",
|
||||
" \n",
|
||||
" # Initialize model\n",
|
||||
" n_features = X_train.shape[1]\n",
|
||||
" model = BinaryClassifier(n_features)\n",
|
||||
" \n",
|
||||
" # Train model\n",
|
||||
" print(\"Training model...\")\n",
|
||||
" train_model(model, X_train, y_train)\n",
|
||||
" \n",
|
||||
" # Evaluate model\n",
|
||||
" accuracy = evaluate_model(model, X_test, y_test)\n",
|
||||
" print(f'\\nTest Accuracy: {accuracy:.4f}')\n",
|
||||
"\n",
|
||||
"main()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "76d4d576",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
BIN
lectures/07_binary_classification_n_to_1/1_breast_cancer_F1.pptx
Normal file
BIN
lectures/07_binary_classification_n_to_1/1_breast_cancer_F1.pptx
Normal file
Binary file not shown.
BIN
lectures/07_binary_classification_n_to_1/2_DataLoader_wine.pptx
Normal file
BIN
lectures/07_binary_classification_n_to_1/2_DataLoader_wine.pptx
Normal file
Binary file not shown.
209
lectures/07_binary_classification_n_to_1/2_dataloader_wine.ipynb
Normal file
209
lectures/07_binary_classification_n_to_1/2_dataloader_wine.ipynb
Normal file
@@ -0,0 +1,209 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "52950b67",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"First sample - Features: tensor([1.4230e+01, 1.7100e+00, 2.4300e+00, 1.5600e+01, 1.2700e+02, 2.8000e+00,\n",
|
||||
" 3.0600e+00, 2.8000e-01, 2.2900e+00, 5.6400e+00, 1.0400e+00, 3.9200e+00,\n",
|
||||
" 1.0650e+03]), Label: tensor([1.])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"import torchvision\n",
|
||||
"from torch.utils.data import Dataset, DataLoader\n",
|
||||
"import numpy as np\n",
|
||||
"import math\n",
|
||||
"\n",
|
||||
"# Custom Dataset class for Wine dataset\n",
|
||||
"class WineDataset(Dataset):\n",
|
||||
" def __init__(self, data_path='data/wine.csv'):\n",
|
||||
" \"\"\"\n",
|
||||
" Initialize the dataset by loading wine data from a CSV file.\n",
|
||||
" \n",
|
||||
" Args:\n",
|
||||
" data_path (str): Path to the wine CSV file\n",
|
||||
" \"\"\"\n",
|
||||
" # Load data from CSV, skipping header row\n",
|
||||
" xy = np.loadtxt(data_path, delimiter=',', dtype=np.float32, skiprows=1)\n",
|
||||
" self.n_samples = xy.shape[0]\n",
|
||||
" \n",
|
||||
" # Split into features (all columns except first) and labels (first column)\n",
|
||||
" self.x_data = torch.from_numpy(xy[:, 1:]) # Shape: [n_samples, n_features]\n",
|
||||
" self.y_data = torch.from_numpy(xy[:, [0]]) # Shape: [n_samples, 1]\n",
|
||||
"\n",
|
||||
" def __getitem__(self, index):\n",
|
||||
" \"\"\"\n",
|
||||
" Enable indexing to retrieve a specific sample.\n",
|
||||
" \n",
|
||||
" Args:\n",
|
||||
" index (int): Index of the sample to retrieve\n",
|
||||
" \n",
|
||||
" Returns:\n",
|
||||
" tuple: (features, label) for the specified index\n",
|
||||
" \"\"\"\n",
|
||||
" return self.x_data[index], self.y_data[index]\n",
|
||||
"\n",
|
||||
" def __len__(self):\n",
|
||||
" \"\"\"\n",
|
||||
" Return the total number of samples in the dataset.\n",
|
||||
" \n",
|
||||
" Returns:\n",
|
||||
" int: Number of samples\n",
|
||||
" \"\"\"\n",
|
||||
" return self.n_samples\n",
|
||||
"\n",
|
||||
"# Create dataset instance\n",
|
||||
"dataset = WineDataset()\n",
|
||||
"\n",
|
||||
"# Access and print first sample\n",
|
||||
"features, labels = dataset[0]\n",
|
||||
"print(f\"First sample - Features: {features}, Label: {labels}\")\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "5448f749",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Sample batch - Features: torch.Size([4, 13]), Labels: torch.Size([4, 1])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\"\"\"\n",
|
||||
"Create a DataLoader for the wine dataset.\n",
|
||||
"\n",
|
||||
"Args:\n",
|
||||
" dataset (Dataset): The dataset to load\n",
|
||||
" batch_size (int): Number of samples per batch\n",
|
||||
" shuffle (bool): Whether to shuffle the data\n",
|
||||
" num_workers (int): Number of subprocesses for data loading\n",
|
||||
" \n",
|
||||
"Returns:\n",
|
||||
" DataLoader: Configured DataLoader instance\n",
|
||||
"\"\"\"\n",
|
||||
"train_loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=0)\n",
|
||||
"\n",
|
||||
"# Examine one batch\n",
|
||||
"dataiter = iter(train_loader)\n",
|
||||
"features, labels = next(dataiter)\n",
|
||||
"print(f\"Sample batch - Features: {features.shape}, Labels: {labels.shape}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "0e122c46",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Total samples: 178, Iterations per epoch: 45\n",
|
||||
"Epoch: 1/2, Step 5/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 1/2, Step 10/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 1/2, Step 15/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 1/2, Step 20/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 1/2, Step 25/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 1/2, Step 30/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 1/2, Step 35/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 1/2, Step 40/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 1/2, Step 45/45 | Inputs torch.Size([2, 13]) | Labels torch.Size([2, 1])\n",
|
||||
"Epoch: 2/2, Step 5/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 2/2, Step 10/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 2/2, Step 15/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 2/2, Step 20/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 2/2, Step 25/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 2/2, Step 30/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 2/2, Step 35/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 2/2, Step 40/45 | Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])\n",
|
||||
"Epoch: 2/2, Step 45/45 | Inputs torch.Size([2, 13]) | Labels torch.Size([2, 1])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Training loop parameters\n",
|
||||
"num_epochs = 2\n",
|
||||
"total_samples = len(dataset)\n",
|
||||
"n_iterations = math.ceil(total_samples / 4)\n",
|
||||
"print(f\"Total samples: {total_samples}, Iterations per epoch: {n_iterations}\")\n",
|
||||
"\n",
|
||||
"# Dummy training loop\n",
|
||||
"for epoch in range(num_epochs):\n",
|
||||
" for i, (inputs, labels) in enumerate(train_loader):\n",
|
||||
" # Training step\n",
|
||||
" if (i + 1) % 5 == 0:\n",
|
||||
" print(f'Epoch: {epoch+1}/{num_epochs}, Step {i+1}/{n_iterations} | '\n",
|
||||
" f'Inputs {inputs.shape} | Labels {labels.shape}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "37095d28",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"MNIST batch - Inputs: torch.Size([3, 1, 28, 28]), Targets: torch.Size([3])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Example with MNIST dataset\n",
|
||||
"train_dataset = torchvision.datasets.MNIST(root='./data',\n",
|
||||
" train=True,\n",
|
||||
" transform=torchvision.transforms.ToTensor(),\n",
|
||||
" download=True)\n",
|
||||
"\n",
|
||||
"mnist_loader = DataLoader(dataset=train_dataset,\n",
|
||||
" batch_size=3,\n",
|
||||
" shuffle=True)\n",
|
||||
"\n",
|
||||
"# Examine MNIST batch\n",
|
||||
"dataiter = iter(mnist_loader)\n",
|
||||
"inputs, targets = next(dataiter)\n",
|
||||
"print(f\"MNIST batch - Inputs: {inputs.shape}, Targets: {targets.shape}\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user