From 3c5b288ca0e962702ac3af5ffe3e60f341ef8be0 Mon Sep 17 00:00:00 2001
From: rasbt <mail@sebastianraschka.com>
Date: Thu, 28 Mar 2024 08:02:05 -0500
Subject: [PATCH] minor typo fixes

---
 ch05/01_main-chapter-code/ch05.ipynb | 166 +++++++++++++++------------
 1 file changed, 91 insertions(+), 75 deletions(-)

diff --git a/ch05/01_main-chapter-code/ch05.ipynb b/ch05/01_main-chapter-code/ch05.ipynb
index 8be4de4..ae42271 100644
--- a/ch05/01_main-chapter-code/ch05.ipynb
+++ b/ch05/01_main-chapter-code/ch05.ipynb
@@ -23,7 +23,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "92b989e9-da36-4159-b212-799184764dd9",
    "metadata": {},
    "outputs": [
@@ -119,7 +119,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "86000d74-624a-48f0-86da-f41926cb9e04",
    "metadata": {
     "colab": {
@@ -180,7 +180,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "5e062b82-3540-48ce-8eb4-009686d0d16c",
    "metadata": {},
    "outputs": [
@@ -260,7 +260,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "6b5402f8-ec0c-4a44-9892-18a97779ee4f",
    "metadata": {
     "colab": {
@@ -290,7 +290,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "id": "e7b6ec51-6f8c-49bd-a349-95ba38b46fb6",
    "metadata": {},
    "outputs": [
@@ -345,7 +345,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "34ebd76a-16ec-4c17-8958-8a135735cc1c",
    "metadata": {
     "colab": {
@@ -385,7 +385,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "id": "c990ead6-53cd-49a7-a6d1-14d8c1518249",
    "metadata": {},
    "outputs": [
@@ -430,7 +430,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "id": "54aef09c-d6e3-4238-8653-b3a1b0a1077a",
    "metadata": {
     "colab": {
@@ -470,7 +470,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "id": "31402a67-a16e-4aeb-977e-70abb9c9949b",
    "metadata": {
     "colab": {
@@ -504,7 +504,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "9b003797-161b-4d98-81dc-e68320e09fec",
    "metadata": {
     "colab": {
@@ -548,7 +548,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "176ddf35-1c5f-4d7c-bf17-70f3e7069bd4",
    "metadata": {},
    "outputs": [
@@ -591,7 +591,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "id": "695d6f64-5084-4c23-aea4-105c9e38cfe4",
    "metadata": {
     "colab": {
@@ -628,7 +628,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "id": "0e17e027-ab9f-4fb5-ac9b-a009b831c122",
    "metadata": {
     "colab": {
@@ -666,7 +666,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "id": "62d0816e-b29a-4c8f-a9a5-a167562de978",
    "metadata": {
     "colab": {
@@ -700,7 +700,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "id": "168952a1-b964-4aa7-8e49-966fa26add54",
    "metadata": {
     "colab": {
@@ -764,7 +764,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "id": "654fde37-b2a9-4a20-a8d3-0206c056e2ff",
    "metadata": {},
    "outputs": [],
@@ -795,7 +795,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "id": "6kgJbe4ehI4q",
    "metadata": {
     "colab": {
@@ -821,7 +821,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "id": "j2XPde_ThM_e",
    "metadata": {
     "colab": {
@@ -847,7 +847,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 20,
    "id": "6b46a952-d50a-4837-af09-4095698f7fd1",
    "metadata": {
     "colab": {
@@ -903,7 +903,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 21,
    "id": "0959c855-f860-4358-8b98-bc654f047578",
    "metadata": {},
    "outputs": [],
@@ -940,7 +940,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 22,
    "id": "f37b3eb0-854e-4895-9898-fa7d1e67566e",
    "metadata": {},
    "outputs": [],
@@ -977,7 +977,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 23,
    "id": "ca0116d0-d229-472c-9fbf-ebc229331c3e",
    "metadata": {},
    "outputs": [
@@ -1021,7 +1021,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 24,
    "id": "eb860488-5453-41d7-9870-23b723f742a0",
    "metadata": {
     "colab": {
@@ -1066,7 +1066,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 25,
    "id": "7b9de31e-4096-47b3-976d-b6d2fdce04bc",
    "metadata": {
     "id": "7b9de31e-4096-47b3-976d-b6d2fdce04bc"
@@ -1110,7 +1110,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 26,
    "id": "56f5b0c9-1065-4d67-98b9-010e42fc1e2a",
    "metadata": {},
    "outputs": [
@@ -1178,8 +1178,7 @@
     "                       eval_freq, eval_iter, start_context):\n",
     "    # Initialize lists to track losses and tokens seen\n",
     "    train_losses, val_losses, track_tokens_seen = [], [], []\n",
-    "    tokens_seen = 0\n",
-    "    global_step = -1\n",
+    "    tokens_seen, global_step = 0, -1\n",
     "\n",
     "    # Main training loop\n",
     "    for epoch in range(num_epochs):\n",
@@ -1408,7 +1407,7 @@
    "metadata": {},
    "source": [
     "- Inference is relatively cheap with a relatively small LLM as the GPT model we trained above, so there's no need to use a GPU for it in case you used a GPU for training it above\n",
-    "- Using the `generate_text_simple method` (from the previous chapter) that we used earlier inside the simple training function, we can generate new text one word (or token) at a time\n",
+    "- Using the `generate_text_simple` function (from the previous chapter) that we used earlier inside the simple training function, we can generate new text one word (or token) at a time\n",
     "- As explained in section 5.1.2, the next generated token is the token corresponding to the largest probability score among all tokens in the vocabulary"
    ]
   },
@@ -1498,8 +1497,6 @@
     }
    ],
    "source": [
-    "# Assume some logits from a neural network output for 7 vocabulary tokens\n",
-    "\n",
     "vocab = { \n",
     "    \"closer\": 0,\n",
     "    \"every\": 1, \n",
@@ -1527,12 +1524,74 @@
     "print(inverse_vocab[next_token_id])"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "6400572f-b3c8-49e2-95bc-433e55c5b3a1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "forward\n"
+     ]
+    }
+   ],
+   "source": [
+    "torch.manual_seed(123)\n",
+    "next_token_id = torch.multinomial(probas, num_samples=1).item()\n",
+    "print(inverse_vocab[next_token_id])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "b23b863e-252a-403c-b5b1-62bc0a42319f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "73 x closer\n",
+      "0 x every\n",
+      "0 x effort\n",
+      "582 x forward\n",
+      "2 x inches\n",
+      "0 x moves\n",
+      "0 x pizza\n",
+      "343 x toward\n"
+     ]
+    }
+   ],
+   "source": [
+    "def print_sampled_tokens(probas):\n",
+    "    torch.manual_seed(123) # Manual seed for reproducibility\n",
+    "    sample = [torch.multinomial(probas, num_samples=1).item() for i in range(1_000)]\n",
+    "    sampled_ids = torch.bincount(torch.tensor(sample))\n",
+    "    for i, freq in enumerate(sampled_ids):\n",
+    "        print(f\"{freq} x {inverse_vocab[i]}\")\n",
+    "\n",
+    "print_sampled_tokens(probas)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c63d0a27-830b-42b5-9986-6d1a7de04dd9",
+   "metadata": {},
+   "source": [
+    "- Instead of determining the most likely token via `torch.argmax`, we use `torch.multinomial(probas, num_samples=1)` to determine the next token by sampling from the softmax distribution\n",
+    "- For illustration purposes, let's see what happens when we sample the next token 1,000 times using the original softmax probabilities:"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "32e7d9cf-a26d-4d9a-8664-4af1efa73832",
    "metadata": {},
    "source": [
-    "- \"Temperature scaling\" is just a fancy word for diving the logits by a number greater than 0\n",
+    "- We can control the distribution and selection process via a concept called temperature scaling\n",
+    "- \"Temperature scaling\" is just a fancy word for dividing the logits by a number greater than 0\n",
     "- Temperatures greater than 1 will result in more uniformly distributed token probabilities after applying the softmax\n",
     "- Temperatures smaller than 1 will result in more confident (sharper or more peaky) distributions after applying the softmax"
    ]
@@ -1549,7 +1608,7 @@
     "    return torch.softmax(scaled_logits, dim=0)\n",
     "\n",
     "# Temperature values\n",
-    "temperatures = [1, 0.1, 5]  # Original, higher confidence, and\n",
+    "temperatures = [1, 0.1, 5]  # Original, higher confidence, and lower confidence\n",
     "\n",
     "# Calculate scaled probabilities\n",
     "scaled_probas = [softmax_with_temperature(next_token_logits, T) for T in temperatures]"
@@ -1591,49 +1650,6 @@
     "plt.show()"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "c63d0a27-830b-42b5-9986-6d1a7de04dd9",
-   "metadata": {},
-   "source": [
-    "- Instead of determining the most likely token via `torch.argmax`, we use `torch.multinomial(probas, num_samples=1)` to determine the most likely token by sampling from the softmax distribution\n",
-    "- For illustration purposes, let's see what happens when we sample the next token 1,000 times using the original softmax probabilities:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "id": "b23b863e-252a-403c-b5b1-62bc0a42319f",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "73 x closer\n",
-      "0 x every\n",
-      "0 x effort\n",
-      "582 x forward\n",
-      "2 x inches\n",
-      "0 x moves\n",
-      "0 x pizza\n",
-      "343 x toward\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Original probas\n",
-    "\n",
-    "def print_sampled_tokens(probas):\n",
-    "    torch.manual_seed(123)\n",
-    "    sample = [torch.multinomial(probas, num_samples=1).item() for i in range(1_000)]\n",
-    "    sampled_ids = torch.bincount(torch.tensor(sample))\n",
-    "    for i, freq in enumerate(sampled_ids):\n",
-    "        print(f\"{freq} x {inverse_vocab[i]}\")\n",
-    "\n",
-    "print_sampled_tokens(probas)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "d750e989-842a-4cfa-a44b-cf44d6e49163",