From 3c5b288ca0e962702ac3af5ffe3e60f341ef8be0 Mon Sep 17 00:00:00 2001 From: rasbt Date: Thu, 28 Mar 2024 08:02:05 -0500 Subject: [PATCH] minor typo fixes --- ch05/01_main-chapter-code/ch05.ipynb | 166 +++++++++++++++------------ 1 file changed, 91 insertions(+), 75 deletions(-) diff --git a/ch05/01_main-chapter-code/ch05.ipynb b/ch05/01_main-chapter-code/ch05.ipynb index 8be4de4..ae42271 100644 --- a/ch05/01_main-chapter-code/ch05.ipynb +++ b/ch05/01_main-chapter-code/ch05.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "92b989e9-da36-4159-b212-799184764dd9", "metadata": {}, "outputs": [ @@ -119,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "86000d74-624a-48f0-86da-f41926cb9e04", "metadata": { "colab": { @@ -180,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "5e062b82-3540-48ce-8eb4-009686d0d16c", "metadata": {}, "outputs": [ @@ -260,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "6b5402f8-ec0c-4a44-9892-18a97779ee4f", "metadata": { "colab": { @@ -290,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "e7b6ec51-6f8c-49bd-a349-95ba38b46fb6", "metadata": {}, "outputs": [ @@ -345,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "34ebd76a-16ec-4c17-8958-8a135735cc1c", "metadata": { "colab": { @@ -385,7 +385,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "c990ead6-53cd-49a7-a6d1-14d8c1518249", "metadata": {}, "outputs": [ @@ -430,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "54aef09c-d6e3-4238-8653-b3a1b0a1077a", "metadata": { "colab": { @@ -470,7 +470,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "31402a67-a16e-4aeb-977e-70abb9c9949b", "metadata": { "colab": { @@ -504,7 
+504,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "9b003797-161b-4d98-81dc-e68320e09fec", "metadata": { "colab": { @@ -548,7 +548,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "176ddf35-1c5f-4d7c-bf17-70f3e7069bd4", "metadata": {}, "outputs": [ @@ -591,7 +591,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "695d6f64-5084-4c23-aea4-105c9e38cfe4", "metadata": { "colab": { @@ -628,7 +628,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "0e17e027-ab9f-4fb5-ac9b-a009b831c122", "metadata": { "colab": { @@ -666,7 +666,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "62d0816e-b29a-4c8f-a9a5-a167562de978", "metadata": { "colab": { @@ -700,7 +700,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "168952a1-b964-4aa7-8e49-966fa26add54", "metadata": { "colab": { @@ -764,7 +764,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "654fde37-b2a9-4a20-a8d3-0206c056e2ff", "metadata": {}, "outputs": [], @@ -795,7 +795,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "6kgJbe4ehI4q", "metadata": { "colab": { @@ -821,7 +821,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "j2XPde_ThM_e", "metadata": { "colab": { @@ -847,7 +847,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "6b46a952-d50a-4837-af09-4095698f7fd1", "metadata": { "colab": { @@ -903,7 +903,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "0959c855-f860-4358-8b98-bc654f047578", "metadata": {}, "outputs": [], @@ -940,7 +940,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "f37b3eb0-854e-4895-9898-fa7d1e67566e", "metadata": {}, "outputs": [], @@ -977,7 +977,7 @@ }, { 
"cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "ca0116d0-d229-472c-9fbf-ebc229331c3e", "metadata": {}, "outputs": [ @@ -1021,7 +1021,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "id": "eb860488-5453-41d7-9870-23b723f742a0", "metadata": { "colab": { @@ -1066,7 +1066,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "id": "7b9de31e-4096-47b3-976d-b6d2fdce04bc", "metadata": { "id": "7b9de31e-4096-47b3-976d-b6d2fdce04bc" @@ -1110,7 +1110,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "id": "56f5b0c9-1065-4d67-98b9-010e42fc1e2a", "metadata": {}, "outputs": [ @@ -1178,8 +1178,7 @@ " eval_freq, eval_iter, start_context):\n", " # Initialize lists to track losses and tokens seen\n", " train_losses, val_losses, track_tokens_seen = [], [], []\n", - " tokens_seen = 0\n", - " global_step = -1\n", + " tokens_seen, global_step = 0, -1\n", "\n", " # Main training loop\n", " for epoch in range(num_epochs):\n", @@ -1408,7 +1407,7 @@ "metadata": {}, "source": [ "- Inference is relatively cheap with a relatively small LLM as the GPT model we trained above, so there's no need to use a GPU for it in case you used a GPU for training it above\n", - "- Using the `generate_text_simple method` (from the previous chapter) that we used earlier inside the simple training function, we can generate new text one word (or token) at a time\n", + "- Using the `generate_text_simple` function (from the previous chapter) that we used earlier inside the simple training function, we can generate new text one word (or token) at a time\n", "- As explained in section 5.1.2, the next generated token is the token corresponding to the largest probability score among all tokens in the vocabulary" ] }, @@ -1498,8 +1497,6 @@ } ], "source": [ - "# Assume some logits from a neural network output for 7 vocabulary tokens\n", - "\n", "vocab = { \n", " \"closer\": 0,\n", " \"every\": 1, \n", 
@@ -1527,12 +1524,74 @@ "print(inverse_vocab[next_token_id])" ] }, + { + "cell_type": "code", + "execution_count": 36, + "id": "6400572f-b3c8-49e2-95bc-433e55c5b3a1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "forward\n" + ] + } + ], + "source": [ + "torch.manual_seed(123)\n", + "next_token_id = torch.multinomial(probas, num_samples=1).item()\n", + "print(inverse_vocab[next_token_id])" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "b23b863e-252a-403c-b5b1-62bc0a42319f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "73 x closer\n", + "0 x every\n", + "0 x effort\n", + "582 x forward\n", + "2 x inches\n", + "0 x moves\n", + "0 x pizza\n", + "343 x toward\n" + ] + } + ], + "source": [ + "def print_sampled_tokens(probas):\n", + " torch.manual_seed(123) # Manual seed for reproducibility\n", + " sample = [torch.multinomial(probas, num_samples=1).item() for i in range(1_000)]\n", + " sampled_ids = torch.bincount(torch.tensor(sample))\n", + " for i, freq in enumerate(sampled_ids):\n", + " print(f\"{freq} x {inverse_vocab[i]}\")\n", + "\n", + "print_sampled_tokens(probas)" + ] + }, + { + "cell_type": "markdown", + "id": "c63d0a27-830b-42b5-9986-6d1a7de04dd9", + "metadata": {}, + "source": [ + "- Instead of determining the most likely token via `torch.argmax`, we use `torch.multinomial(probas, num_samples=1)` to determine the next token by sampling from the softmax distribution\n", + "- For illustration purposes, let's see what happens when we sample the next token 1,000 times using the original softmax probabilities:" + ] + }, { "cell_type": "markdown", "id": "32e7d9cf-a26d-4d9a-8664-4af1efa73832", "metadata": {}, "source": [ - "- \"Temperature scaling\" is just a fancy word for diving the logits by a number greater than 0\n", + "- We can control the distribution and selection process via a concept called temperature scaling\n", + "-
\"Temperature scaling\" is just a fancy word for dividing the logits by a number greater than 0\n", "- Temperatures greater than 1 will result in more uniformly distributed token probabilities after applying the softmax\n", "- Temperatures smaller than 1 will result in more confident (sharper or more peaky) distributions after applying the softmax" ] @@ -1549,7 +1608,7 @@ " return torch.softmax(scaled_logits, dim=0)\n", "\n", "# Temperature values\n", - "temperatures = [1, 0.1, 5] # Original, higher confidence, and\n", + "temperatures = [1, 0.1, 5] # Original, higher confidence, and lower confidence\n", "\n", "# Calculate scaled probabilities\n", "scaled_probas = [softmax_with_temperature(next_token_logits, T) for T in temperatures]" @@ -1591,49 +1650,6 @@ "plt.show()" ] }, - { - "cell_type": "markdown", - "id": "c63d0a27-830b-42b5-9986-6d1a7de04dd9", - "metadata": {}, - "source": [ - "- Instead of determining the most likely token via `torch.argmax`, we use `torch.multinomial(probas, num_samples=1)` to determine the most likely token by sampling from the softmax distribution\n", - "- For illustration purposes, let's see what happens when we sample the next token 1,000 times using the original softmax probabilities:" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "b23b863e-252a-403c-b5b1-62bc0a42319f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "73 x closer\n", - "0 x every\n", - "0 x effort\n", - "582 x forward\n", - "2 x inches\n", - "0 x moves\n", - "0 x pizza\n", - "343 x toward\n" - ] - } - ], - "source": [ - "# Original probas\n", - "\n", - "def print_sampled_tokens(probas):\n", - " torch.manual_seed(123)\n", - " sample = [torch.multinomial(probas, num_samples=1).item() for i in range(1_000)]\n", - " sampled_ids = torch.bincount(torch.tensor(sample))\n", - " for i, freq in enumerate(sampled_ids):\n", - " print(f\"{freq} x {inverse_vocab[i]}\")\n", - "\n", - 
"print_sampled_tokens(probas)" - ] - }, { "cell_type": "markdown", "id": "d750e989-842a-4cfa-a44b-cf44d6e49163",