diff --git a/ch05/01_main-chapter-code/ch05.ipynb b/ch05/01_main-chapter-code/ch05.ipynb index fc6df9d..d33d4d8 100644 --- a/ch05/01_main-chapter-code/ch05.ipynb +++ b/ch05/01_main-chapter-code/ch05.ipynb @@ -198,7 +198,7 @@ "from previous_chapters import generate_text_simple\n", "\n", "def text_to_token_ids(text, tokenizer):\n", - " encoded = tokenizer.encode(text)\n", + " encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})\n", " encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension\n", " return encoded_tensor\n", "\n", @@ -430,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "id": "54aef09c-d6e3-4238-8653-b3a1b0a1077a", "metadata": { "colab": { @@ -470,7 +470,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 9, "id": "31402a67-a16e-4aeb-977e-70abb9c9949b", "metadata": { "colab": { @@ -504,7 +504,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 10, "id": "9b003797-161b-4d98-81dc-e68320e09fec", "metadata": { "colab": { diff --git a/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py b/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py index 3fcf0d0..cf3d31b 100644 --- a/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py +++ b/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py @@ -305,11 +305,11 @@ def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, output_dir): def text_to_token_ids(text, tokenizer): - encoded = tokenizer.encode(text) - encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension + encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'}) + encoded_tensor = torch.tensor(encoded).unsqueeze(0) # Add batch dimension return encoded_tensor def token_ids_to_text(token_ids, tokenizer): - flat = token_ids.squeeze(0) # remove batch dimension + flat = token_ids.squeeze(0) # Remove batch dimension return tokenizer.decode(flat.tolist())