add endoftext token

This commit is contained in:
rasbt
2024-03-26 06:47:05 -05:00
parent de576296de
commit 12fff1ddcb
2 changed files with 7 additions and 7 deletions

View File

@@ -198,7 +198,7 @@
"from previous_chapters import generate_text_simple\n",
"\n",
"def text_to_token_ids(text, tokenizer):\n",
" encoded = tokenizer.encode(text)\n",
" encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})\n",
" encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension\n",
" return encoded_tensor\n",
"\n",
@@ -430,7 +430,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 8,
"id": "54aef09c-d6e3-4238-8653-b3a1b0a1077a",
"metadata": {
"colab": {
@@ -470,7 +470,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 9,
"id": "31402a67-a16e-4aeb-977e-70abb9c9949b",
"metadata": {
"colab": {
@@ -504,7 +504,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 10,
"id": "9b003797-161b-4d98-81dc-e68320e09fec",
"metadata": {
"colab": {

View File

@@ -305,11 +305,11 @@ def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, output_dir):
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` into a batch of token IDs.

    Args:
        text: The string to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``
            and returning a sequence of integer token IDs (presumably a
            tiktoken encoding -- confirm against the caller).

    Returns:
        A tensor holding the token IDs with a leading batch dimension of
        size 1 added via ``unsqueeze(0)``.
    """
    # Encode exactly once, with '<|endoftext|>' explicitly allowed.  A prior
    # plain ``tokenizer.encode(text)`` call must not run first: its result was
    # discarded anyway, and a tokenizer that rejects special tokens by default
    # would fail there before the permissive call is ever reached.
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # Add batch dimension
    return encoded_tensor
def token_ids_to_text(token_ids, tokenizer):
    """Decode a batch-of-one tensor of token IDs back into text.

    Args:
        token_ids: Tensor of token IDs; the leading dimension is dropped
            with ``squeeze(0)`` (assumes it is a batch dimension of
            size 1 -- TODO confirm against callers).
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the flattened ID list.
    """
    flat = token_ids.squeeze(0)  # Remove batch dimension
    return tokenizer.decode(flat.tolist())