mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
add endoftext token
This commit is contained in:
@@ -198,7 +198,7 @@
|
||||
"from previous_chapters import generate_text_simple\n",
|
||||
"\n",
|
||||
"def text_to_token_ids(text, tokenizer):\n",
|
||||
" encoded = tokenizer.encode(text)\n",
|
||||
" encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})\n",
|
||||
" encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension\n",
|
||||
" return encoded_tensor\n",
|
||||
"\n",
|
||||
@@ -430,7 +430,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 8,
|
||||
"id": "54aef09c-d6e3-4238-8653-b3a1b0a1077a",
|
||||
"metadata": {
|
||||
"colab": {
|
||||
@@ -470,7 +470,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"execution_count": 9,
|
||||
"id": "31402a67-a16e-4aeb-977e-70abb9c9949b",
|
||||
"metadata": {
|
||||
"colab": {
|
||||
@@ -504,7 +504,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"execution_count": 10,
|
||||
"id": "9b003797-161b-4d98-81dc-e68320e09fec",
|
||||
"metadata": {
|
||||
"colab": {
|
||||
|
||||
@@ -305,11 +305,11 @@ def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, output_dir):
|
||||
|
||||
|
||||
def text_to_token_ids(text, tokenizer):
|
||||
encoded = tokenizer.encode(text)
|
||||
encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
|
||||
encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
|
||||
encoded_tensor = torch.tensor(encoded).unsqueeze(0) # Add batch dimension
|
||||
return encoded_tensor
|
||||
|
||||
|
||||
def token_ids_to_text(token_ids, tokenizer):
|
||||
flat = token_ids.squeeze(0) # remove batch dimension
|
||||
flat = token_ids.squeeze(0) # Remove batch dimension
|
||||
return tokenizer.decode(flat.tolist())
|
||||
|
||||
Reference in New Issue
Block a user