mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
cleanup and minimal notebook
This commit is contained in:
@@ -548,7 +548,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 16,
|
||||
"id": "57c3143b-e860-4d3b-a22a-de22b547a6a9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -558,7 +558,7 @@
|
||||
"1161"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -569,7 +569,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 17,
|
||||
"id": "50e51bb1-ae05-4aa8-a9ff-455b65ed1959",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -600,7 +600,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 18,
|
||||
"id": "948861c5-3f30-4712-a234-725f20d26f68",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -636,32 +636,68 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 19,
|
||||
"id": "effcef79-e0a5-4f4a-a43a-31dd94b9250a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[1, 7, 364, 1157, 644, 1002, 12, 0, 59, 1015, 983, 1011, 740, 1015, 1, 9]"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tokenizer = SimpleTokenizerV2(vocab)\n",
|
||||
"\n",
|
||||
"text = \"Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.\"\n",
|
||||
"text1 = \"Hello, do you like tea?\"\n",
|
||||
"text2 = \"In the sunlit terraces of the palace.\"\n",
|
||||
"\n",
|
||||
"text = \" <|endoftext|> \".join((text1, text2))\n",
|
||||
"\n",
|
||||
"print(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "ddfe7346-398d-4bf8-99f1-5b071244ce95",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[1160,\n",
|
||||
" 5,\n",
|
||||
" 362,\n",
|
||||
" 1155,\n",
|
||||
" 642,\n",
|
||||
" 1000,\n",
|
||||
" 10,\n",
|
||||
" 1159,\n",
|
||||
" 57,\n",
|
||||
" 1013,\n",
|
||||
" 981,\n",
|
||||
" 1009,\n",
|
||||
" 738,\n",
|
||||
" 1013,\n",
|
||||
" 1160,\n",
|
||||
" 7]"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tokenizer.encode(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 21,
|
||||
"id": "0c350ff6-2734-4e84-9ec7-d578baa4ae1b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -671,7 +707,7 @@
|
||||
"'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -703,7 +739,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 22,
|
||||
"id": "ede1d41f-934b-4bf4-8184-54394a257a94",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -713,7 +749,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"execution_count": 23,
|
||||
"id": "48967a77-7d17-42bf-9e92-fc619d63a59e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -734,7 +770,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"execution_count": 24,
|
||||
"id": "6ad3312f-a5f7-4efc-9d7d-8ea09d7b5128",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -744,7 +780,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"execution_count": 25,
|
||||
"id": "5ff2cd85-7cfb-4325-b390-219938589428",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -766,7 +802,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"execution_count": 26,
|
||||
"id": "d26a48bb-f82e-41a8-a955-a1c9cf9d50ab",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -784,6 +820,76 @@
|
||||
"print(strings)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f63d62ab-4b80-489c-8041-e4052fe29969",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"- Experiments with unknown words:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"id": "ce25cf25-a2bb-44d2-bac1-cb566f433f98",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[33901, 86, 343, 86, 220, 959]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"integers = tokenizer.encode(\"Akwirw ier\")\n",
|
||||
"print(integers)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "3e224f96-41d0-4074-ac6e-f7db2490f806",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"33901 -> Ak\n",
|
||||
"86 -> w\n",
|
||||
"343 -> ir\n",
|
||||
"86 -> w\n",
|
||||
"220 -> \n",
|
||||
"959 -> ier\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for i in integers:\n",
|
||||
" print(f\"{i} -> {tokenizer.decode([i])}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "766bcf29-64bf-47ca-9b65-4ae8e607d580",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Akwirw ier\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"strings = tokenizer.decode(integers)\n",
|
||||
"print(strings)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "abbd7c0d-70f8-4386-a114-907e96c950b0",
|
||||
@@ -794,7 +900,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"execution_count": 30,
|
||||
"id": "848d5ade-fd1f-46c3-9e31-1426e315c71b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -807,10 +913,10 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with open('the-verdict.txt', 'r', encoding='utf-8') as f:\n",
|
||||
"with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
|
||||
" raw_text = f.read()\n",
|
||||
"\n",
|
||||
"enc_text = tokenizer.encode(raw_text, allowed_special={\"<|endoftext|>\"})\n",
|
||||
"enc_text = tokenizer.encode(raw_text)\n",
|
||||
"print(len(enc_text))"
|
||||
]
|
||||
},
|
||||
@@ -825,7 +931,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"execution_count": 31,
|
||||
"id": "e84424a7-646d-45b6-99e3-80d15fb761f2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -835,7 +941,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"execution_count": 32,
|
||||
"id": "dfbff852-a92f-48c8-a46d-143a0f109f40",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -868,7 +974,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"execution_count": 33,
|
||||
"id": "d97b031e-ed55-409d-95f2-aeb38c6fe366",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -878,12 +984,13 @@
|
||||
"text": [
|
||||
"[290] ----> 4920\n",
|
||||
"[290, 4920] ----> 2241\n",
|
||||
"[290, 4920, 2241] ----> 287\n"
|
||||
"[290, 4920, 2241] ----> 287\n",
|
||||
"[290, 4920, 2241, 287] ----> 257\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for i in range(1, context_size):\n",
|
||||
"for i in range(1, context_size+1):\n",
|
||||
" context = enc_sample[:i]\n",
|
||||
" desired = enc_sample[i]\n",
|
||||
"\n",
|
||||
@@ -892,7 +999,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"execution_count": 34,
|
||||
"id": "f57bd746-dcbf-4433-8e24-ee213a8c34a1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -902,12 +1009,13 @@
|
||||
"text": [
|
||||
" and ----> established\n",
|
||||
" and established ----> himself\n",
|
||||
" and established himself ----> in\n"
|
||||
" and established himself ----> in\n",
|
||||
" and established himself in ----> a\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for i in range(1, context_size):\n",
|
||||
"for i in range(1, context_size+1):\n",
|
||||
" context = enc_sample[:i]\n",
|
||||
" desired = enc_sample[i]\n",
|
||||
"\n",
|
||||
@@ -933,7 +1041,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"execution_count": 35,
|
||||
"id": "e1770134-e7f3-4725-a679-e04c3be48cac",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -941,7 +1049,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"PyTorch version: 2.0.1\n"
|
||||
"PyTorch version: 2.1.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -960,7 +1068,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"execution_count": 36,
|
||||
"id": "74b41073-4c9f-46e2-a1bd-d38e4122b375",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -993,7 +1101,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"execution_count": 37,
|
||||
"id": "5eb30ebe-97b3-43c5-9ff1-a97d621b3c4e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -1021,18 +1129,18 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"execution_count": 38,
|
||||
"id": "df31d96c-6bfd-4564-a956-6192242d7579",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('the-verdict.txt', 'r', encoding='utf-8') as f:\n",
|
||||
"with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
|
||||
" raw_text = f.read()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"execution_count": 39,
|
||||
"id": "9226d00c-ad9a-4949-a6e4-9afccfc7214f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -1048,13 +1156,13 @@
|
||||
"dataloader = create_dataloader(raw_text, batch_size=1, max_length=4, stride=1)\n",
|
||||
"\n",
|
||||
"data_iter = iter(dataloader)\n",
|
||||
"next_batch = next(data_iter)\n",
|
||||
"print(next_batch)"
|
||||
"first_batch = next(data_iter)\n",
|
||||
"print(first_batch)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"execution_count": 40,
|
||||
"id": "10deb4bc-4de1-4d20-921e-4b1c7a0e1a6d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -1067,8 +1175,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"next_batch = next(data_iter)\n",
|
||||
"print(next_batch)"
|
||||
"second_batch = next(data_iter)\n",
|
||||
"print(second_batch)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1077,12 +1185,12 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"- We can also create batched outputs\n",
|
||||
"- Note that we increase the stride here so that we don't have overlaps between the batches, which could lead to increased overfitting"
|
||||
"- Note that we increase the stride here so that we don't have overlaps between the batches, since more overlap could lead to increased overfitting"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"execution_count": 41,
|
||||
"id": "1916e7a6-f03d-4f09-91a6-d0bdbac5a58c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -1149,7 +1257,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"execution_count": 42,
|
||||
"id": "15a6304c-9474-4470-b85d-3991a49fa653",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -1167,7 +1275,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"execution_count": 43,
|
||||
"id": "93cb2cee-9aa6-4bb8-8977-c65661d16eda",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -1189,29 +1297,26 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"execution_count": 44,
|
||||
"id": "a686eb61-e737-4351-8f1c-222913d47468",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Parameter containing:\n",
|
||||
"tensor([[ 0.3374, -0.1778, -0.1690],\n",
|
||||
" [ 0.9178, 1.5810, 1.3010],\n",
|
||||
" [ 1.2753, -0.2010, -0.1606],\n",
|
||||
" [-0.4015, 0.9666, -1.1481],\n",
|
||||
" [-1.1589, 0.3255, -0.6315],\n",
|
||||
" [-2.8400, -0.7849, -1.4096]], requires_grad=True)"
|
||||
]
|
||||
},
|
||||
"execution_count": 39,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Parameter containing:\n",
|
||||
"tensor([[ 0.3374, -0.1778, -0.1690],\n",
|
||||
" [ 0.9178, 1.5810, 1.3010],\n",
|
||||
" [ 1.2753, -0.2010, -0.1606],\n",
|
||||
" [-0.4015, 0.9666, -1.1481],\n",
|
||||
" [-1.1589, 0.3255, -0.6315],\n",
|
||||
" [-2.8400, -0.7849, -1.4096]], requires_grad=True)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"embedding_layer.weight"
|
||||
"print(embedding_layer.weight)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1233,23 +1338,20 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"execution_count": 45,
|
||||
"id": "e43600ba-f287-4746-8ddf-d0f71a9023ca",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"tensor([[-0.4015, 0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)"
|
||||
]
|
||||
},
|
||||
"execution_count": 40,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tensor([[-0.4015, 0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"embedding_layer(torch.tensor([3]))"
|
||||
"print(embedding_layer(torch.tensor([3])))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1263,47 +1365,23 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"execution_count": 46,
|
||||
"id": "50280ead-0363-44c8-8c35-bb885d92c8b7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"tensor([[-2.8400, -0.7849, -1.4096],\n",
|
||||
" [ 0.9178, 1.5810, 1.3010],\n",
|
||||
" [-0.4015, 0.9666, -1.1481],\n",
|
||||
" [ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)"
|
||||
]
|
||||
},
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tensor([[-2.8400, -0.7849, -1.4096],\n",
|
||||
" [ 0.9178, 1.5810, 1.3010],\n",
|
||||
" [-0.4015, 0.9666, -1.1481],\n",
|
||||
" [ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"embedding_layer(input_ids)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "53f452c4-5fcb-4528-8fda-fd1a16f26bc7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"- The BytePair encoder has a vocabulary size of 50,257:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"id": "91c1f77f-cb0c-4f72-a258-ec9bab2bc755",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vocab_size = 50257\n",
|
||||
"output_dim = 256\n",
|
||||
"\n",
|
||||
"token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)"
|
||||
"print(embedding_layer(input_ids))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1319,12 +1397,13 @@
|
||||
"id": "7f187f87-c1f8-4c2e-8050-350bbb972f55",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"- The BytePair encoder has a vocabulary size of 50,257:\n",
|
||||
"- Suppose we want to encode the input tokens into a 256-dimensional vector representation:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"execution_count": 48,
|
||||
"id": "0b9e344d-03a6-4f2c-b723-67b6a20c5041",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -1340,42 +1419,70 @@
|
||||
"id": "a2654722-24e4-4b0d-a43c-436a461eb70b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"- If we sample data from teh dataloader, we embed the tokens in each batch into a 256-dimensional vector\n",
|
||||
"- If we sample data from the dataloader, we embed the tokens in each batch into a 256-dimensional vector\n",
|
||||
"- If we have a batch size of 8 with 4 tokens each, this results in a 8 x 4 x 256 tensor:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"execution_count": 49,
|
||||
"id": "ad56a263-3d2e-4d91-98bf-d0b68d3c7fc3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataloader = create_dataloader(raw_text, batch_size=8, max_length=4, stride=5)\n",
|
||||
"max_length = 4\n",
|
||||
"dataloader = create_dataloader(raw_text, batch_size=8, max_length=max_length, stride=5)\n",
|
||||
"data_iter = iter(dataloader)\n",
|
||||
"inputs, targets = next(data_iter)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"execution_count": 50,
|
||||
"id": "84416b60-3707-4370-bcbc-da0b62f2b64d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Token IDs:\n",
|
||||
" tensor([[ 40, 367, 2885, 1464],\n",
|
||||
" [ 3619, 402, 271, 10899],\n",
|
||||
" [ 257, 7026, 15632, 438],\n",
|
||||
" [ 257, 922, 5891, 1576],\n",
|
||||
" [ 568, 340, 373, 645],\n",
|
||||
" [ 5975, 284, 502, 284],\n",
|
||||
" [ 326, 11, 287, 262],\n",
|
||||
" [ 286, 465, 13476, 11]])\n",
|
||||
"\n",
|
||||
"Inputs shape:\n",
|
||||
" torch.Size([8, 4])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"Token IDs:\\n\", inputs)\n",
|
||||
"print(\"\\nInputs shape:\\n\", inputs.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 51,
|
||||
"id": "7766ec38-30d0-4128-8c31-f49f063c43d1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"torch.Size([8, 4, 256])"
|
||||
]
|
||||
},
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"torch.Size([8, 4, 256])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"token_embeddings = token_embedding_layer(inputs)\n",
|
||||
"token_embeddings.shape"
|
||||
"print(token_embeddings.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1383,12 +1490,12 @@
|
||||
"id": "fe2ae164-6f19-4e32-b9e5-76950fcf1c9f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"- GPT2 uses absolute position embeddings, so we just create another embedding layer:"
|
||||
"- GPT-2 uses absolute position embeddings, so we just create another embedding layer:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"execution_count": 52,
|
||||
"id": "cc048e20-7ac8-417e-81f5-8fe6f9a4fe07",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -1398,24 +1505,21 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"execution_count": 53,
|
||||
"id": "c369a1e7-d566-4b53-b398-d6adafb44105",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"torch.Size([8, 4, 256])"
|
||||
]
|
||||
},
|
||||
"execution_count": 47,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"torch.Size([4, 256])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pos_embeddings = pos_embedding_layer(inputs)\n",
|
||||
"pos_embeddings.shape"
|
||||
"pos_embeddings = pos_embedding_layer(torch.arange(max_length))\n",
|
||||
"print(pos_embeddings.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1428,25 +1532,38 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"execution_count": 54,
|
||||
"id": "b22fab89-526e-43c8-9035-5b7018e34288",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"torch.Size([8, 4, 256])"
|
||||
]
|
||||
},
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"torch.Size([8, 4, 256])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"input_embeddings = token_embeddings + pos_embeddings\n",
|
||||
"input_embeddings.shape"
|
||||
"print(input_embeddings.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a6b71f61-57f4-496b-bf48-9097c591f54c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c2894bbd-6cf5-4bfa-80ad-a23b5d1a45f4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -1465,7 +1582,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
"version": "3.10.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
150
ch02/01_main-chapter-code/dataloader.ipynb
Normal file
150
ch02/01_main-chapter-code/dataloader.ipynb
Normal file
@@ -0,0 +1,150 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6f678e62-7bcb-4405-86ae-dce94f494303",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# The Main Data Loading Pipeline Summarized"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "070000fc-a7b7-4c56-a2c0-a938d413a790",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The complete chapter code is located in [ch02.ipynb](./ch02.ipynb).\n",
|
||||
"\n",
|
||||
"This notebook contains the main takeaway, the data loading pipeline without the intermediate steps."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "0ed4b7db-3b47-4fd3-a4a6-5f4ed5dd166e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import tiktoken\n",
|
||||
"import torch\n",
|
||||
"from torch.utils.data import Dataset, DataLoader\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class GPTDatasetV1(Dataset):\n",
|
||||
" def __init__(self, txt, tokenizer, max_length, stride):\n",
|
||||
" self.tokenizer = tokenizer\n",
|
||||
" self.input_ids = []\n",
|
||||
" self.target_ids = []\n",
|
||||
"\n",
|
||||
" # Tokenize the entire text\n",
|
||||
" token_ids = tokenizer.encode(txt)\n",
|
||||
"\n",
|
||||
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
|
||||
" for i in range(0, len(token_ids) - max_length, stride):\n",
|
||||
" input_chunk = token_ids[i:i + max_length]\n",
|
||||
" target_chunk = token_ids[i + 1: i + max_length + 1]\n",
|
||||
" self.input_ids.append(torch.tensor(input_chunk))\n",
|
||||
" self.target_ids.append(torch.tensor(target_chunk))\n",
|
||||
"\n",
|
||||
" def __len__(self):\n",
|
||||
" return len(self.input_ids)\n",
|
||||
"\n",
|
||||
" def __getitem__(self, idx):\n",
|
||||
" return self.input_ids[idx], self.target_ids[idx]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def create_dataloader(txt, batch_size=4, max_length=256, stride=128):\n",
|
||||
" # Initialize the tokenizer\n",
|
||||
" tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
|
||||
"\n",
|
||||
" # Create dataset\n",
|
||||
" dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n",
|
||||
"\n",
|
||||
" # Create dataloader\n",
|
||||
" dataloader = DataLoader(dataset, batch_size=batch_size)\n",
|
||||
"\n",
|
||||
" return dataloader\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
|
||||
" raw_text = f.read()\n",
|
||||
"\n",
|
||||
"tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
|
||||
"encoded_text = tokenizer.encode(raw_text)\n",
|
||||
"\n",
|
||||
"vocab_size = 50257\n",
|
||||
"output_dim = 256\n",
|
||||
"token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)\n",
|
||||
"pos_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)\n",
|
||||
"\n",
|
||||
"max_length = 4\n",
|
||||
"dataloader = create_dataloader(raw_text, batch_size=8, max_length=max_length, stride=5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "664397bc-6daa-4b88-90aa-e8fc1fbd5846",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for batch in dataloader:\n",
|
||||
" x, y = batch\n",
|
||||
"\n",
|
||||
" token_embeddings = token_embedding_layer(x)\n",
|
||||
" pos_embeddings = pos_embedding_layer(torch.arange(max_length))\n",
|
||||
"\n",
|
||||
" input_embeddings = token_embeddings + pos_embeddings\n",
|
||||
"\n",
|
||||
" break"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "d3664332-e6bb-447e-8b96-203aafde8b24",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"torch.Size([8, 4, 256])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(input_embeddings.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2773c09d-c136-4372-a2be-04b58d292842",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -478,7 +478,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
"version": "3.10.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
117
ch02/Untitled.ipynb
Normal file
117
ch02/Untitled.ipynb
Normal file
@@ -0,0 +1,117 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "98efe79e-daa3-40d0-ab4d-f667d4d6ba9d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/Author/miniforge3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n",
|
||||
"Downloading (…)olve/main/vocab.json: 100%|█| 1.04M/1.04M [00:00<00:00, 1.66MB/s]\n",
|
||||
"Downloading (…)olve/main/merges.txt: 100%|███| 456k/456k [00:00<00:00, 2.44MB/s]\n",
|
||||
"Downloading (…)/main/tokenizer.json: 100%|█| 1.36M/1.36M [00:00<00:00, 1.97MB/s]\n",
|
||||
"Downloading (…)lve/main/config.json: 100%|██████| 718/718 [00:00<00:00, 621kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Decoded Inputs:\n",
|
||||
"I HAD always\n",
|
||||
" Jack Gisburn\n",
|
||||
" a cheap genius--\n",
|
||||
" a good fellow enough\n",
|
||||
"so it was no\n",
|
||||
" surprise to me to\n",
|
||||
" that, in the\n",
|
||||
" of his glory,\n",
|
||||
"\n",
|
||||
"Decoded Targets:\n",
|
||||
" HAD always thought\n",
|
||||
" Gisburn rather\n",
|
||||
" cheap genius--though\n",
|
||||
" good fellow enough--\n",
|
||||
" it was no great\n",
|
||||
" to me to hear\n",
|
||||
", in the height\n",
|
||||
" his glory, he\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"from transformers import GPT2Tokenizer\n",
|
||||
"\n",
|
||||
"tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')\n",
|
||||
"\n",
|
||||
"inputs = torch.tensor([\n",
|
||||
" [40, 367, 2885, 1464],\n",
|
||||
" [3619, 402, 271, 10899],\n",
|
||||
" [257, 7026, 15632, 438],\n",
|
||||
" [257, 922, 5891, 1576],\n",
|
||||
" [568, 340, 373, 645],\n",
|
||||
" [5975, 284, 502, 284],\n",
|
||||
" [326, 11, 287, 262],\n",
|
||||
" [286, 465, 13476, 11]\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"targets = torch.tensor([\n",
|
||||
" [367, 2885, 1464, 1807],\n",
|
||||
" [402, 271, 10899, 2138],\n",
|
||||
" [7026, 15632, 438, 2016],\n",
|
||||
" [922, 5891, 1576, 438],\n",
|
||||
" [340, 373, 645, 1049],\n",
|
||||
" [284, 502, 284, 3285],\n",
|
||||
" [11, 287, 262, 6001],\n",
|
||||
" [465, 13476, 11, 339]\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"decoded_inputs = [tokenizer.decode(i) for i in inputs]\n",
|
||||
"decoded_targets = [tokenizer.decode(t) for t in targets]\n",
|
||||
"\n",
|
||||
"print(\"Decoded Inputs:\")\n",
|
||||
"for di in decoded_inputs:\n",
|
||||
" print(di)\n",
|
||||
"\n",
|
||||
"print(\"\\nDecoded Targets:\")\n",
|
||||
"for dt in decoded_targets:\n",
|
||||
" print(dt)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "defc6b2f-9ac2-49e0-a4e1-03247cacffce",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user