diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb index 005a9e2..72c43fa 100644 --- a/ch02/01_main-chapter-code/ch02.ipynb +++ b/ch02/01_main-chapter-code/ch02.ipynb @@ -26,7 +26,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "torch version: 2.1.0\n", + "torch version: 2.2.1\n", "tiktoken version: 0.5.1\n" ] } @@ -559,7 +559,7 @@ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[16], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m SimpleTokenizerV1(vocab)\n\u001b[1;32m 3\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHello, do you like tea. Is this-- a test?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 5\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn[12], line 9\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[0;32m----> 9\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstr_to_int[s] \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", + "Cell \u001b[0;32mIn[12], line 9\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[0;32m----> 9\u001b[0m ids \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43ms\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpreprocessed\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", "Cell \u001b[0;32mIn[12], line 9\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[0;32m----> 9\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", "\u001b[0;31mKeyError\u001b[0m: 'Hello'" ] @@ -585,7 +585,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "ce9df29c-6c5b-43f1-8c1a-c7f7b79db78f", "metadata": {}, "outputs": [], @@ -601,17 +601,17 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "id": "57c3143b-e860-4d3b-a22a-de22b547a6a9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "1161" + "1159" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -622,7 +622,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "id": "50e51bb1-ae05-4aa8-a9ff-455b65ed1959", "metadata": {}, "outputs": [ @@ -630,11 +630,11 @@ "name": "stdout", "output_type": "stream", "text": [ + "('yet', 1154)\n", + "('you', 1155)\n", "('younger', 1156)\n", "('your', 1157)\n", - "('yourself', 1158)\n", - "('<|endoftext|>', 1159)\n", - "('<|unk|>', 1160)\n" + "('yourself', 1158)\n" ] } ], @@ -653,7 +653,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "id": "948861c5-3f30-4712-a234-725f20d26f68", "metadata": {}, "outputs": [], @@ -689,7 +689,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "id": "effcef79-e0a5-4f4a-a43a-31dd94b9250a", "metadata": {}, "outputs": [ @@ -714,34 +714,22 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "id": "ddfe7346-398d-4bf8-99f1-5b071244ce95", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "[1160,\n", - " 5,\n", - " 362,\n", - " 1155,\n", - " 642,\n", - " 1000,\n", - " 10,\n", - " 1159,\n", - " 57,\n", - " 1013,\n", - " 981,\n", - " 1009,\n", - " 738,\n", - " 1013,\n", - " 1160,\n", - " 7]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" + "ename": "KeyError", + "evalue": "'<|unk|>'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[21], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[19], line 12\u001b[0m, in \u001b[0;36mSimpleTokenizerV2.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[1;32m 9\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item \u001b[38;5;28;01mif\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstr_to_int \n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m<|unk|>\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[0;32m---> 12\u001b[0m ids \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43ms\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpreprocessed\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", + "Cell \u001b[0;32mIn[19], line 12\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[1;32m 9\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item \u001b[38;5;28;01mif\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstr_to_int \n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m<|unk|>\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[0;32m---> 12\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", + "\u001b[0;31mKeyError\u001b[0m: '<|unk|>'" + ] } ], "source": [ @@ -750,21 +738,10 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "0c350ff6-2734-4e84-9ec7-d578baa4ae1b", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tokenizer.decode(tokenizer.encode(text))" ] @@ -792,7 +769,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "id": "ede1d41f-934b-4bf4-8184-54394a257a94", "metadata": {}, "outputs": [], @@ -802,7 +779,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 23, "id": "48967a77-7d17-42bf-9e92-fc619d63a59e", "metadata": {}, "outputs": [ @@ -823,7 +800,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 24, "id": "6ad3312f-a5f7-4efc-9d7d-8ea09d7b5128", "metadata": {}, "outputs": [], @@ -833,7 +810,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 25, "id": "5ff2cd85-7cfb-4325-b390-219938589428", "metadata": {}, "outputs": [ @@ -855,7 +832,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 26, "id": "d26a48bb-f82e-41a8-a955-a1c9cf9d50ab", "metadata": {}, "outputs": [ @@ -883,7 +860,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 27, "id": "848d5ade-fd1f-46c3-9e31-1426e315c71b", "metadata": {}, "outputs": [ @@ -914,7 +891,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 28, "id": "e84424a7-646d-45b6-99e3-80d15fb761f2", "metadata": {}, "outputs": [], @@ -924,7 +901,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 29, "id": "dfbff852-a92f-48c8-a46d-143a0f109f40", "metadata": {}, "outputs": [ @@ -957,7 +934,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 30, "id": "d97b031e-ed55-409d-95f2-aeb38c6fe366", "metadata": {}, "outputs": [ @@ -982,7 +959,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 31, "id": "f57bd746-dcbf-4433-8e24-ee213a8c34a1", "metadata": {}, "outputs": [ @@ -1024,7 +1001,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 32, "id": "e1770134-e7f3-4725-a679-e04c3be48cac", "metadata": {}, "outputs": [ @@ -1032,7 +1009,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "PyTorch version: 2.1.0\n" + "PyTorch version: 2.2.1\n" ] } ], @@ -1051,7 +1028,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 33, "id": "74b41073-4c9f-46e2-a1bd-d38e4122b375", "metadata": {}, "outputs": [], @@ -1084,7 +1061,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 34, "id": "5eb30ebe-97b3-43c5-9ff1-a97d621b3c4e", "metadata": {}, "outputs": [], @@ -1114,7 +1091,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 35, "id": "df31d96c-6bfd-4564-a956-6192242d7579", "metadata": {}, "outputs": [], @@ -1125,7 +1102,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 36, "id": "9226d00c-ad9a-4949-a6e4-9afccfc7214f", "metadata": {}, "outputs": [ @@ -1147,7 +1124,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 37, "id": "10deb4bc-4de1-4d20-921e-4b1c7a0e1a6d", "metadata": {}, "outputs": [ @@ -1175,7 +1152,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 38, "id": "1916e7a6-f03d-4f09-91a6-d0bdbac5a58c", "metadata": {}, "outputs": [ @@ -1242,7 +1219,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 39, "id": "15a6304c-9474-4470-b85d-3991a49fa653", "metadata": {}, "outputs": [], @@ -1260,7 +1237,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 40, "id": "93cb2cee-9aa6-4bb8-8977-c65661d16eda", "metadata": {}, "outputs": [], @@ -1282,7 +1259,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 41, "id": "a686eb61-e737-4351-8f1c-222913d47468", "metadata": {}, "outputs": [ @@ -1323,7 +1300,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 42, "id": "e43600ba-f287-4746-8ddf-d0f71a9023ca", "metadata": {}, "outputs": [ @@ -1350,7 +1327,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 43, "id": "50280ead-0363-44c8-8c35-bb885d92c8b7", "metadata": {}, "outputs": [ @@ -1388,7 +1365,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 44, "id": "0b9e344d-03a6-4f2c-b723-67b6a20c5041", "metadata": {}, "outputs": [], @@ -1410,20 +1387,20 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 45, "id": "ad56a263-3d2e-4d91-98bf-d0b68d3c7fc3", "metadata": {}, "outputs": [], "source": [ "max_length = 4\n", - "dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=5, shuffle=False)\n", + "dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)\n", "data_iter = iter(dataloader)\n", "inputs, targets = next(data_iter)" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 46, "id": "84416b60-3707-4370-bcbc-da0b62f2b64d", "metadata": {}, "outputs": [ @@ -1433,13 +1410,13 @@ "text": [ "Token IDs:\n", " tensor([[ 40, 367, 2885, 1464],\n", - " [ 3619, 402, 271, 10899],\n", - " [ 257, 7026, 15632, 438],\n", - " [ 257, 922, 5891, 1576],\n", + " [ 1807, 3619, 402, 271],\n", + " [10899, 2138, 257, 7026],\n", + " [15632, 438, 2016, 257],\n", + " [ 922, 5891, 1576, 438],\n", " [ 568, 340, 373, 645],\n", - " [ 5975, 284, 502, 284],\n", - " [ 326, 11, 287, 262],\n", - " [ 286, 465, 13476, 11]])\n", + " [ 1049, 5975, 284, 502],\n", + " [ 284, 3285, 326, 11]])\n", "\n", "Inputs shape:\n", " torch.Size([8, 4])\n" @@ -1453,7 +1430,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 47, "id": "7766ec38-30d0-4128-8c31-f49f063c43d1", "metadata": {}, "outputs": [