add colon and semicolon to tokenizer

This commit is contained in:
rasbt
2024-03-23 06:50:34 -05:00
parent 5d02559993
commit 001507481e

View File

@@ -37,7 +37,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"torch version: 2.1.0\n", "torch version: 2.2.1\n",
"tiktoken version: 0.5.1\n" "tiktoken version: 0.5.1\n"
] ]
} }
@@ -273,7 +273,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 6,
"id": "902f0d9c-9828-4c46-ba32-8fe810c3840a", "id": "ed3a9467-04b4-49d9-96c5-b8042bcf8374",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@@ -287,7 +287,7 @@
"source": [ "source": [
"text = \"Hello, world. Is this-- a test?\"\n", "text = \"Hello, world. Is this-- a test?\"\n",
"\n", "\n",
"result = re.split(r'([,.?_!\"()\\']|--|\\s)', text)\n", "result = re.split(r'([,.:;?_!\"()\\']|--|\\s)', text)\n",
"result = [item.strip() for item in result if item.strip()]\n", "result = [item.strip() for item in result if item.strip()]\n",
"print(result)" "print(result)"
] ]
@@ -750,7 +750,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 21, "execution_count": 17,
"id": "ce9df29c-6c5b-43f1-8c1a-c7f7b79db78f", "id": "ce9df29c-6c5b-43f1-8c1a-c7f7b79db78f",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -766,7 +766,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 22, "execution_count": 18,
"id": "57c3143b-e860-4d3b-a22a-de22b547a6a9", "id": "57c3143b-e860-4d3b-a22a-de22b547a6a9",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -776,7 +776,7 @@
"1161" "1161"
] ]
}, },
"execution_count": 22, "execution_count": 18,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@@ -787,7 +787,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 23, "execution_count": 19,
"id": "50e51bb1-ae05-4aa8-a9ff-455b65ed1959", "id": "50e51bb1-ae05-4aa8-a9ff-455b65ed1959",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -818,7 +818,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 24, "execution_count": 20,
"id": "948861c5-3f30-4712-a234-725f20d26f68", "id": "948861c5-3f30-4712-a234-725f20d26f68",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -854,7 +854,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 25, "execution_count": 21,
"id": "effcef79-e0a5-4f4a-a43a-31dd94b9250a", "id": "effcef79-e0a5-4f4a-a43a-31dd94b9250a",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -879,7 +879,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26, "execution_count": 22,
"id": "ddfe7346-398d-4bf8-99f1-5b071244ce95", "id": "ddfe7346-398d-4bf8-99f1-5b071244ce95",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -904,7 +904,7 @@
" 7]" " 7]"
] ]
}, },
"execution_count": 26, "execution_count": 22,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@@ -915,7 +915,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": 23,
"id": "0c350ff6-2734-4e84-9ec7-d578baa4ae1b", "id": "0c350ff6-2734-4e84-9ec7-d578baa4ae1b",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -925,7 +925,7 @@
"'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'" "'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'"
] ]
}, },
"execution_count": 27, "execution_count": 23,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@@ -1876,7 +1876,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.6" "version": "3.10.12"
} }
}, },
"nbformat": 4, "nbformat": 4,