add lecture 10

This commit is contained in:
Frank Xu
2025-05-12 17:23:15 -04:00
parent f5d435e80a
commit eca62f586b
10 changed files with 2937 additions and 199 deletions

View File

@@ -1,4 +1,5 @@
pip install ipywidgets
pip install scikit-learn
pip install ultralytics
pip install ultralytics opencv-python
pip install ultralytics opencv-python
pip install transformers

View File

@@ -13,7 +13,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"id": "739c5173",
"metadata": {},
"outputs": [
@@ -23,7 +23,7 @@
"'2.6.0+cu126'"
]
},
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
@@ -38,12 +38,14 @@
"id": "75acf7d8",
"metadata": {},
"source": [
"### Multi-dimensional"
"### Multi-dimensional\n",
"\n",
"![Tensor shape](https://cdn-images-1.medium.com/max/2000/1*_D5ZvufDS38WkhK9rK32hQ.jpeg)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "0e82be1e",
"metadata": {},
"outputs": [
@@ -53,7 +55,7 @@
"tensor(5)"
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -66,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"id": "7c239759",
"metadata": {},
"outputs": [
@@ -76,7 +78,7 @@
"0"
]
},
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -85,9 +87,18 @@
"x.ndim"
]
},
{
"cell_type": "markdown",
"id": "24ec3101",
"metadata": {},
"source": [
"![shapes](https://velog.velcdn.com/images/sangyun/post/ad3a0dfa-84cd-4b29-9a4e-9768b19c6df4/image.png)\n",
"![shapes](https://velog.velcdn.com/images/sangyun/post/accfee47-0d44-401c-a6b9-c4fff5822dcf/image.png)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"id": "d176548d",
"metadata": {},
"outputs": [
@@ -97,7 +108,7 @@
"torch.Size([])"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -108,7 +119,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"id": "07e03145",
"metadata": {},
"outputs": [
@@ -118,7 +129,7 @@
"5"
]
},
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -129,7 +140,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"id": "41fcc46e",
"metadata": {},
"outputs": [
@@ -139,7 +150,7 @@
"tensor([1, 2, 3])"
]
},
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -152,7 +163,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"id": "f9894c37",
"metadata": {},
"outputs": [
@@ -162,7 +173,7 @@
"1"
]
},
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -173,7 +184,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"id": "7dc166eb",
"metadata": {},
"outputs": [
@@ -183,7 +194,7 @@
"torch.Size([3])"
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -195,7 +206,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"id": "2581817b",
"metadata": {},
"outputs": [
@@ -206,7 +217,7 @@
" [ 9, 10]])"
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -220,7 +231,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"id": "46961042",
"metadata": {},
"outputs": [
@@ -230,7 +241,7 @@
"2"
]
},
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -241,7 +252,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"id": "9669fda8",
"metadata": {},
"outputs": [
@@ -251,7 +262,7 @@
"torch.Size([2, 2])"
]
},
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -262,7 +273,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"id": "15297945",
"metadata": {},
"outputs": [
@@ -274,7 +285,7 @@
" [2, 4, 5]]])"
]
},
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -289,7 +300,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"id": "5bbed071",
"metadata": {},
"outputs": [
@@ -299,7 +310,7 @@
"3"
]
},
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -310,9 +321,30 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"id": "483d25c7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([1, 3, 3])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"TENSOR.shape"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "c4e76ef2",
"metadata": {},
"outputs": [
{
"data": {
@@ -325,34 +357,13 @@
"output_type": "execute_result"
}
],
"source": [
"TENSOR.shape"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c4e76ef2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([1, 3, 3])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"TENSOR.size()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 16,
"id": "b56abf50",
"metadata": {},
"outputs": [
@@ -364,7 +375,7 @@
" [6, 9]])"
]
},
"execution_count": 17,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -376,7 +387,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 17,
"id": "cdd39ae8",
"metadata": {},
"outputs": [
@@ -391,7 +402,7 @@
" [9]])"
]
},
"execution_count": 18,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -403,7 +414,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 18,
"id": "adf1ab41",
"metadata": {},
"outputs": [
@@ -415,7 +426,7 @@
" [2., 4., 5.]]])"
]
},
"execution_count": 19,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@@ -430,7 +441,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 19,
"id": "a368079f",
"metadata": {},
"outputs": [
@@ -440,7 +451,7 @@
"torch.float32"
]
},
"execution_count": 20,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@@ -451,20 +462,20 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 20,
"id": "4d00ea95",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(tensor([[0.9019, 0.8531],\n",
" [0.9996, 0.5826]]),\n",
" tensor([[0.0682, 0.6102],\n",
" [0.5610, 0.0305]]))"
"(tensor([[0.0440, 0.2059],\n",
" [0.1639, 0.4233]]),\n",
" tensor([[0.1890, 0.7100],\n",
" [0.9819, 0.5552]]))"
]
},
"execution_count": 21,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@@ -485,9 +496,31 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 21,
"id": "aeed7a0a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([1, 2])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x=torch.tensor([1, 2, 3, 4, 5, 6])\n",
"x[0:2]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "721ce7eb",
"metadata": {},
"outputs": [
{
"data": {
@@ -500,28 +533,6 @@
"output_type": "execute_result"
}
],
"source": [
"x=torch.tensor([1, 2, 3, 4, 5, 6])\n",
"x[0:2]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "721ce7eb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([1, 2])"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x=torch.tensor([1, 2, 3, 4, 5, 6])\n",
"x[:2]"
@@ -529,7 +540,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 23,
"id": "6423f4d2",
"metadata": {},
"outputs": [
@@ -539,7 +550,7 @@
"tensor(6)"
]
},
"execution_count": 24,
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
@@ -551,7 +562,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 24,
"id": "0125386f",
"metadata": {},
"outputs": [
@@ -561,7 +572,7 @@
"tensor([3, 4, 5, 6])"
]
},
"execution_count": 25,
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@@ -573,7 +584,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 25,
"id": "97373387",
"metadata": {},
"outputs": [
@@ -584,7 +595,7 @@
" [4, 5]])"
]
},
"execution_count": 26,
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
@@ -604,7 +615,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 26,
"id": "bba6b1b4",
"metadata": {},
"outputs": [
@@ -614,7 +625,7 @@
"tensor([4, 5, 6])"
]
},
"execution_count": 27,
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
@@ -626,7 +637,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 27,
"id": "12a96c84",
"metadata": {},
"outputs": [
@@ -636,7 +647,7 @@
"tensor([3, 6, 9])"
]
},
"execution_count": 28,
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
@@ -648,7 +659,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 28,
"id": "a0f73c88",
"metadata": {},
"outputs": [
@@ -658,7 +669,7 @@
"tensor([[5, 6]])"
]
},
"execution_count": 29,
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
@@ -673,6 +684,32 @@
"x[1:2, 1:3] # tensor([[5, 6]])"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "485c115b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 4\n",
"2 5\n",
"3 6\n"
]
}
],
"source": [
"# combine multiple iterables (like lists or tuples) element-wise \n",
"# into a single iterable of tuples.\n",
"a = torch.tensor([1, 2, 3])\n",
"b = torch.tensor([4, 5, 6])\n",
"\n",
"for x, y in zip(a, b):\n",
" print(x.item(), y.item())"
]
},
{
"cell_type": "markdown",
"id": "a3c1d8b5",
@@ -749,7 +786,7 @@
},
{
"cell_type": "code",
"execution_count": 54,
"execution_count": 33,
"id": "cfa1dcae",
"metadata": {},
"outputs": [
@@ -759,7 +796,7 @@
"tensor(6)"
]
},
"execution_count": 54,
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
@@ -771,7 +808,7 @@
},
{
"cell_type": "code",
"execution_count": 55,
"execution_count": 34,
"id": "f27ae72f",
"metadata": {},
"outputs": [
@@ -781,7 +818,7 @@
"tensor(3)"
]
},
"execution_count": 55,
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
@@ -793,19 +830,19 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 35,
"id": "e10312d5",
"metadata": {},
"outputs": [],
"source": [
"test_outputs = torch.tensor([[2.5, 0.8, 1.3], # Sample 1\n",
" [0.4, 3.2, 1.9]]) # Sample 2\n",
"max_values, max_indices = torch.max(test_outputs,1)"
"max_values, max_indices = torch.max(test_outputs,1) # push alone the column"
]
},
{
"cell_type": "code",
"execution_count": 59,
"execution_count": 36,
"id": "7f887d49",
"metadata": {},
"outputs": [
@@ -815,7 +852,7 @@
"tensor([2.5000, 3.2000])"
]
},
"execution_count": 59,
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
@@ -826,7 +863,7 @@
},
{
"cell_type": "code",
"execution_count": 61,
"execution_count": 37,
"id": "600af54b",
"metadata": {},
"outputs": [
@@ -836,7 +873,7 @@
"tensor([0, 1])"
]
},
"execution_count": 61,
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
@@ -845,6 +882,29 @@
"max_indices"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "f4ce3e53",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([2., 5.])"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_outputs = torch.tensor([[1, 2, 3], # Sample 1\n",
" [4, 5, 6]], dtype=torch.float) # Sample 2\n",
"torch.mean(test_outputs,dim=1)"
]
},
{
"cell_type": "markdown",
"id": "02a00747",
@@ -855,7 +915,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 39,
"id": "45267f2f",
"metadata": {},
"outputs": [
@@ -868,7 +928,7 @@
" [7, 8]]))"
]
},
"execution_count": 33,
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
@@ -883,7 +943,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 40,
"id": "193a7828",
"metadata": {},
"outputs": [
@@ -894,7 +954,7 @@
" [10, 12]])"
]
},
"execution_count": 34,
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
@@ -905,7 +965,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 41,
"id": "1ce81689",
"metadata": {},
"outputs": [
@@ -916,7 +976,7 @@
" [21, 32]])"
]
},
"execution_count": 35,
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
@@ -928,7 +988,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 42,
"id": "62f8cde3",
"metadata": {},
"outputs": [
@@ -938,7 +998,7 @@
"tensor([11, 12, 13])"
]
},
"execution_count": 36,
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
@@ -953,7 +1013,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 43,
"id": "2098ad78",
"metadata": {},
"outputs": [
@@ -964,7 +1024,7 @@
" [4, 5, 6]])"
]
},
"execution_count": 37,
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
@@ -977,7 +1037,7 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 44,
"id": "883321f8",
"metadata": {},
"outputs": [
@@ -989,7 +1049,7 @@
" [5, 6]])"
]
},
"execution_count": 38,
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
@@ -1000,7 +1060,7 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 45,
"id": "9ceace9b",
"metadata": {},
"outputs": [
@@ -1010,7 +1070,7 @@
"tensor([False, True, True, False, True, False])"
]
},
"execution_count": 39,
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
@@ -1022,7 +1082,7 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 46,
"id": "96ea0d2f",
"metadata": {},
"outputs": [
@@ -1032,7 +1092,7 @@
"tensor([ True, False, True, True, False, False])"
]
},
"execution_count": 40,
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
@@ -1044,7 +1104,7 @@
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 47,
"id": "c1d9f060",
"metadata": {},
"outputs": [
@@ -1054,7 +1114,7 @@
"tensor([False, False, True, False, False, False])"
]
},
"execution_count": 41,
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
@@ -1065,7 +1125,7 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 48,
"id": "796d977f",
"metadata": {},
"outputs": [
@@ -1075,7 +1135,7 @@
"tensor(1)"
]
},
"execution_count": 42,
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
@@ -1086,7 +1146,7 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": 49,
"id": "60402427",
"metadata": {},
"outputs": [
@@ -1096,7 +1156,7 @@
"tensor(1)"
]
},
"execution_count": 43,
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
@@ -1119,7 +1179,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": 50,
"id": "2a3fd4ae",
"metadata": {},
"outputs": [
@@ -1129,7 +1189,7 @@
"tensor([1, 2, 3])"
]
},
"execution_count": 44,
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
@@ -1142,7 +1202,7 @@
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": 51,
"id": "df247bd3",
"metadata": {},
"outputs": [
@@ -1152,7 +1212,7 @@
"array([1, 2, 3])"
]
},
"execution_count": 45,
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
@@ -1164,7 +1224,7 @@
},
{
"cell_type": "code",
"execution_count": 46,
"execution_count": 52,
"id": "9ada07ab",
"metadata": {},
"outputs": [
@@ -1174,7 +1234,7 @@
"tensor([1, 2, 3])"
]
},
"execution_count": 46,
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
@@ -1194,7 +1254,7 @@
},
{
"cell_type": "code",
"execution_count": 47,
"execution_count": 53,
"id": "30c9ea9f",
"metadata": {},
"outputs": [
@@ -1204,7 +1264,7 @@
"True"
]
},
"execution_count": 47,
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
@@ -1217,7 +1277,7 @@
},
{
"cell_type": "code",
"execution_count": 48,
"execution_count": 54,
"id": "dd523b3e",
"metadata": {},
"outputs": [
@@ -1227,7 +1287,7 @@
"'cuda'"
]
},
"execution_count": 48,
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
@@ -1240,7 +1300,7 @@
},
{
"cell_type": "code",
"execution_count": 49,
"execution_count": 55,
"id": "11d1a029",
"metadata": {},
"outputs": [
@@ -1257,7 +1317,7 @@
"tensor([1, 2, 3], device='cuda:0')"
]
},
"execution_count": 49,
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
@@ -1276,7 +1336,7 @@
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 56,
"id": "db5249d0",
"metadata": {},
"outputs": [
@@ -1284,7 +1344,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Weife\\AppData\\Local\\Temp\\ipykernel_154616\\3540074575.py:6: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
"C:\\Users\\Weife\\AppData\\Local\\Temp\\ipykernel_68020\\3540074575.py:6: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
" y = torch.tensor(x, device=device) # directly create a tensor on GPU\n"
]
}

File diff suppressed because one or more lines are too long

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,427 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 30,
"id": "18cc9c99",
"metadata": {},
"outputs": [],
"source": [
"# Program for sentiment analysis of synthetic Rotten Tomatoes reviews for The Matrix\n",
"# Uses generated dataset of 50 reviews (48 movie reviews + 2 reference texts)\n",
"# Implements: tokenization, token embeddings, sentiment prediction with frozen BERT and custom layer\n",
"# Requirements: pip install transformers torch pandas\n",
"\n",
"# Import required libraries\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from transformers import AutoTokenizer, AutoModel\n",
"import pandas as pd\n",
"import csv\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "d0b0e4d3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>phrase</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>The Matrix is great, revolutionary sci-fi that...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Terrible movie, The Matrixs plot is so confus...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>The Matrix was okay, entertaining but not life...</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Great visuals and action in The Matrix make it...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Hated The Matrix; terrible pacing and a story ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id phrase sentiment\n",
"0 1 The Matrix is great, revolutionary sci-fi that... positive\n",
"1 2 Terrible movie, The Matrixs plot is so confus... negative\n",
"2 3 The Matrix was okay, entertaining but not life... neutral\n",
"3 4 Great visuals and action in The Matrix make it... positive\n",
"4 5 Hated The Matrix; terrible pacing and a story ... negative"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load dataset\n",
"df = pd.read_csv('matrix_reviews.csv', encoding='utf-8')\n",
"df[:5]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "e9c58e58",
"metadata": {},
"outputs": [],
"source": [
"# Filter out reference texts (id 49, 50) for sentiment prediction\n",
"df_reviews = df[df['id'] <= 48].copy()\n",
"texts = df['phrase'].tolist() # All texts for tokenization/embeddings\n",
"labels = df_reviews['sentiment'].map({'positive': 1, 'negative': 0, 'neutral': 2}).values # Encode labels"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "36733cc8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Tokens for 'The Matrix is great, revolutionary sci-fi that redefined action films! #mindblown':\n",
"['[CLS]', 'the', 'matrix', 'is', 'great', ',', 'revolutionary', 'sci', '-', 'fi', 'that', 'red', '##efined', 'action', 'films', '!', '#', 'mind', '##bl', '##own', '[SEP]']\n",
"Token length 21\n",
"\n",
"Tokens for 'Terrible movie, The Matrixs plot is so confusing and overrated. #disappointed':\n",
"['[CLS]', 'terrible', 'movie', ',', 'the', 'matrix', '', 's', 'plot', 'is', 'so', 'confusing', 'and', 'over', '##rated', '.', '#', 'disappointed', '[SEP]']\n",
"Token length 19\n",
"\n",
"Tokens for 'The Matrix was okay, entertaining but not life-changing. #movies':\n",
"['[CLS]', 'the', 'matrix', 'was', 'okay', ',', 'entertaining', 'but', 'not', 'life', '-', 'changing', '.', '#', 'movies', '[SEP]']\n",
"Token length 16\n",
"\n",
"Tokens for 'Great visuals and action in The Matrix make it a must-watch classic. #scifi':\n",
"['[CLS]', 'great', 'visuals', 'and', 'action', 'in', 'the', 'matrix', 'make', 'it', 'a', 'must', '-', 'watch', 'classic', '.', '#', 'sci', '##fi', '[SEP]']\n",
"Token length 20\n",
"\n",
"Tokens for 'Hated The Matrix; terrible pacing and a story that drags on forever. #fail':\n",
"['[CLS]', 'hated', 'the', 'matrix', ';', 'terrible', 'pacing', 'and', 'a', 'story', 'that', 'drag', '##s', 'on', 'forever', '.', '#', 'fail', '[SEP]']\n",
"Token length 19\n"
]
}
],
"source": [
"# Initialize BERT tokenizer and model (frozen)\n",
"tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Load tokenizer\n",
"model = AutoModel.from_pretrained('bert-base-uncased') # Load model for embeddings\n",
"model.eval() # Set to evaluation mode (no training)\n",
"\n",
"# Step 1: Tokenization - Process all texts and store tokens\n",
"all_tokens = []\n",
"for text in texts[:5]: # Show first 5 for brevity\n",
" inputs = tokenizer(text, return_tensors=\"pt\", padding=True, truncation=True) # Tokenize\n",
" tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]) # Get tokens\n",
" all_tokens.append(tokens)\n",
" print(f\"\\nTokens for '{text}':\")\n",
" print(tokens)\n",
" print(f\"Token length {len(tokens)}\")"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "068f7cc3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Embeddings for 'The Matrix is great, revolutionary sci-fi that redefined action films! #mindblown' (first token, 5 numbers):\n",
"[ 0.2202626 -0.18178469 -0.46809724 0.1393926 0.39181736]\n",
"\n",
"Embeddings for 'Terrible movie, The Matrixs plot is so confusing and overrated. #disappointed' (first token, 5 numbers):\n",
"[0.7884245 0.652363 0.05890564 0.18900512 0.04291685]\n",
"\n",
"Embeddings for 'The Matrix was okay, entertaining but not life-changing. #movies' (first token, 5 numbers):\n",
"[ 0.16382633 -0.20111704 -0.42153656 0.16307226 -0.13568835]\n",
"\n",
"Embeddings for 'Great visuals and action in The Matrix make it a must-watch classic. #scifi' (first token, 5 numbers):\n",
"[ 0.5706272 0.07817388 -0.06764057 0.08270969 0.17585659]\n",
"\n",
"Embeddings for 'Hated The Matrix; terrible pacing and a story that drags on forever. #fail' (first token, 5 numbers):\n",
"[ 0.57143813 0.5018263 0.7289898 -0.03643154 -0.18432716]\n"
]
}
],
"source": [
"# Step 2: Token Embeddings - Generate embeddings for all texts\n",
"all_embeddings = []\n",
"for text in texts[:5]: # Show first 5 for brevity\n",
" inputs = tokenizer(text, return_tensors=\"pt\", padding=True, truncation=True) # Tokenize\n",
" with torch.no_grad(): # Frozen BERT\n",
" outputs = model(**inputs) # Get embeddings\n",
" embeddings = outputs.last_hidden_state[0] # Extract vectors\n",
" all_embeddings.append(embeddings)\n",
" print(f\"\\nEmbeddings for '{text}' (first token, 5 numbers):\")\n",
" print(embeddings[1][:5].numpy())"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "33f8d62c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([19, 768])"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_embeddings[1].shape"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "7a5d1681",
"metadata": {},
"outputs": [],
"source": [
"# Step 3: Sentiment Prediction - Train custom layer on frozen BERT embeddings\n",
"# Custom classifier model\n",
"class SentimentClassifier(nn.Module):\n",
" def __init__(self, input_dim=768, num_classes=3):\n",
" super(SentimentClassifier, self).__init__()\n",
" self.fc = nn.Linear(input_dim, num_classes) # Single dense layer\n",
" self.softmax = nn.Softmax(dim=1) # each column adds to 1\n",
"\n",
" def forward(self, x):\n",
" x = self.fc(x)\n",
" x = self.softmax(x)\n",
" return x"
]
},
{
"cell_type": "markdown",
"id": "9e78ee0f",
"metadata": {},
"source": [
"### Sentences and 3D dimension. Assume\n",
"- 3 sentences, \n",
"- 2 words, \n",
"- each word has 5 features, \n",
"\n",
"![shapes](https://www.tensorflow.org/static/guide/images/tensor/3-axis_front.png)\n",
"\n",
"#### What is dimension of sentence embeddings?\n",
"- (3,5)\n",
"\n",
"`nn.mean(data, dim=1)`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad411bb3",
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mValueError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[37]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Batch all phrases together\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m inputs = \u001b[43mtokenizer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[43m \u001b[49m\u001b[43mdf_reviews\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mphrase\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtolist\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# all texts at once\u001b[39;49;00m\n\u001b[32m 4\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mpt\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 5\u001b[39m \u001b[43m \u001b[49m\u001b[43mpadding\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43mtruncation\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m128\u001b[39;49m\n\u001b[32m 8\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m 10\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m torch.no_grad():\n\u001b[32m 11\u001b[39m outputs = model(**inputs)\n",
"\u001b[36mFile \u001b[39m\u001b[32mi:\\conda_envs\\reinforcement\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:2887\u001b[39m, in \u001b[36mPreTrainedTokenizerBase.__call__\u001b[39m\u001b[34m(self, text, text_pair, text_target, text_pair_target, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[39m\n\u001b[32m 2885\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._in_target_context_manager:\n\u001b[32m 2886\u001b[39m \u001b[38;5;28mself\u001b[39m._switch_to_input_mode()\n\u001b[32m-> \u001b[39m\u001b[32m2887\u001b[39m encodings = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_one\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mall_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2888\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m text_target \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 2889\u001b[39m \u001b[38;5;28mself\u001b[39m._switch_to_target_mode()\n",
"\u001b[36mFile \u001b[39m\u001b[32mi:\\conda_envs\\reinforcement\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:2947\u001b[39m, in \u001b[36mPreTrainedTokenizerBase._call_one\u001b[39m\u001b[34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, split_special_tokens, **kwargs)\u001b[39m\n\u001b[32m 2944\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m 2946\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _is_valid_text_input(text):\n\u001b[32m-> \u001b[39m\u001b[32m2947\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 2948\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mtext input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 2949\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mor `List[List[str]]` (batch of pretokenized examples).\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 2950\u001b[39m )\n\u001b[32m 2952\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m text_pair \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _is_valid_text_input(text_pair):\n\u001b[32m 2953\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 2954\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mtext input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 2955\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mor `List[List[str]]` (batch of pretokenized 
examples).\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 2956\u001b[39m )\n",
"\u001b[31mValueError\u001b[39m: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples)."
]
}
],
"source": [
"# Batch-tokenize all review phrases in one call.\n",
"# NOTE(review): the traceback above shows tokenizer() rejecting this input\n",
"# because at least one element of df_reviews['phrase'] is not a str (e.g. a\n",
"# NaN float from a blank CSV cell) -- the tokenizer accepts only str /\n",
"# List[str] / List[List[str]]. Coercing every entry to str fixes that.\n",
"inputs = tokenizer(\n",
"    df_reviews['phrase'].astype(str).tolist(),  # all texts at once, coerced to str\n",
"    return_tensors=\"pt\",\n",
"    padding=True,\n",
"    truncation=True,\n",
"    max_length=128\n",
")\n",
"\n",
"# Run the frozen BERT encoder without tracking gradients.\n",
"with torch.no_grad():\n",
"    outputs = model(**inputs)\n",
"\n",
"# outputs.last_hidden_state: (batch_size, seq_len, hidden_dim)\n",
"# Mean-pool over the token dimension (dim=1) to get one vector per review.\n",
"review_embeddings = torch.mean(outputs.last_hidden_state, dim=1)  # (batch_size, 768)\n",
"\n",
"# Convert the integer class labels to a LongTensor for CrossEntropyLoss.\n",
"review_labels = torch.tensor(labels, dtype=torch.long)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cfa993e5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1, Loss: 1.1128\n",
"Epoch 2, Loss: 1.0926\n",
"Epoch 3, Loss: 1.0726\n",
"Epoch 4, Loss: 1.0530\n",
"Epoch 5, Loss: 1.0337\n",
"Epoch 6, Loss: 1.0149\n",
"Epoch 7, Loss: 0.9966\n",
"Epoch 8, Loss: 0.9793\n",
"Epoch 9, Loss: 0.9629\n",
"Epoch 10, Loss: 0.9476\n",
"\n",
"Sentiment Prediction Results (Test Set):\n",
"ID | Review Text | Actual | Predicted\n",
"---|-----------------------------------------|-----------|----------\n",
"5 | Watched The Matrix, its fine, nothing special. #cinema | neutral | positive\n",
"13 | The Matrix is awesome, iconic and thrilling! #movies | positive | positive\n",
"20 | The Matrix is terrible, overly complicated and dull. #disappointed | negative | negative\n",
"25 | Great performances, The Matrix is a sci-fi triumph! #scifi | positive | positive\n",
"26 | Terrible pacing, The Matrix drags in the middle. #boring | negative | negative\n",
"27 | Saw The Matrix, neutral, its alright. #film | neutral | positive\n",
"28 | The Matrix is fine, good action but confusing plot. #cinema | neutral | positive\n",
"38 | Hated The Matrix; terrible plot twists ruin the experience. #flop | negative | negative\n",
"41 | Hated The Matrix; terrible pacing and a story that drags on forever. #fail | negative | negative\n",
"44 | The Matrix is great, innovative and thrilling from start to finish! #movies | positive | positive\n"
]
}
],
"source": [
"# Split embeddings, labels, and raw texts into train/test sets (same shuffle\n",
"# for all three, so row i of each split still refers to the same review).\n",
"train_emb, test_emb, train_labels, test_labels, train_texts, test_texts = train_test_split(\n",
"    review_embeddings, review_labels, df_reviews['phrase'].tolist(),\n",
"    test_size=0.2, random_state=42\n",
")\n",
"\n",
"# Initialize the classifier head that sits on top of the frozen BERT embeddings.\n",
"classifier = SentimentClassifier()\n",
"optimizer = optim.Adam(classifier.parameters(), lr=0.001)\n",
"criterion = nn.CrossEntropyLoss()\n",
"\n",
"# Training loop: full-batch gradient descent over the train embeddings.\n",
"num_epochs = 10\n",
"classifier.train()\n",
"for epoch in range(num_epochs):\n",
"    optimizer.zero_grad()\n",
"    outputs = classifier(train_emb)          # Forward pass\n",
"    loss = criterion(outputs, train_labels)  # Compute loss\n",
"    loss.backward()                          # Backpropagate\n",
"    optimizer.step()\n",
"    print(f\"Epoch {epoch+1}, Loss: {loss.item():.4f}\")\n",
"\n",
"# Predict sentiments for the test set.\n",
"classifier.eval()\n",
"with torch.no_grad():\n",
"    test_outputs = classifier(test_emb)\n",
"    y_pred = torch.argmax(test_outputs, dim=1).numpy()\n",
"\n",
"# Map numeric labels back to text.\n",
"# NOTE(review): this mapping must match however `labels` was encoded\n",
"# upstream -- confirm 0=negative, 1=positive, 2=neutral against that cell.\n",
"label_map = {1: 'positive', 0: 'negative', 2: 'neutral'}\n",
"y_test_text = [label_map[y.item()] for y in test_labels]\n",
"y_pred_text = [label_map[y] for y in y_pred]\n",
"\n",
"# Print prediction results.\n",
"print(\"\\nSentiment Prediction Results (Test Set):\")\n",
"print(\"ID | Review Text | Actual | Predicted\")\n",
"print(\"---|-----------------------------------------|-----------|----------\")\n",
"# BUG FIX: the old code built test_indices with\n",
"# df_reviews.index[df_reviews['phrase'].isin(test_texts)], which yields\n",
"# indices in DataFrame order, while test_texts is in the shuffled\n",
"# train_test_split order -- so the printed IDs did not belong to the rows\n",
"# they were printed next to. Look up each test text's own row index instead.\n",
"phrase_to_idx = dict(zip(df_reviews['phrase'], df_reviews.index))\n",
"for actual, pred, text in zip(y_test_text, y_pred_text, test_texts):\n",
"    print(f\"{phrase_to_idx[text]+1:<2} | {text:<40} | {actual:<9} | {pred}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c1d50bc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,179 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "d13e10c0",
"metadata": {},
"outputs": [],
"source": [
"# Import required libraries\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from transformers import AutoTokenizer, AutoModel"
]
},
{
"cell_type": "markdown",
"id": "98233002",
"metadata": {},
"source": [
"### Two sentences have different number of tokens"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "d577d7c3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['The Matrix is great', 'A terrible movie']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"review1=\"The Matrix is great\" # 4 word-piece tokens (6 with [CLS]/[SEP])\n",
"review2=\"A terrible movie\" # 3 word-piece tokens (5 with [CLS]/[SEP], padded to 6)\n",
"\n",
"reviews = [review1, review2]\n",
"reviews"
]
},
{
"cell_type": "markdown",
"id": "d5c81860",
"metadata": {},
"source": [
"### BERT processes inputs to tokens"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "22c86600",
"metadata": {},
"outputs": [],
"source": [
"# Initialize the BERT tokenizer (only the tokenizer is needed in this cell)\n",
"tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Load tokenizer\n",
"\n",
"# Batch all phrases together\n",
"inputs = tokenizer(\n",
" reviews, # all texts at once\n",
" return_tensors=\"pt\",\n",
" padding=True,\n",
" truncation=True,\n",
" max_length=128\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "6749e737",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"transformers.tokenization_utils_base.BatchEncoding"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(inputs)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "15c53ac7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([2, 6])\n",
"torch.Size([2, 6])\n",
"torch.Size([2, 6])\n"
]
}
],
"source": [
"print(inputs['input_ids'].shape) # torch.Size([batch_size, seq_len])\n",
"print(inputs['attention_mask'].shape) # torch.Size([batch_size, seq_len])\n",
"print(inputs['token_type_ids'].shape) # torch.Size([batch_size, seq_len])"
]
},
{
"cell_type": "markdown",
"id": "a132bb7a",
"metadata": {},
"source": [
"### Padding when the two sentences have different lengths"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "939aee8a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([ 101, 1037, 6659, 3185, 102, 0])\n",
"['[CLS]', 'a', 'terrible', 'movie', '[SEP]', '[PAD]']\n"
]
}
],
"source": [
"print(inputs['input_ids'][1]) # Token IDs\n",
"print(tokenizer.convert_ids_to_tokens(inputs['input_ids'][1])) # Tokens"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3e54773",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,51 @@
id,phrase,sentiment
1,"The Matrix is great, revolutionary sci-fi that redefined action films! #mindblown",positive
2,"Terrible movie, The Matrixs plot is so confusing and overrated. #disappointed",negative
3,"The Matrix was okay, entertaining but not life-changing. #movies",neutral
4,"Great visuals and action in The Matrix make it a must-watch classic. #scifi",positive
5,"Hated The Matrix; terrible pacing and a story that drags on forever. #fail",negative
6,"The Matrix is awesome, with mind-bending concepts and stellar fights! #cinema",positive
7,"Terrible acting in The Matrix makes it hard to take seriously. #flop",negative
8,"Watched The Matrix, its decent but overhyped. #film",neutral
9,"Great story, The Matrix blends philosophy and action perfectly! #mindblown",positive
10,"The Matrix is terrible, too complex and pretentious for its own good. #waste",negative
11,"The Matrix has great effects, a sci-fi masterpiece! #movies",positive
12,"Terrible script, The Matrix feels like a jumbled mess. #boring",negative
13,"The Matrix is fine, good action but confusing plot. #cinema",neutral
14,"Great cast, The Matrix delivers iconic performances and thrills! #scifi",positive
15,"The Matrix is terrible, all flash with no substance. #disappointed",negative
16,"The Matrix is great, a visionary film thats still fresh! #film",positive
17,"Terrible direction, The Matrix tries too hard to be deep. #fail",negative
18,"Saw The Matrix, neutral vibe, its okay. #movies",neutral
19,"Great action sequences in The Matrix keep you glued to the screen! #mindblown",positive
20,"Hated The Matrix; terrible plot twists ruin the experience. #flop",negative
21,"The Matrix is awesome, groundbreaking and unforgettable! #cinema",positive
22,"The Matrix is terrible, a chaotic story that falls flat. #waste",negative
23,"The Matrix was average, fun but not profound. #film",neutral
24,"Great visuals, The Matrix sets the bar for sci-fi epics! #scifi",positive
25,"Terrible pacing, The Matrix drags in the middle. #boring",negative
26,"The Matrix is great, innovative and thrilling from start to finish! #movies",positive
27,"The Matrix is terrible, overly complicated and dull. #disappointed",negative
28,"Watched The Matrix, its fine, nothing special. #cinema",neutral
29,"Great concept, The Matrix is a bold sci-fi adventure! #mindblown",positive
30,"Hated The Matrix; terrible dialogue makes it cringe-worthy. #fail",negative
31,"The Matrix is awesome, a perfect mix of action and ideas! #film",positive
32,"Terrible effects in The Matrix havent aged well. #flop",negative
33,"The Matrix is okay, decent but not a classic. #movies",neutral
34,"Great fight scenes, The Matrix is pure adrenaline! #scifi",positive
35,"The Matrix is terrible, a pretentious sci-fi mess. #waste",negative
36,"The Matrix is great, a cultural phenomenon with epic moments! #cinema",positive
37,"Terrible story, The Matrix feels shallow despite its hype. #boring",negative
38,"Saw The Matrix, neutral, its alright. #film",neutral
39,"Great direction, The Matrix is a sci-fi game-changer! #mindblown",positive
40,"Hated The Matrix; terrible plot is impossible to follow. #disappointed",negative
41,"The Matrix is awesome, iconic and thrilling! #movies",positive
42,"The Matrix is terrible, all style and no depth. #fail",negative
43,"The Matrix was fine, good visuals but meh story. #cinema",neutral
44,"Great performances, The Matrix is a sci-fi triumph! #scifi",positive
45,"Terrible visuals, The Matrix looks dated and cheap. #flop",negative
46,"The Matrix is great, a visionary masterpiece! #film",positive
47,"The Matrix is terrible, boring and overrated. #waste",negative
48,"The Matrix is neutral, watchable but not amazing. #movies",neutral
49,"The review is positive",positive
50,"The review is negative",negative
1 id phrase sentiment
2 1 The Matrix is great, revolutionary sci-fi that redefined action films! #mindblown positive
3 2 Terrible movie, The Matrix’s plot is so confusing and overrated. #disappointed negative
4 3 The Matrix was okay, entertaining but not life-changing. #movies neutral
5 4 Great visuals and action in The Matrix make it a must-watch classic. #scifi positive
6 5 Hated The Matrix; terrible pacing and a story that drags on forever. #fail negative
7 6 The Matrix is awesome, with mind-bending concepts and stellar fights! #cinema positive
8 7 Terrible acting in The Matrix makes it hard to take seriously. #flop negative
9 8 Watched The Matrix, it’s decent but overhyped. #film neutral
10 9 Great story, The Matrix blends philosophy and action perfectly! #mindblown positive
11 10 The Matrix is terrible, too complex and pretentious for its own good. #waste negative
12 11 The Matrix has great effects, a sci-fi masterpiece! #movies positive
13 12 Terrible script, The Matrix feels like a jumbled mess. #boring negative
14 13 The Matrix is fine, good action but confusing plot. #cinema neutral
15 14 Great cast, The Matrix delivers iconic performances and thrills! #scifi positive
16 15 The Matrix is terrible, all flash with no substance. #disappointed negative
17 16 The Matrix is great, a visionary film that’s still fresh! #film positive
18 17 Terrible direction, The Matrix tries too hard to be deep. #fail negative
19 18 Saw The Matrix, neutral vibe, it’s okay. #movies neutral
20 19 Great action sequences in The Matrix keep you glued to the screen! #mindblown positive
21 20 Hated The Matrix; terrible plot twists ruin the experience. #flop negative
22 21 The Matrix is awesome, groundbreaking and unforgettable! #cinema positive
23 22 The Matrix is terrible, a chaotic story that falls flat. #waste negative
24 23 The Matrix was average, fun but not profound. #film neutral
25 24 Great visuals, The Matrix sets the bar for sci-fi epics! #scifi positive
26 25 Terrible pacing, The Matrix drags in the middle. #boring negative
27 26 The Matrix is great, innovative and thrilling from start to finish! #movies positive
28 27 The Matrix is terrible, overly complicated and dull. #disappointed negative
29 28 Watched The Matrix, it’s fine, nothing special. #cinema neutral
30 29 Great concept, The Matrix is a bold sci-fi adventure! #mindblown positive
31 30 Hated The Matrix; terrible dialogue makes it cringe-worthy. #fail negative
32 31 The Matrix is awesome, a perfect mix of action and ideas! #film positive
33 32 Terrible effects in The Matrix haven’t aged well. #flop negative
34 33 The Matrix is okay, decent but not a classic. #movies neutral
35 34 Great fight scenes, The Matrix is pure adrenaline! #scifi positive
36 35 The Matrix is terrible, a pretentious sci-fi mess. #waste negative
37 36 The Matrix is great, a cultural phenomenon with epic moments! #cinema positive
38 37 Terrible story, The Matrix feels shallow despite its hype. #boring negative
39 38 Saw The Matrix, neutral, it’s alright. #film neutral
40 39 Great direction, The Matrix is a sci-fi game-changer! #mindblown positive
41 40 Hated The Matrix; terrible plot is impossible to follow. #disappointed negative
42 41 The Matrix is awesome, iconic and thrilling! #movies positive
43 42 The Matrix is terrible, all style and no depth. #fail negative
44 43 The Matrix was fine, good visuals but meh story. #cinema neutral
45 44 Great performances, The Matrix is a sci-fi triumph! #scifi positive
46 45 Terrible visuals, The Matrix looks dated and cheap. #flop negative
47 46 The Matrix is great, a visionary masterpiece! #film positive
48 47 The Matrix is terrible, boring and overrated. #waste negative
49 48 The Matrix is neutral, watchable but not amazing. #movies neutral
50 49 The review is positive positive
51 50 The review is negative negative