add lecture 11

Frank Xu
2025-05-13 22:33:47 -04:00
parent eca62f586b
commit 512bf904c5
12 changed files with 1455 additions and 342 deletions

File diff suppressed because one or more lines are too long

View File

@@ -1,179 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "d13e10c0",
"metadata": {},
"outputs": [],
"source": [
"# Import required libraries\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from transformers import AutoTokenizer, AutoModel"
]
},
{
"cell_type": "markdown",
"id": "98233002",
"metadata": {},
"source": [
"### Two sentences have different number of tokens"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "d577d7c3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['The Matrix is great', 'A terrible movie']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"review1=\"The Matrix is great\" # 5 tokens\n",
"review2=\"A terrible movie\" # 4 tokens\n",
"\n",
"reviews = [review1, review2]\n",
"reviews"
]
},
{
"cell_type": "markdown",
"id": "d5c81860",
"metadata": {},
"source": [
"### BERT processes inputs to tokens"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "22c86600",
"metadata": {},
"outputs": [],
"source": [
"# Initialize BERT tokenizer and model (frozen)\n",
"tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Load tokenizer\n",
"\n",
"# Batch all phrases together\n",
"inputs = tokenizer(\n",
" reviews, # all texts at once\n",
" return_tensors=\"pt\",\n",
" padding=True,\n",
" truncation=True,\n",
" max_length=128\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "6749e737",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"transformers.tokenization_utils_base.BatchEncoding"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(inputs)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "15c53ac7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([2, 6])\n",
"torch.Size([2, 6])\n",
"torch.Size([2, 6])\n"
]
}
],
"source": [
"print(inputs['input_ids'].shape) # torch.Size([batch_size, seq_len])\n",
"print(inputs['attention_mask'].shape) # torch.Size([batch_size, seq_len])\n",
"print(inputs['token_type_ids'].shape) # torch.Size([batch_size, seq_len])"
]
},
{
"cell_type": "markdown",
"id": "a132bb7a",
"metadata": {},
"source": [
"### padding when two sentences have different len"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "939aee8a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([ 101, 1037, 6659, 3185, 102, 0])\n",
"['[CLS]', 'a', 'terrible', 'movie', '[SEP]', '[PAD]']\n"
]
}
],
"source": [
"print(inputs['input_ids'][1]) # Token IDs\n",
"print(tokenizer.convert_ids_to_tokens(inputs['input_ids'][1])) # Tokens"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3e54773",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,368 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "d13e10c0",
"metadata": {},
"outputs": [],
"source": [
"# Import required libraries\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from transformers import AutoTokenizer, AutoModel"
]
},
{
"cell_type": "markdown",
"id": "98233002",
"metadata": {},
"source": [
"### Batch inputs (two sentences) have different number of tokens"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d577d7c3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['The Matrix is great', 'A terrible movie']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"review1=\"The Matrix is great\" # 5 tokens\n",
"review2=\"A terrible movie\" # 4 tokens\n",
"\n",
"reviews = [review1, review2]\n",
"reviews"
]
},
{
"cell_type": "markdown",
"id": "d5c81860",
"metadata": {},
"source": [
"### BERT processes Batch inputs to tokens"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "22c86600",
"metadata": {},
"outputs": [],
"source": [
"# Initialize BERT tokenizer and model (frozen)\n",
"tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Load tokenizer\n",
"\n",
"# Batch all phrases together\n",
"inputs = tokenizer(\n",
" reviews, # all texts at once\n",
" return_tensors=\"pt\",\n",
" padding=True,\n",
" truncation=True,\n",
" max_length=128\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6749e737",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"transformers.tokenization_utils_base.BatchEncoding"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(inputs)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "15c53ac7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([2, 6])\n",
"torch.Size([2, 6])\n",
"torch.Size([2, 6])\n"
]
}
],
"source": [
"print(inputs['input_ids'].shape) # torch.Size([batch_size, seq_len])\n",
"print(inputs['attention_mask'].shape) # torch.Size([batch_size, seq_len])\n",
"print(inputs['token_type_ids'].shape) # torch.Size([batch_size, seq_len])"
]
},
{
"cell_type": "markdown",
"id": "a132bb7a",
"metadata": {},
"source": [
"### padding when two sentences have different len"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "939aee8a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([ 101, 1037, 6659, 3185, 102, 0])\n",
"['[CLS]', 'a', 'terrible', 'movie', '[SEP]', '[PAD]']\n"
]
}
],
"source": [
"print(inputs['input_ids'][1]) # Token IDs\n",
"print(tokenizer.convert_ids_to_tokens(inputs['input_ids'][1])) # Tokens"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "b3e54773",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([2, 6, 768])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = AutoModel.from_pretrained('bert-base-uncased') # Load model for embeddings\n",
"model.eval() # Set to evaluation mode (no training)\n",
"\n",
"with torch.no_grad():\n",
" outputs = model(**inputs)\n",
"\n",
"outputs.last_hidden_state.shape"
]
},
{
"cell_type": "markdown",
"id": "bceda8fe",
"metadata": {},
"source": [
"### Sentences and 3D dimension. Assume\n",
"- 3 sentences, \n",
"- each sentence has 2 words, \n",
"- each word has 5 features, \n",
"\n",
"![shapes](https://www.tensorflow.org/static/guide/images/tensor/3-axis_front.png)\n",
"\n",
"#### What is dimension of sentence embeddings?\n",
"- (3,5)\n",
"\n",
"`nn.mean(data, dim=1)`"
]
},
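{
"cell_type": "markdown",
"id": "3f0a1b2c",
"metadata": {},
"source": [
"A quick illustrative sketch (added here, not part of the original lecture code): check the pooling shape on a toy random tensor with the sizes assumed above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d8e9f0a",
"metadata": {},
"outputs": [],
"source": [
"# Toy tensor: 3 sentences, 2 words per sentence, 5 features per word\n",
"data = torch.randn(3, 2, 5)\n",
"\n",
"# Average over the word axis (dim=1) to get one vector per sentence\n",
"sentence_embeddings = torch.mean(data, dim=1)\n",
"sentence_embeddings.shape # expected: torch.Size([3, 5])"
]
},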
{
"cell_type": "markdown",
"id": "20e1cf20",
"metadata": {},
"source": [
"### Sentence embeddings is the average of word embeddings"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "a6eac3e0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[ 0.1656, -0.2764, -0.0298, ..., 0.0087, -0.0636, 0.2763],\n",
" [ 0.1329, 0.0747, -0.2481, ..., -0.2341, 0.2315, -0.1357]])"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.mean(outputs.last_hidden_state, dim=1)"
]
},
{
"cell_type": "markdown",
"id": "bb4e57b5",
"metadata": {},
"source": [
"### (Optional) What is the potential issue of use the average of word embeddings for sentence embeddings\n",
"\n",
"The mean includes padding tokens (where attention_mask=0), which can dilute the embedding quality. BERTs padding tokens produce non-informative embeddings, and averaging them may introduce noise, especially for short reviews with many padding tokens."
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "3ae40e94",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[1, 1, 1, 1, 1, 1],\n",
" [1, 1, 1, 1, 1, 0]])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Masked mean-pooling\n",
"attention_mask = inputs['attention_mask'] # (batch_size, seq_len)\n",
"attention_mask"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "24ac0d4f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[[1, 1, 1, ..., 1, 1, 1],\n",
" [1, 1, 1, ..., 1, 1, 1],\n",
" [1, 1, 1, ..., 1, 1, 1],\n",
" [1, 1, 1, ..., 1, 1, 1],\n",
" [1, 1, 1, ..., 1, 1, 1],\n",
" [1, 1, 1, ..., 1, 1, 1]],\n",
"\n",
" [[1, 1, 1, ..., 1, 1, 1],\n",
" [1, 1, 1, ..., 1, 1, 1],\n",
" [1, 1, 1, ..., 1, 1, 1],\n",
" [1, 1, 1, ..., 1, 1, 1],\n",
" [1, 1, 1, ..., 1, 1, 1],\n",
" [0, 0, 0, ..., 0, 0, 0]]])"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mask = attention_mask.unsqueeze(-1).expand_as(outputs.last_hidden_state) # (batch_size, seq_len, hidden_dim)\n",
"mask"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "97e4b4cb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[[-3.8348e-02, 9.5097e-02, 1.4332e-02, ..., -1.7143e-01,\n",
" 1.2736e-01, 3.7117e-01],\n",
" [-3.7472e-01, -6.2022e-01, 1.2133e-01, ..., -2.7666e-02,\n",
" 1.5813e-01, 1.7997e-01],\n",
" [ 7.1591e-01, -1.9231e-01, 1.5049e-01, ..., -4.0711e-01,\n",
" 1.9909e-01, 2.7043e-01],\n",
" [-3.6584e-01, -3.0518e-01, 5.0851e-04, ..., 1.1478e-01,\n",
" -2.0296e-01, 9.8816e-01],\n",
" [ 4.8723e-02, -7.2430e-01, -1.8481e-01, ..., 3.9914e-01,\n",
" 9.7036e-02, 4.0537e-02],\n",
" [ 1.0081e+00, 8.8626e-02, -2.8047e-01, ..., 1.4469e-01,\n",
" -7.6039e-01, -1.9232e-01]],\n",
"\n",
" [[-1.0380e-01, 4.6764e-03, -1.2088e-01, ..., -2.1156e-01,\n",
" 2.9962e-01, -1.0300e-02],\n",
" [-1.1521e-01, 2.1597e-01, -4.0657e-01, ..., -5.8376e-01,\n",
" 8.9380e-01, 4.3011e-01],\n",
" [ 4.4965e-01, 2.5421e-01, 2.4422e-02, ..., -3.6552e-01,\n",
" 2.4427e-01, -6.5578e-01],\n",
" [ 6.2745e-02, 6.8042e-02, -9.1592e-01, ..., -2.1580e-01,\n",
" -1.1718e-02, -6.0144e-01],\n",
" [ 6.7927e-01, 2.1335e-01, -3.9926e-01, ..., 8.9958e-03,\n",
" -5.5664e-01, -1.6044e-01],\n",
" [-0.0000e+00, -0.0000e+00, 0.0000e+00, ..., -0.0000e+00,\n",
" 0.0000e+00, 0.0000e+00]]])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"masked_embeddings = outputs.last_hidden_state * mask\n",
"masked_embeddings"
]
},
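{
"cell_type": "markdown",
"id": "5c6d7e8f",
"metadata": {},
"source": [
"A possible completion (a sketch added here; the original notebook leaves the next cell empty): sum the non-padding embeddings and divide by the number of real tokens, giving padding-aware sentence embeddings."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a0b1c2d",
"metadata": {},
"outputs": [],
"source": [
"# Masked mean-pooling: only real tokens contribute to the average\n",
"sum_embeddings = masked_embeddings.sum(dim=1) # (batch_size, hidden_dim)\n",
"token_counts = mask.sum(dim=1).clamp(min=1) # (batch_size, hidden_dim), avoid division by zero\n",
"\n",
"sentence_embeddings = sum_embeddings / token_counts\n",
"sentence_embeddings.shape # expected: torch.Size([2, 768])"
]
},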
{
"cell_type": "code",
"execution_count": null,
"id": "a699205c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 1,
"id": "18cc9c99",
"metadata": {},
"outputs": [],
@@ -24,7 +24,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 2,
"id": "d0b0e4d3",
"metadata": {},
"outputs": [
@@ -98,7 +98,7 @@
"4 5 Hated The Matrix; terrible pacing and a story ... negative"
]
},
"execution_count": 31,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -111,7 +111,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 3,
"id": "e9c58e58",
"metadata": {},
"outputs": [],
@@ -124,7 +124,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 4,
"id": "36733cc8",
"metadata": {},
"outputs": [
@@ -174,7 +174,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 5,
"id": "068f7cc3",
"metadata": {},
"outputs": [
@@ -215,7 +215,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 6,
"id": "33f8d62c",
"metadata": {},
"outputs": [
@@ -225,7 +225,7 @@
"torch.Size([19, 768])"
]
},
"execution_count": 35,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -236,7 +236,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 7,
"id": "7a5d1681",
"metadata": {},
"outputs": [],
@@ -275,28 +275,44 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad411bb3",
"execution_count": 8,
"id": "4dea9168",
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mValueError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[37]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Batch all phrases together\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m inputs = \u001b[43mtokenizer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[43m \u001b[49m\u001b[43mdf_reviews\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mphrase\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtolist\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# all texts at once\u001b[39;49;00m\n\u001b[32m 4\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mpt\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 5\u001b[39m \u001b[43m \u001b[49m\u001b[43mpadding\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43mtruncation\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m128\u001b[39;49m\n\u001b[32m 8\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m 10\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m torch.no_grad():\n\u001b[32m 11\u001b[39m outputs = model(**inputs)\n",
"\u001b[36mFile \u001b[39m\u001b[32mi:\\conda_envs\\reinforcement\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:2887\u001b[39m, in \u001b[36mPreTrainedTokenizerBase.__call__\u001b[39m\u001b[34m(self, text, text_pair, text_target, text_pair_target, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[39m\n\u001b[32m 2885\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._in_target_context_manager:\n\u001b[32m 2886\u001b[39m \u001b[38;5;28mself\u001b[39m._switch_to_input_mode()\n\u001b[32m-> \u001b[39m\u001b[32m2887\u001b[39m encodings = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_one\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mall_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2888\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m text_target \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 2889\u001b[39m \u001b[38;5;28mself\u001b[39m._switch_to_target_mode()\n",
"\u001b[36mFile \u001b[39m\u001b[32mi:\\conda_envs\\reinforcement\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:2947\u001b[39m, in \u001b[36mPreTrainedTokenizerBase._call_one\u001b[39m\u001b[34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, split_special_tokens, **kwargs)\u001b[39m\n\u001b[32m 2944\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m 2946\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _is_valid_text_input(text):\n\u001b[32m-> \u001b[39m\u001b[32m2947\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 2948\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mtext input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 2949\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mor `List[List[str]]` (batch of pretokenized examples).\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 2950\u001b[39m )\n\u001b[32m 2952\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m text_pair \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _is_valid_text_input(text_pair):\n\u001b[32m 2953\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 2954\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mtext input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 2955\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mor `List[List[str]]` (batch of pretokenized examples).\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 2956\u001b[39m )\n",
"\u001b[31mValueError\u001b[39m: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples)."
]
"data": {
"text/plain": [
"transformers.tokenization_utils_base.BatchEncoding"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Batch all phrases together\n",
"inputs = tokenizer(\n",
" list(df_reviews['phrase']), # all texts at once\n",
" df_reviews['phrase'].tolist(), # all texts at once\n",
" return_tensors=\"pt\",\n",
" padding=True,\n",
" truncation=True,\n",
" max_length=128\n",
")\n",
"\n",
"type(inputs)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ad411bb3",
"metadata": {},
"outputs": [],
"source": [
"# Batch all phrases together\n",
"inputs = tokenizer(\n",
" df_reviews['phrase'].tolist(), # all texts at once\n",
" return_tensors=\"pt\",\n",
" padding=True,\n",
" truncation=True,\n",
@@ -314,9 +330,23 @@
"review_labels = torch.tensor(labels, dtype=torch.long)\n"
]
},
{
"cell_type": "markdown",
"id": "553fbfff",
"metadata": {},
"source": [
"| Component | Meaning |\n",
"| ------------------------------- | ------------------------------------------------------------------------------------ |\n",
"| `review_embeddings` | BERT-encoded sentence embeddings (shape: `(n, 768)`), used as features. |\n",
"| `review_labels` | Ground truth sentiment labels (e.g., positive/negative/neutral). |\n",
"| `df_reviews['phrase'].tolist()` | Original text phrases (so you can refer back to the raw text later). |\n",
"| `test_size=0.2` | 20% of the data will go into the **test set**, and 80% into the **train set**. |\n",
"| `random_state=42` | Ensures **reproducibility** — you'll get the same split every time you run the code. |\n"
]
},
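{
"cell_type": "markdown",
"id": "2b3c4d5e",
"metadata": {},
"source": [
"A hedged sketch of the split the table above describes (the actual call is not shown in this diff; the left-hand-side variable names and the use of scikit-learn's `train_test_split` are assumptions)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f7a8b9c",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Split embeddings, labels, and raw phrases together so they stay aligned\n",
"X_train, X_test, y_train, y_test, phrases_train, phrases_test = train_test_split(\n",
" review_embeddings, # features: (n, 768) BERT sentence embeddings\n",
" review_labels, # targets: sentiment label ids\n",
" df_reviews['phrase'].tolist(), # raw text, kept for error inspection\n",
" test_size=0.2, # 20% test, 80% train\n",
" random_state=42 # reproducible split\n",
")"
]
},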
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"id": "cfa993e5",
"metadata": {},
"outputs": [
@@ -324,21 +354,21 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1, Loss: 1.1128\n",
"Epoch 2, Loss: 1.0926\n",
"Epoch 3, Loss: 1.0726\n",
"Epoch 4, Loss: 1.0530\n",
"Epoch 5, Loss: 1.0337\n",
"Epoch 6, Loss: 1.0149\n",
"Epoch 7, Loss: 0.9966\n",
"Epoch 8, Loss: 0.9793\n",
"Epoch 9, Loss: 0.9629\n",
"Epoch 10, Loss: 0.9476\n",
"Epoch 1, Loss: 1.1348\n",
"Epoch 2, Loss: 1.1101\n",
"Epoch 3, Loss: 1.0867\n",
"Epoch 4, Loss: 1.0647\n",
"Epoch 5, Loss: 1.0440\n",
"Epoch 6, Loss: 1.0245\n",
"Epoch 7, Loss: 1.0061\n",
"Epoch 8, Loss: 0.9887\n",
"Epoch 9, Loss: 0.9722\n",
"Epoch 10, Loss: 0.9566\n",
"\n",
"Sentiment Prediction Results (Test Set):\n",
"ID | Review Text | Actual | Predicted\n",
"---|-----------------------------------------|-----------|----------\n",
"5 | Watched The Matrix, its fine, nothing special. #cinema | neutral | positive\n",
"5 | Watched The Matrix, its fine, nothing special. #cinema | neutral | negative\n",
"13 | The Matrix is awesome, iconic and thrilling! #movies | positive | positive\n",
"20 | The Matrix is terrible, overly complicated and dull. #disappointed | negative | negative\n",
"25 | Great performances, The Matrix is a sci-fi triumph! #scifi | positive | positive\n",
@@ -395,11 +425,20 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c1d50bc",
"cell_type": "markdown",
"id": "d048fe1d",
"metadata": {},
"source": [
"### Your work\n",
"- Calculate Accuray\n",
"- F1 scores\n",
" "
]
},
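{
"cell_type": "markdown",
"id": "0d1e2f3a",
"metadata": {},
"source": [
"A sketch for the exercise above (assumed variable names: `y_test` for the actual labels and `predicted` for the model's test-set predictions; substitute whatever names your notebook uses)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b5c6d7e",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score, f1_score\n",
"\n",
"# y_test: ground-truth label ids, predicted: predicted label ids (assumed names)\n",
"accuracy = accuracy_score(y_test, predicted)\n",
"macro_f1 = f1_score(y_test, predicted, average='macro') # macro-average over the three sentiment classes\n",
"\n",
"print(f\"Accuracy: {accuracy:.3f}\")\n",
"print(f\"Macro F1: {macro_f1:.3f}\")"
]
},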
{
"cell_type": "markdown",
"id": "9f6257f6",
"metadata": {},
"outputs": [],
"source": []
}
],

Binary file not shown.

File diff suppressed because one or more lines are too long