mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
Make datasets and loaders compatible with multiprocessing (#118)
This commit is contained in:
committed by
GitHub
parent
9f3f231ac7
commit
dd51d4ad83
@@ -31,7 +31,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 1,
|
||||
"id": "0ed4b7db-3b47-4fd3-a4a6-5f4ed5dd166e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -43,12 +43,11 @@
|
||||
"\n",
|
||||
"class GPTDatasetV1(Dataset):\n",
|
||||
" def __init__(self, txt, tokenizer, max_length, stride):\n",
|
||||
" self.tokenizer = tokenizer\n",
|
||||
" self.input_ids = []\n",
|
||||
" self.target_ids = []\n",
|
||||
"\n",
|
||||
" # Tokenize the entire text\n",
|
||||
" token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
|
||||
" token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n",
|
||||
"\n",
|
||||
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
|
||||
" for i in range(0, len(token_ids) - max_length, stride):\n",
|
||||
@@ -65,7 +64,7 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"def create_dataloader_v1(txt, batch_size=4, max_length=256, \n",
|
||||
" stride=128, shuffle=True, drop_last=True):\n",
|
||||
" stride=128, shuffle=True, drop_last=True, num_workers=0):\n",
|
||||
" # Initialize the tokenizer\n",
|
||||
" tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
|
||||
"\n",
|
||||
@@ -74,7 +73,7 @@
|
||||
"\n",
|
||||
" # Create dataloader\n",
|
||||
" dataloader = DataLoader(\n",
|
||||
" dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)\n",
|
||||
" dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=0)\n",
|
||||
"\n",
|
||||
" return dataloader\n",
|
||||
"\n",
|
||||
@@ -99,7 +98,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 2,
|
||||
"id": "664397bc-6daa-4b88-90aa-e8fc1fbd5846",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -117,7 +116,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 3,
|
||||
"id": "d3664332-e6bb-447e-8b96-203aafde8b24",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -150,7 +149,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.10"
|
||||
"version": "3.11.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
Reference in New Issue
Block a user