Llama 3 KV Cache (#685)

* Llama 3 KV Cache * Skip expensive tests on GitHub Actions * Update __init__.py
2026-04-10 12:33:42 +00:00 · 2025-06-21 10:55:20 -05:00
parent c008f95072
commit 3be0f3202a
7 changed files with 410 additions and 4 deletions
--- a/pkg/llms_from_scratch/kv_cache/generate.py
+++ b/pkg/llms_from_scratch/kv_cache/generate.py
@@ -0,0 +1,29 @@
+# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
+# Source for "Build a Large Language Model From Scratch"
+#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
+# Code: https://github.com/rasbt/LLMs-from-scratch
+
+import torch
+
+
+def generate_text_simple(model, idx, max_new_tokens, context_size=None, use_cache=True):
+    """Greedily append ``max_new_tokens`` token IDs to ``idx``.
+
+    The model is put into eval mode and every forward pass runs under
+    ``torch.no_grad()``. Each new token is the argmax over the logits at the
+    last position of the model output.
+
+    Args:
+        model: Callable as ``model(tokens, use_cache=...)`` returning logits
+            that are indexed as ``logits[:, -1]``. Must provide ``eval()``.
+            ``reset_kv_cache()`` is called when ``use_cache`` is true, and
+            ``cfg["context_length"]`` is read when ``context_size`` is falsy.
+        idx: Token-ID tensor; it is sliced as ``idx[:, -ctx_len:]`` and
+            extended along ``dim=1`` (presumably shaped ``(batch, seq_len)``
+            -- confirm against callers).
+        max_new_tokens: Number of tokens to generate. For values <= 0 the
+            model is not called and ``idx`` is returned unchanged.
+        context_size: Maximum number of trailing tokens fed to the model in
+            one pass. ``None`` (or any other falsy value, including ``0``)
+            falls back to ``model.cfg["context_length"]``.
+        use_cache: If true, the cache is reset, the cropped prompt is fed
+            once, and afterwards only the newly generated token is fed per
+            step. If false, the cropped running sequence is re-fed in full
+            on every step.
+
+    Returns:
+        ``idx`` with the generated token IDs concatenated along ``dim=1``.
+    """
+    model.eval()
+
+    # `or` (not `is None`) is kept on purpose: a falsy context_size has
+    # always meant "use the model's configured context length".
+    ctx_len = context_size or model.cfg["context_length"]
+
+    with torch.no_grad():
+        if use_cache:
+            model.reset_kv_cache()
+
+            # The first pass consumes the cropped prompt; every later pass
+            # consumes only the token chosen in the previous step.
+            # NOTE(review): only the prompt is cropped to ctx_len here; what
+            # happens once prompt + generated tokens exceed the model's
+            # limit depends on the model's cache handling -- confirm.
+            model_input = idx[:, -ctx_len:]
+
+            for _ in range(max_new_tokens):
+                # Running the model at the top of the loop means exactly one
+                # forward pass per generated token: the last generated token
+                # is never fed back, because its logits would go unused.
+                logits = model(model_input, use_cache=True)
+                next_idx = logits[:, -1].argmax(dim=-1, keepdim=True)
+                idx = torch.cat([idx, next_idx], dim=1)
+                model_input = next_idx
+        else:
+            for _ in range(max_new_tokens):
+                # No cache: re-feed the last ctx_len tokens every step.
+                logits = model(idx[:, -ctx_len:], use_cache=False)
+                next_idx = logits[:, -1].argmax(dim=-1, keepdim=True)
+                idx = torch.cat([idx, next_idx], dim=1)
+
+    return idx