{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "EJxYP1AD8fa3",
"outputId": "52fb707e-4876-46cc-afc9-5cfd10ee89e3"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: langchain in /usr/local/lib/python3.11/dist-packages (0.3.21)\n",
"Requirement already satisfied: langchain-core in /usr/local/lib/python3.11/dist-packages (0.3.47)\n",
"Requirement already satisfied: langchain-huggingface in /usr/local/lib/python3.11/dist-packages (0.1.2)\n",
"Requirement already satisfied: langchain_community in /usr/local/lib/python3.11/dist-packages (0.3.20)\n",
"Requirement already satisfied: faiss-cpu in /usr/local/lib/python3.11/dist-packages (1.10.0)\n",
"Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.11/dist-packages (0.29.3)\n",
"Requirement already satisfied: langchain-text-splitters<1.0.0,>=0.3.7 in /usr/local/lib/python3.11/dist-packages (from langchain) (0.3.7)\n",
"Requirement already satisfied: langsmith<0.4,>=0.1.17 in /usr/local/lib/python3.11/dist-packages (from langchain) (0.3.15)\n",
"Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /usr/local/lib/python3.11/dist-packages (from langchain) (2.10.6)\n",
"Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.11/dist-packages (from langchain) (2.0.39)\n",
"Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.11/dist-packages (from langchain) (2.32.3)\n",
"Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.11/dist-packages (from langchain) (6.0.2)\n",
"Requirement already satisfied: tenacity!=8.4.0,<10.0.0,>=8.1.0 in /usr/local/lib/python3.11/dist-packages (from langchain-core) (9.0.0)\n",
"Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.11/dist-packages (from langchain-core) (1.33)\n",
"Requirement already satisfied: packaging<25,>=23.2 in /usr/local/lib/python3.11/dist-packages (from langchain-core) (24.2)\n",
"Requirement already satisfied: typing-extensions>=4.7 in /usr/local/lib/python3.11/dist-packages (from langchain-core) (4.12.2)\n",
"Requirement already satisfied: sentence-transformers>=2.6.0 in /usr/local/lib/python3.11/dist-packages (from langchain-huggingface) (3.4.1)\n",
"Requirement already satisfied: tokenizers>=0.19.1 in /usr/local/lib/python3.11/dist-packages (from langchain-huggingface) (0.21.1)\n",
"Requirement already satisfied: transformers>=4.39.0 in /usr/local/lib/python3.11/dist-packages (from langchain-huggingface) (4.49.0)\n",
"Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.11/dist-packages (from langchain_community) (3.11.14)\n",
"Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /usr/local/lib/python3.11/dist-packages (from langchain_community) (0.6.7)\n",
"Requirement already satisfied: pydantic-settings<3.0.0,>=2.4.0 in /usr/local/lib/python3.11/dist-packages (from langchain_community) (2.8.1)\n",
"Requirement already satisfied: httpx-sse<1.0.0,>=0.4.0 in /usr/local/lib/python3.11/dist-packages (from langchain_community) (0.4.0)\n",
"Requirement already satisfied: numpy<3,>=1.26.2 in /usr/local/lib/python3.11/dist-packages (from langchain_community) (2.0.2)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from huggingface_hub) (3.18.0)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface_hub) (2025.3.0)\n",
"Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.11/dist-packages (from huggingface_hub) (4.67.1)\n",
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (2.6.1)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.3.2)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (25.3.0)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.5.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (6.2.0)\n",
"Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (0.3.0)\n",
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.18.3)\n",
"Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /usr/local/lib/python3.11/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain_community) (3.26.1)\n",
"Requirement already satisfied: typing-inspect<1,>=0.4.0 in /usr/local/lib/python3.11/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain_community) (0.9.0)\n",
"Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.11/dist-packages (from jsonpatch<2.0,>=1.33->langchain-core) (3.0.0)\n",
"Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from langsmith<0.4,>=0.1.17->langchain) (0.28.1)\n",
"Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /usr/local/lib/python3.11/dist-packages (from langsmith<0.4,>=0.1.17->langchain) (3.10.15)\n",
"Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /usr/local/lib/python3.11/dist-packages (from langsmith<0.4,>=0.1.17->langchain) (1.0.0)\n",
"Requirement already satisfied: zstandard<0.24.0,>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from langsmith<0.4,>=0.1.17->langchain) (0.23.0)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.27.2 in /usr/local/lib/python3.11/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain) (2.27.2)\n",
"Requirement already satisfied: python-dotenv>=0.21.0 in /usr/local/lib/python3.11/dist-packages (from pydantic-settings<3.0.0,>=2.4.0->langchain_community) (1.0.1)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2->langchain) (3.4.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2->langchain) (3.10)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2->langchain) (2.3.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2->langchain) (2025.1.31)\n",
"Requirement already satisfied: torch>=1.11.0 in /usr/local/lib/python3.11/dist-packages (from sentence-transformers>=2.6.0->langchain-huggingface) (2.6.0+cu124)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.11/dist-packages (from sentence-transformers>=2.6.0->langchain-huggingface) (1.6.1)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (from sentence-transformers>=2.6.0->langchain-huggingface) (1.14.1)\n",
"Requirement already satisfied: Pillow in /usr/local/lib/python3.11/dist-packages (from sentence-transformers>=2.6.0->langchain-huggingface) (11.1.0)\n",
"Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.11/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.1.1)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers>=4.39.0->langchain-huggingface) (2024.11.6)\n",
"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.11/dist-packages (from transformers>=4.39.0->langchain-huggingface) (0.5.3)\n",
"Requirement already satisfied: anyio in /usr/local/lib/python3.11/dist-packages (from httpx<1,>=0.23.0->langsmith<0.4,>=0.1.17->langchain) (4.9.0)\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx<1,>=0.23.0->langsmith<0.4,>=0.1.17->langchain) (1.0.7)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.4,>=0.1.17->langchain) (0.14.0)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (3.4.2)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (3.1.6)\n",
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (12.4.127)\n",
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (12.4.127)\n",
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (12.4.127)\n",
"Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (9.1.0.70)\n",
"Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (12.4.5.8)\n",
"Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (11.2.1.3)\n",
"Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (10.3.5.147)\n",
"Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (11.6.1.9)\n",
"Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (12.3.1.170)\n",
"Requirement already satisfied: nvidia-cusparselt-cu12==0.6.2 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (0.6.2)\n",
"Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (2.21.5)\n",
"Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (12.4.127)\n",
"Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (12.4.127)\n",
"Requirement already satisfied: triton==3.2.0 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (3.2.0)\n",
"Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (1.13.1)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (1.3.0)\n",
"Requirement already satisfied: mypy-extensions>=0.3.0 in /usr/local/lib/python3.11/dist-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain_community) (1.0.0)\n",
"Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn->sentence-transformers>=2.6.0->langchain-huggingface) (1.4.2)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn->sentence-transformers>=2.6.0->langchain-huggingface) (3.6.0)\n",
"Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.11/dist-packages (from anyio->httpx<1,>=0.23.0->langsmith<0.4,>=0.1.17->langchain) (1.3.1)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch>=1.11.0->sentence-transformers>=2.6.0->langchain-huggingface) (3.0.2)\n"
]
}
],
"source": [
"# Step 1: Set Up the Environment\n",
"# Install required libraries for LangChain, vector stores, and Hugging Face integration\n",
"!pip install -U langchain langchain-core langchain-huggingface langchain_community faiss-cpu huggingface_hub\n",
"\n",
"# Import necessary modules\n",
"from langchain_huggingface import HuggingFaceEndpoint\n",
"from langchain.chains import RetrievalQA\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"import textwrap\n",
"import requests # Used implicitly in HuggingFaceCustomEmbeddings\n",
"from langchain_community.vectorstores import FAISS\n",
"from langchain.embeddings.base import Embeddings\n",
"from typing import List\n",
"from langchain.embeddings import HuggingFaceEmbeddings"
]
},
{
"cell_type": "code",
"source": [
"# Step 2: Define the Scenario\n",
"# This section provides a futuristic cyberpunk case study about a ransomware attack investigation.\n",
"# The document serves as a knowledge base for answering questions about cybercrime and forensics.\n",
"\n",
"# Possible Questions the Paragraph Can Answer:\n",
"# 1. What type of cyberattack did Detective Y investigate?\n",
" # (Expected Response: Ransomware attack)\n",
"# 2. What was the victim's profession?\n",
" # (Expected Response: Robotics engineer)\n",
"# 3. Where was the remote server located that ultimately led to the perpetrator's arrest?\n",
" # (Expected Response: Abandoned industrial sector of city X)\n",
"\n",
"scenario_text = \"\"\"\n",
"The neon lights of X shimmered, reflecting off the sleek\n",
"cybernetic implants of its citizens. Detective Y, however,\n",
"saw little of the city's beauty as he hunched over a\n",
"holographic display, a frown etched on his face. He was\n",
"facing a digital enigma: a ransomware attack unlike any\n",
"he'd encountered before. The victim, a renowned robotics\n",
"engineer named Z, reported that all his research data,\n",
"years of work on a groundbreaking AI-powered prosthetic\n",
"limb, had been encrypted. The perpetrator, a shadowy\n",
"entity calling themselves The Serpent, demanded an\n",
"exorbitant ransom in untraceable cryptocurrency. Y, a\n",
"veteran of the Cyber Crimes Division, knew that time was\n",
"of the essence. Z's research was not only invaluable\n",
"scientifically but also held the potential to revolutionize\n",
"prosthetics for millions. But the initial investigation\n",
"yielded little. The Serpent had left no digital footprints,\n",
"employing advanced encryption and anonymization techniques\n",
"to mask their identity and location. Y, however, was not\n",
"one to be easily deterred. He understood the power of\n",
"expanding the knowledge base. He requested and received\n",
"access to Z's entire digital life his personal computers,\n",
"lab servers, cloud storage, even his smart home devices.\n",
"Y's team, equipped with cutting-edge forensic tools, began\n",
"their meticulous analysis. They reconstructed deleted\n",
"files, analyzed network traffic logs, and even delved into\n",
"the firmware of Z's smart appliances, searching for any\n",
"hidden data or unusual connections. They expanded their\n",
"search beyond Z's immediate digital sphere, examining\n",
"online forums, academic databases, and even dark web\n",
"marketplaces for any mention of the stolen research or\n",
"clues about The Serpent's identity. As the team dug deeper,\n",
"they discovered a seemingly unrelated incident: a minor\n",
"security breach at a local university's robotics lab a few\n",
"weeks prior. The breach, initially dismissed as a student\n",
"prank, involved the theft of a small, experimental AI\n",
"algorithm. Y's intuition flared. Could this be connected\n",
"to The Serpent's attack? Further investigation revealed a\n",
"startling connection. The stolen algorithm, while seemingly\n",
"insignificant on its own, was a crucial component in Z's\n",
"research. The Serpent, it seemed, had planned their attack\n",
"meticulously, acquiring the necessary tools before launching\n",
"their ransomware scheme. With this expanded knowledge base,\n",
"Y's team was able to trace The Serpent's digital trail.\n",
"They uncovered a hidden connection to a remote server\n",
"located in the abandoned industrial sector of X. A raid\n",
"on the location led to the arrest of a disgruntled former\n",
"student of Z's, seeking revenge for a perceived academic\n",
"slight. The case of The Serpent highlighted the crucial\n",
"role of expanding the knowledge base in digital forensics.\n",
"By connecting seemingly disparate pieces of information\n",
"and exploring every digital avenue, Y and his team were\n",
"able to bring a cybercriminal to justice and safeguard\n",
"groundbreaking research that held the promise of a better\n",
"future.\n",
"\"\"\"\n",
"\n",
"# Display the total character count of the scenario\n",
"print(\"Total characters in the scenario:\", len(scenario_text))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gh4GjZLYm8G_",
"outputId": "81719618-9cbd-4250-fbdb-d8059d97ce74"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Total characters in the scenario: 3011\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Step 3: Split the Text into Chunks\n",
"# Use RecursiveCharacterTextSplitter to divide the text into manageable chunks for processing\n",
"chunks = RecursiveCharacterTextSplitter(\n",
" chunk_size=400, # Maximum size of each chunk in characters\n",
" chunk_overlap=50, # Overlap between chunks to preserve context\n",
" separators=[\"\\n\\n\", \"\\n\", \". \", \" \", \"\"], # Order of separators to split the text\n",
" length_function=len # Use character count to measure chunk size\n",
").split_text(scenario_text)\n",
"\n",
"# Display chunk statistics\n",
"print(f\"Total number of chunks: {len(chunks)}\\n\")\n",
"print(f\"Length of first chunk: {len(chunks[0])}\\nContent of first chunk: {chunks[0]}\\n\")\n",
"print(f\"Length of second chunk: {len(chunks[1])}\\nContent of second chunk: {chunks[1]}\\n\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tPVDSiTGt1pw",
"outputId": "581b7c64-f483-42ef-a423-273f3ec33d76"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Total number of chunks: 9\n",
"\n",
"Length of first chunk: 392\n",
"Content of first chunk: The neon lights of X shimmered, reflecting off the sleek\n",
"cybernetic implants of its citizens. Detective Y, however,\n",
"saw little of the city's beauty as he hunched over a\n",
"holographic display, a frown etched on his face. He was\n",
"facing a digital enigma: a ransomware attack unlike any\n",
"he'd encountered before. The victim, a renowned robotics\n",
"engineer named Z, reported that all his research data,\n",
"\n",
"Length of second chunk: 383\n",
"Content of second chunk: years of work on a groundbreaking AI-powered prosthetic\n",
"limb, had been encrypted. The perpetrator, a shadowy\n",
"entity calling themselves The Serpent, demanded an\n",
"exorbitant ransom in untraceable cryptocurrency. Y, a\n",
"veteran of the Cyber Crimes Division, knew that time was\n",
"of the essence. Z's research was not only invaluable\n",
"scientifically but also held the potential to revolutionize\n",
"\n"
]
}
]
},
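{
"cell_type": "markdown",
"source": [
"As a quick sanity check, the next cell is a small sketch over the `chunks` list from the step above: it prints each chunk's length and opening characters. Every chunk should stay at or below the 400-character `chunk_size`, with consecutive chunks sharing up to 50 characters of overlap."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: list every chunk's length and a short preview to make the\n",
"# chunk_size and chunk_overlap settings concrete.\n",
"for i, chunk in enumerate(chunks):\n",
"    print(f\"Chunk {i}: {len(chunk)} chars | starts with: {chunk[:40]!r}\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},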
{
"cell_type": "code",
"source": [
"# Step 4: Define Custom Embeddings Class\n",
"class HuggingFaceCustomEmbeddings(Embeddings):\n",
" \"\"\"Custom embeddings class using Hugging Face Inference API.\"\"\"\n",
"\n",
" def __init__(self, api_url: str, api_token: str):\n",
" \"\"\"Initialize with API URL and token.\"\"\"\n",
" self.api_url = api_url\n",
" self.api_token = api_token\n",
"\n",
" def embed_documents(self, texts: List[str]) -> List[List[float]]:\n",
" \"\"\"Generate embeddings for multiple text documents.\"\"\"\n",
" headers = {\"Authorization\": f\"Bearer {self.api_token}\"}\n",
" payload = {\"inputs\": texts, \"options\": {\"wait_for_model\": True}}\n",
" response = requests.post(self.api_url, headers=headers, json=payload)\n",
"\n",
" if response.status_code == 200:\n",
" return response.json()\n",
" else:\n",
" print(f\"Request failed with status code {response.status_code}\")\n",
" print(f\"Error: {response.text}\")\n",
" return []\n",
"\n",
" def embed_query(self, text: str) -> List[float]:\n",
" \"\"\"Generate embedding for a single query text.\"\"\"\n",
" return self.embed_documents([text])[0]"
],
"metadata": {
"id": "T9f3IStmkmvV"
},
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Step 5: Set Up Embeddings and Vector Store\n",
"# Define Hugging Face Model URL and access token (e.g., API Token)\n",
"HF_API_URL = \"https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/paraphrase-MiniLM-L6-v2\"\n",
"# apply for your Huggin Face access token here https://huggingface.co/docs/hub/en/security-tokens\n",
"# Your access token should look like this\n",
"api_token = \"hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"\n",
"\n",
"# Create embedding function instance\n",
"embedding_function = HuggingFaceCustomEmbeddings(api_url=HF_API_URL, api_token=api_token)\n",
"\n",
"# Generate embeddings for the text chunks\n",
"embeddings = embedding_function.embed_documents(chunks)\n",
"print(f\"Size of embeddings for first chunk: {len(embeddings[0])}\")\n",
"print(f\"First 3 values of first embedding: {embeddings[0][0:3]}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LUQzBnBtk7m6",
"outputId": "2ed9210e-3dce-413d-eaf4-a3628d887ed9"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Size of embeddings for first chunk: 384\n",
"First 3 values of first embedding: [-0.48097720742225647, 0.3812538683414459, -0.2985522449016571]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Step 6: Set and Test the Vector Store\n",
"# Create FAISS vector store from text and embeddings\n",
"vectorstore = FAISS.from_embeddings(\n",
" text_embeddings=list(zip(chunks, embeddings)),\n",
" embedding=embedding_function # Embedding function for queries\n",
")\n",
"\n",
"# Perform a similarity search on the vector store with a sample query\n",
"query = \"What type of cyberattack did Detective Y investigate?\"\n",
"results = vectorstore.similarity_search(query, k=1) # Return top 1 result\n",
"print(f\"Most similar chunk:\\n{results[0].page_content}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5dWXG_xxP383",
"outputId": "d8d5651e-b236-4adf-92a3-e57a45f70fab"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Most similar chunk:\n",
"The neon lights of X shimmered, reflecting off the sleek\n",
"cybernetic implants of its citizens. Detective Y, however,\n",
"saw little of the city's beauty as he hunched over a\n",
"holographic display, a frown etched on his face. He was\n",
"facing a digital enigma: a ransomware attack unlike any\n",
"he'd encountered before. The victim, a renowned robotics\n",
"engineer named Z, reported that all his research data,\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"### Explaination:\n",
"When you search (vectorstore.similarity_search), the query text is embedded using embedding_function.embed_query, and FAISS finds the closest stored embeddings.\n",
"\n",
"* embed_query: For embedding a single text (used during search).\n",
"* return the top 1 (k=1) most similar result.\n",
"\n",
"Retrieval alone (via FAISS) returns raw chunks, which may be fragmented or verbose, potentially including multiple sentences unrelated to the querys focus. This is because Retrieval alone finds similar text only based on\n",
"embeddings,\n",
"* it doesnt “understand” the question or rephrase the answer\n",
"\n",
"### Solution: Use LLM\n",
"\n",
"* LLMs excel at natural language comprehension and generation\n",
"\n",
"\n",
"### Role of LLM:\n",
"\n",
"The LLM interprets the query and retrieved chunks, extracting key details (e.g., \"ransomware attack\") even if the raw chunks dont directly phrase it as \"type of cyberattack.\" This showcases the LLMs ability to comprehend and rephrase naturally, beyond raw similarity matching."
],
"metadata": {
"id": "KUjViHt1QSzI"
}
},
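{
"cell_type": "markdown",
"source": [
"To see this limitation directly, the sketch below reuses `vectorstore` and `query` from the cells above and calls `similarity_search_with_score`, which also returns each chunk's FAISS distance (lower means more similar). The hits are verbatim story fragments, not phrased answers."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: inspect raw retrieval results and their FAISS distances.\n",
"# No LLM is involved yet, so the output is unrephrased chunks.\n",
"for doc, score in vectorstore.similarity_search_with_score(query, k=2):\n",
"    print(f\"Distance: {score:.4f}\")\n",
"    print(f\"Chunk: {doc.page_content[:120]}...\\n\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},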
{
"cell_type": "code",
"source": [
"# Step 7: Import Libraries and Suppress Warnings\n",
"# Import required modules and suppress specific FutureWarning from Hugging Face Hub\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\", category=FutureWarning, module=\"huggingface_hub.utils._deprecation\")\n",
"from langchain_huggingface import HuggingFaceEndpoint\n",
"from langchain.chains import RetrievalQA\n",
"import textwrap\n",
"\n",
"# Step 8: Initialize the Language Model\n",
"# Set up the Hugging Face LLM endpoint with specified parameters\n",
"llm = HuggingFaceEndpoint(\n",
" repo_id=\"mistralai/Mistral-7B-Instruct-v0.1\", # Model repository ID\n",
" huggingfacehub_api_token=api_token, # API token for authentication (assumed defined earlier)\n",
" temperature=0.1, # Low temperature for deterministic output\n",
" task=\"text-generation\" # Task type for the model\n",
")\n",
"\n",
"# Step 9: Create the Retrieval-Augmented Generation (RAG) Chain\n",
"# Build a RetrievalQA chain combining vector retrieval and LLM generation\n",
"qa_chain = RetrievalQA.from_chain_type(\n",
" llm=llm, # Language model for interpreting and generating responses\n",
" retriever=vectorstore.as_retriever( # Convert vectorstore to retriever (assumed defined earlier)\n",
" search_type=\"similarity\", # Use similarity-based search for retrieval\n",
" search_kwargs={\"k\": 4} # Retrieve top 4 relevant chunks based on embeddings\n",
" ),\n",
" chain_type=\"stuff\" # Combine retrieved documents into a single prompt for LLM\n",
")"
],
"metadata": {
"id": "_A-pSsI1RAIx"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Step 10: Execute Query and Display Response\n",
"# Run a query through the RAG chain and format the response\n",
"query = \"What type of cyberattack did Detective Y investigate?\"\n",
"response = qa_chain.invoke(query) # Get response combining retrieval and LLM processing\n",
"wrapped_response = textwrap.fill(response[\"result\"], width=160) # Wrap text for clean output\n",
"print(f\"\\n🔵 RAG's Answer: {wrapped_response}\\n\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1JMR8-HeSYCE",
"outputId": "7de6cecd-f95b-47d9-acb7-b8c891c34b59"
},
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"🔵 RAG's Answer: Detective Y investigated a ransomware attack.\n",
"\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"### Oberservation\n",
"Youd just get raw retrieved chunks, which might be fragmented or verbose (e.g., multiple sentences from your story)\n",
"\n",
"This is because Retrieval alone (via FAISS) finds similar text based on embeddings, but it doesnt “understand” the question or rephrase the answer—LLMs excel at natural language comprehension and generation.\n",
"\n",
"### Role of LLM\n",
"The LLM interprets the query (“What type of cyberattack...”) and the retrieved context, extracting the relevant detail (“ransomware attack”) even if the chunks dont explicitly say “type of cyberattack.”"
],
"metadata": {
"id": "Wb-SkfqG18hg"
}
},
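{
"cell_type": "markdown",
"source": [
"One way to see both halves of the pipeline at once is to rebuild the chain with `return_source_documents=True`, so the response dict also carries the retrieved chunks. The cell below is a minimal sketch reusing `llm` and `vectorstore` from the earlier steps; the name `qa_chain_with_sources` is just for illustration."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: a chain variant that also returns the chunks the LLM saw.\n",
"qa_chain_with_sources = RetrievalQA.from_chain_type(\n",
"    llm=llm,\n",
"    retriever=vectorstore.as_retriever(search_type=\"similarity\", search_kwargs={\"k\": 4}),\n",
"    chain_type=\"stuff\",\n",
"    return_source_documents=True # adds 'source_documents' to the response dict\n",
")\n",
"\n",
"response = qa_chain_with_sources.invoke(\"What was the victim's profession?\")\n",
"print(f\"Answer: {response['result'].strip()}\")\n",
"print(f\"Chunks retrieved: {len(response['source_documents'])}\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},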
{
"cell_type": "code",
"source": [
"import warnings\n",
"warnings.filterwarnings(\"ignore\", category=FutureWarning, module=\"huggingface_hub.utils._deprecation\")\n",
"from langchain_huggingface import HuggingFaceEndpoint\n",
"from langchain.chains import RetrievalQA\n",
"\n",
"# Define the LLM\n",
"llm = HuggingFaceEndpoint(\n",
" repo_id=\"mistralai/Mistral-7B-Instruct-v0.1\",\n",
" huggingfacehub_api_token=api_token,\n",
" temperature=0.1,\n",
" task=\"text-generation\"\n",
")\n",
"\n",
"# Create the RetrievalQA chain (assuming vectorstore is defined)\n",
"qa_chain = RetrievalQA.from_chain_type(\n",
" llm=llm,\n",
" retriever=vectorstore.as_retriever(search_type=\"similarity\", search_kwargs={\"k\": 4}),\n",
" chain_type=\"stuff\"\n",
")\n",
"\n",
"# Run the query\n",
"query = \"What type of cyberattack did Detective Y investigate?\"\n",
"response = qa_chain.invoke(query)\n",
"print(f\"\\n🔵 RAG's Answer: {response}\\n\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "IQVDOGQ5zWgl",
"outputId": "3c4163c7-d367-4986-f852-08a1eb2a24f4"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"🔵 RAG's Answer: {'query': 'What type of cyberattack did Detective Y investigate?', 'result': ' Detective Y investigated a ransomware attack.'}\n",
"\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# **TEST QUESTIONS:**\n",
"\n",
"## In-Text Questions:\n",
"\n",
"1. What type of cyberattack did Detective Y investigate? (Expected Response: Ransomware attack)\n",
"2. What was the victim's profession? (Expected Response: Robotics engineer)\n",
"3. Where was the remote server located that ultimately led to the perpetrator's arrest? (Expected Response: Abandoned industrial sector of city X)\n",
"\n",
"\n",
"## Out-of-Text Questions:\n",
"1. What specific encryption algorithm did The Serpent use to encrypt the research data? (Expected Response: The story doesn't mention the specific algorithm.)\n",
"2. What was the name of the university where the minor security breach occurred? (Expected Response: The story doesn't mention the university's name.)\n",
"3. Did Detective Y's team collaborate with any external cybersecurity experts or organizations during the investigation? (Expected Response: The story doesn't mention any external collaboration.)"
],
"metadata": {
"id": "49HZsr_4PewJ"
}
}
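,
{
"cell_type": "markdown",
"source": [
"The cell below is a minimal sketch that batch-runs all six test questions through the existing `qa_chain`. Answers to the out-of-text questions depend on the model: the LLM may state that the information is not given, or it may occasionally hallucinate, which is worth observing."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: run the in-text and out-of-text test questions through qa_chain.\n",
"test_questions = [\n",
"    \"What type of cyberattack did Detective Y investigate?\",\n",
"    \"What was the victim's profession?\",\n",
"    \"Where was the remote server located that ultimately led to the perpetrator's arrest?\",\n",
"    \"What specific encryption algorithm did The Serpent use to encrypt the research data?\",\n",
"    \"What was the name of the university where the minor security breach occurred?\",\n",
"    \"Did Detective Y's team collaborate with any external cybersecurity experts or organizations during the investigation?\",\n",
"]\n",
"\n",
"for question in test_questions:\n",
"    answer = qa_chain.invoke(question)[\"result\"].strip()\n",
"    print(f\"Q: {question}\\nA: {textwrap.fill(answer, width=160)}\\n\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
}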
]
}