diff --git a/lab04/Word_Embeddings_in_DF.ipynb b/lab04/Word_Embeddings_in_DF.ipynb new file mode 100644 index 0000000..8027d75 --- /dev/null +++ b/lab04/Word_Embeddings_in_DF.ipynb @@ -0,0 +1,369 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Lab: Word Embeddings in Digital Forensics\n", + "\n", + "**Objective**: Learn how word embeddings represent text as vectors and use them to analyze dark web forum posts in a digital forensics investigation. The goal is to identify the true alias of a cybercriminal, \"The Ghost,\" from three candidates—\"ShadowTrader,\" \"DataReaper,\" or \"CrypticVendor\"—by uncovering semantic patterns in their posts.\n", + "\n", + "**Background Story**: \n", + "It’s 2025, and you’re Analyst Quinn, a member of the Cyber Threat Task Force. A hacker known only as \"The Ghost\" has been wreaking havoc, selling stolen data—credit cards, passwords, and corporate secrets—on a dark web forum. The Task Force has intercepted three anonymous posts believed to be authored by The Ghost. Forum users mention different aliases—\"ShadowTrader\" (e.g., trading), \"DataReaper\" (e.g., harvesting), and \"CrypticVendor\" (e.g., a vendor of hidden wares)—but only one is their true identity. Your mission: use word embeddings to analyze these posts and determine which alias most likely belongs to The Ghost, cutting through the noise to reveal their real handle.\n", + "\n", + "**Dataset**: \n", + "Three dark web forum posts attributed to The Ghost: \n", + "1. *\"Selling fresh (credit) card dumps, cheap prices, fast delivery.\"* \n", + "2. *\"Got a haul of (login) credentials—banks, emails, you name it. DM for deals.\"* \n", + "3. 
*\"Hacked a corporate (database)—juicy secrets for sale, secure payments only.\"* \n", + "\n", + "**Goal**: \n", + "Analyze the posts to extract the key terms (marked in \"()\"), explore their semantic relationships, and predict The Ghost’s true alias—\"ShadowTrader,\" \"DataReaper,\" or \"CrypticVendor\"—based on their consistent language patterns." + ], + "metadata": { + "id": "WJOx7fI4tbu5" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8QmsVWtglDMy", + "outputId": "3d52d6c8-8e67-4326-e7f9-078175609120" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Gensim version: 4.3.2\n", + "Scipy version: 1.10.1\n", + "Forensic toolkit ready, Agent Riley!\n" + ] + } + ], + "source": [ + "# Install compatible versions: gensim 4.3.2 and scipy 1.10.1\n", + "!pip install gensim==4.3.2 scipy==1.10.1 numpy matplotlib -q\n", + "\n", + "# Import libraries for embedding analysis and visualization\n", + "import gensim.downloader as api # Access pre-trained embeddings\n", + "import numpy as np # Handle vector math\n", + "import matplotlib.pyplot as plt # Visualize patterns\n", + "\n", + "# Verify installations\n", + "import gensim\n", + "import scipy\n", + "print(f\"Gensim version: {gensim.__version__}\")\n", + "print(f\"Scipy version: {scipy.__version__}\")\n", + "print(\"Forensic toolkit ready, Agent Riley!\")" + ] + }, + { + "cell_type": "code", + "source": [ + "# Load pre-trained Word2Vec model (Google News, 300D vectors)\n", + "model = api.load(\"word2vec-google-news-300\")\n", + "\n", + "# Test with a key term from Post 1: \"credit\"\n", + "word = \"credit\"\n", + "vector = model[word]\n", + "print(f\"Vector for '{word}' (first 5 dimensions): {vector[:5]}\")\n", + "print(f\"Vector length: {len(vector)}\")\n", + "\n", + "# Context: \"credit\" from \"credit card dumps\" is our first clue to The Ghost’s true alias" + ], + "metadata": { + "colab": { + 
"base_uri": "https://localhost:8080/" + }, + "id": "uihL0xZElPit", + "outputId": "69a26003-2cfc-426a-ec12-542ff2bfa76d" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[==================================================] 100.0% 1662.8/1662.8MB downloaded\n", + "Vector for 'credit' (first 5 dimensions): [-0.01501465 0.19335938 0.01483154 0.07373047 0.24902344]\n", + "Vector length: 300\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Find words similar to \"credit\" to explore The Ghost’s focus\n", + "similar_words = model.most_similar(\"credit\", topn=5)\n", + "print(\"\\nWords similar to 'credit':\")\n", + "for word, similarity in similar_words:\n", + " print(f\"{word}: {similarity:.3f}\")\n", + "\n", + "# Context: Do these terms (e.g., \"login\", \"database\") align with one alias over the others?" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FokP8-8Im8KX", + "outputId": "9db79c8b-a6c9-4645-ec8d-d51aae73d94c" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Words similar to 'credit':\n", + "Credit: 0.689\n", + "loan: 0.556\n", + "loans: 0.549\n", + "lending: 0.524\n", + "mortgage: 0.507\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Classic Word Embedding Example\n", + "# Demonstrate how embeddings capture relationships with \"king - man + woman ≈ queen\"\n", + "classic_result = model.most_similar(positive=[\"king\", \"woman\"], negative=[\"man\"], topn=1)\n", + "print(\"\\nClassic Example: king - man + woman = ?\")\n", + "print(f\"Result: {classic_result[0][0]} (similarity: {classic_result[0][1]:.3f})\")\n", + "print(\"This shows embeddings understand gender relationships—let’s visualize the relationships!\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NqwiuQVfnIiT", + "outputId": 
"a6768621-c3d8-4aa3-f027-40db985a8890" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Classic Example: king - man + woman = ?\n", + "Result: queen (similarity: 0.712)\n", + "This shows embeddings understand gender relationships—let’s visualize the relationships!\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Visualize the classic Word Embedding Example in a figure\n", + "# Demonstrate how embeddings capture relationships with \"king - man + woman ≈ queen\"\n", + "classic_words = [\"king\", \"man\", \"woman\", \"queen\"]\n", + "classic_vectors = [model[word] for word in classic_words]\n", + "\n", + "# Compute the analogy vector explicitly: king - man + woman\n", + "analogy_vector = model[\"king\"] - model[\"man\"] + model[\"woman\"]\n", + "\n", + "# Perform the analogy to get the closest word (should be \"queen\")\n", + "classic_result = model.most_similar(positive=[\"king\", \"woman\"], negative=[\"man\"], topn=1)\n", + "print(\"\\nClassic Example: king - man + woman = ?\")\n", + "print(f\"Result: {classic_result[0][0]} (similarity: {classic_result[0][1]:.3f})\")\n", + "\n", + "# Reduce to 2D using PCA for visualization (include analogy vector)\n", + "from sklearn.decomposition import PCA\n", + "pca = PCA(n_components=2)\n", + "all_vectors = classic_vectors + [analogy_vector] # Add analogy vector\n", + "reduced_vectors = pca.fit_transform(all_vectors)\n", + "\n", + "# Plot the classic embeddings with analogy highlighted\n", + "plt.figure(figsize=(10, 8))\n", + "# Plot the four words in blue\n", + "for i, word in enumerate(classic_words):\n", + " plt.scatter(reduced_vectors[i, 0], reduced_vectors[i, 1], c=\"blue\", label=\"Words\" if i == 0 else \"\")\n", + " plt.annotate(word, (reduced_vectors[i, 0], reduced_vectors[i, 1]), xytext=(5, 5), textcoords=\"offset points\")\n", + "# Plot the analogy vector in red\n", + "plt.scatter(reduced_vectors[4, 0], reduced_vectors[4, 1], c=\"red\", 
label=\"Analogy (king - man + woman)\")\n", + "plt.annotate(\"analogy\", (reduced_vectors[4, 0], reduced_vectors[4, 1]), xytext=(5, 5), textcoords=\"offset points\", c=\"red\")\n", + "# Draw an arrow from \"king\" to the analogy point\n", + "plt.arrow(reduced_vectors[0, 0], reduced_vectors[0, 1],\n", + " reduced_vectors[4, 0] - reduced_vectors[0, 0],\n", + " reduced_vectors[4, 1] - reduced_vectors[0, 1],\n", + " color=\"red\", alpha=0.5, head_width=0.1, label=\"Shift\")\n", + "# Draw a dashed line from analogy to \"queen\" to show the intended result\n", + "queen_idx = classic_words.index(\"queen\")\n", + "plt.plot([reduced_vectors[4, 0], reduced_vectors[queen_idx, 0]],\n", + " [reduced_vectors[4, 1], reduced_vectors[queen_idx, 1]],\n", + " \"g--\", label=\"To Queen\", alpha=0.7)\n", + "plt.title(\"Classic Embedding Example: King - Man + Woman ≈ Queen\")\n", + "plt.xlabel(\"PCA Component 1\")\n", + "plt.ylabel(\"PCA Component 2\")\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()\n", + "print(\"The red analogy point shifts from 'king' and should be near 'queen'—the green dashed line shows this connection!\")\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 787 + }, + "id": "s44gIukwtHT-", + "outputId": "259f2024-31bf-4c28-f9e6-d8f91d07310a" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Classic Example: king - man + woman = ?\n", + "Result: queen (similarity: 0.712)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "The red analogy point shifts from 'king' and should be near 'queen'—the green dashed line shows this connection!\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Select key terms from the posts and alias-related words\n", + "words = [\"credit\", \"login\", \"database\", \"secrets\", \"payment\", \"sale\"]\n", + "vectors = [model[word] for word in words]\n", + "\n", + "# Reduce to 2D using PCA\n", + "from sklearn.decomposition import PCA\n", + "pca = PCA(n_components=2)\n", + "reduced_vectors = pca.fit_transform(vectors)\n", + "\n", + "# Plot the terms\n", + "plt.figure(figsize=(8, 6))\n", + "for i, word in enumerate(words):\n", + " plt.scatter(reduced_vectors[i, 0], reduced_vectors[i, 1])\n", + " plt.annotate(word, (reduced_vectors[i, 0], reduced_vectors[i, 1]))\n", + "plt.title(\"The Ghost’s Terms in 2D Embedding Space\")\n", + "plt.xlabel(\"PCA Component 1\")\n", + "plt.ylabel(\"PCA Component 2\")\n", + "plt.grid(True)\n", + "plt.show()\n", + "\n", + "# Context: Clusters might hint at The Ghost’s alias—e.g., \"sale\" near \"credit\" for \"ShadowTrader\"" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 564 + }, + "id": "wN4_C1PTp4Fj", + "outputId": "4872597b-2523-4899-cb3b-97ee93571f8c" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Compare key post terms to alias-related terms\n", + "post_terms = [\"credit\", \"login\", \"database\"] # From Posts 1, 2, 3\n", + "aliases = [\"trader\", \"reaper\", \"vendor\"] # Simplified from ShadowTrader, DataReaper, CrypticVendor\n", + "\n", + "# Calculate average similarity between post terms and each alias term\n", + "for alias in aliases:\n", + " similarities = [model.similarity(post_term, alias) for post_term in post_terms]\n", + " avg_similarity = np.mean(similarities)\n", + " print(f\"Average similarity of post terms to '{alias}': {avg_similarity:.3f}\")\n", + "\n", + "# Context: Higher similarity to \"trader,\" \"reaper,\" or \"vendor\" suggests The Ghost’s alias" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RU1WcTTM52Bs", + "outputId": "f2a9d8cb-678a-4ee5-a560-b79a20eac15e" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Average similarity of post terms to 'trader': 0.049\n", + "Average similarity of post terms to 'reaper': 0.032\n", + "Average similarity of post terms to 'vendor': 0.130\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Conclusion\n", + "\n", + "* CrypticVendor is the hacker's alias: of the three alias terms, \"vendor\" has the highest average similarity to the post terms (0.130, versus 0.049 for \"trader\" and 0.032 for \"reaper\")." + ], + "metadata": { + "id": "EKammd8SO6Vd" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "s0zUJibb57bW" + }, + "execution_count": 7, + "outputs": [] + } + ] +} \ No newline at end of file