"""RQ3: Table 9 - Application-level source coverage by PII type.

Provenance: reconstructed from the notebook ``RQs/RQ3/RQ3_t9.ipynb`` added by
commit 86d6a6a518c4f3d6b43ae22993ebbd934bb9cadf ("add RQ3 table 9",
Frank Xu, Sun, 8 Feb 2026 16:51:11 -0500). The notebook metadata declares
nbformat 4.4 and a ``python3`` kernel (Python 3.9.12).

For every application and PII type, the table reports ``covered/total``:

* ``total``   -- number of databases in which the ground truth contains at
  least one value of that PII type;
* ``covered`` -- how many of those databases the system output (GPT-4o) also
  reports at least one value of that PII type for.

A cell is ``'-'`` when the ground truth has no database with that PII type.
"""

import glob
import json
import os
import re
from collections import defaultdict
from typing import Optional

import pandas as pd

GPT4O_RESULTS_DIR = '../normalized_PII_results/gpt4o/db_level/'
GROUND_TRUTH_DIR = '../normalized_PII_results/ground_truth/db_level/'

PII_TYPES = ['EMAIL', 'PHONE', 'USERNAME', 'PERSON_NAME', 'POSTAL_ADDRESS']

# Application ID -> display name used in the "Application" column.
APP_MAPPING = {
    'A1': 'WhatsApp',
    'A2': 'Snapchat',
    'A3': 'Telegram',
    'A4': 'Google Maps',
    'A5': 'Samsung Internet',
    'I1': 'WhatsApp (iOS)',
    'I2': 'Contacts',
    'I3': 'Apple Messages',
    'I4': 'Safari',
    'I5': 'Calendar',
}

# PII type key (as found in the result files) -> table column header.
COLUMN_MAPPING = {
    'EMAIL': 'Email',
    'PHONE': 'Phone',
    'USERNAME': 'User Name',
    'PERSON_NAME': 'Person Name',
    'POSTAL_ADDRESS': 'Postal Address',
}

# Format: PII_{APP_ID}_{DB_NAME}_{TIMESTAMP}.jsonl, timestamp = YYYYMMDDTHHMMSSZ.
# DB_NAME is matched greedily, so it may itself contain underscores.
_FILENAME_RE = re.compile(r'PII_([A-Z0-9]+)_(.*)_\d{8}T\d{6}Z\.jsonl')


def parse_filename(filepath: str) -> "tuple[Optional[str], Optional[str]]":
    """Extract the app ID and database name from a result file path.

    Args:
        filepath: Path (or bare name) of a result file.

    Returns:
        ``(app_id, db_name)`` when the base name matches
        ``PII_{APP_ID}_{DB_NAME}_{TIMESTAMP}.jsonl`` exactly, otherwise
        ``(None, None)``.
    """
    # fullmatch (not match): a name with trailing junk such as ".jsonl.bak"
    # must not be accepted as a valid result file.
    match = _FILENAME_RE.fullmatch(os.path.basename(filepath))
    if match is None:
        return None, None
    return match.group(1), match.group(2)


def load_data(path: str) -> defaultdict:
    """Load PII presence flags from a directory of ``*.jsonl`` result files.

    Each non-blank line must be a JSON object with the keys ``'PII_type'`` and
    ``'PII'``; a PII type is flagged for a database as soon as one record of
    that type has a non-empty ``'PII'`` value. Files whose name cannot be
    parsed by :func:`parse_filename` are ignored. Several files that map to
    the same app/database (e.g. different timestamps) are OR-ed together.

    Args:
        path: Directory to scan (non-recursively).

    Returns:
        Nested mapping ``{app_id: {db_name: {pii_type: True}}}``. It is built
        from ``defaultdict``s, so read it with ``.get`` to avoid creating
        empty entries as a side effect of a lookup.

    Raises:
        json.JSONDecodeError: A non-blank line is not valid JSON.
        KeyError: A record lacks ``'PII_type'`` or ``'PII'``.
    """
    data = defaultdict(lambda: defaultdict(lambda: defaultdict(bool)))
    # sorted() only makes the processing order deterministic; the result is
    # order-independent because flags are only ever set to True.
    for f_path in sorted(glob.glob(os.path.join(path, '*.jsonl'))):
        app_id, db_name = parse_filename(f_path)
        if not app_id or not db_name:
            continue
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(f_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # Blank separator/trailing lines would crash json.loads.
                    continue
                record = json.loads(line)
                pii_type = record['PII_type']
                if record['PII']:
                    data[app_id][db_name][pii_type] = True
    return data


def _dbs_with_pii(app_data, db_names, pii_types) -> set:
    """Return the databases in *db_names* flagged with any of *pii_types*."""
    return {
        db for db in db_names
        if any(app_data.get(db, {}).get(pt, False) for pt in pii_types)
    }


def _coverage_cell(gt_dbs: set, system_dbs: set) -> str:
    """Format ``|gt ∩ system|/|gt|``, or ``'-'`` when *gt_dbs* is empty."""
    if not gt_dbs:
        return '-'
    return f"{len(gt_dbs & system_dbs)}/{len(gt_dbs)}"


def build_table_rows(gt_data, system_data) -> list:
    """Compute one Table 9 row per application in ``APP_MAPPING``.

    Only databases present in the ground truth for an application are
    considered; databases that appear solely in the system output are ignored.

    The ``'All PII'`` column counts a ground-truth database as covered when
    the system reports *any* PII type for it -- not necessarily the same
    type(s) the ground truth contains.

    Args:
        gt_data: Ground-truth mapping ``{app_id: {db_name: {pii_type: bool}}}``.
        system_data: System-output mapping of the same shape.

    Returns:
        List of dicts with the keys ``'ID'``, ``'Application'``, one key per
        ``COLUMN_MAPPING`` value, and ``'All PII'``.
    """
    rows = []
    for app_id, app_name in APP_MAPPING.items():
        # .get keeps defaultdict inputs from growing empty entries.
        gt_app = gt_data.get(app_id, {})
        system_app = system_data.get(app_id, {})
        gt_db_names = list(gt_app)

        row = {'ID': app_id, 'Application': app_name}
        for pii_type in PII_TYPES:
            # DG(a,t) and DS(a,t): databases of app a containing PII type t in
            # the ground truth / system output; the cell is |DG ∩ DS| / |DG|.
            row[COLUMN_MAPPING[pii_type]] = _coverage_cell(
                _dbs_with_pii(gt_app, gt_db_names, [pii_type]),
                _dbs_with_pii(system_app, gt_db_names, [pii_type]),
            )
        row['All PII'] = _coverage_cell(
            _dbs_with_pii(gt_app, gt_db_names, PII_TYPES),
            _dbs_with_pii(system_app, gt_db_names, PII_TYPES),
        )
        rows.append(row)
    return rows


def build_table(rows) -> pd.DataFrame:
    """Assemble *rows* into a DataFrame indexed by ``'ID'`` in Table 9 order."""
    final_columns = (
        ['ID', 'Application']
        + [COLUMN_MAPPING[pt] for pt in PII_TYPES]
        + ['All PII']
    )
    return pd.DataFrame(rows, columns=final_columns).set_index('ID')


def latex_column_format(df: pd.DataFrame) -> str:
    """Return a LaTeX column spec with exactly one specifier per table column.

    Index level(s) and the first data column (the application name) are
    left-aligned; every remaining column is centred. The previous hard-coded
    ``'ll' + 'c' * len(df.columns)`` produced one specifier too many because
    it counted the application column twice.
    """
    n_cols = len(df.columns)
    n_text_cols = min(1, n_cols)
    return 'l' * (df.index.nlevels + n_text_cols) + 'c' * (n_cols - n_text_cols)


def table_to_latex(df: pd.DataFrame) -> str:
    """Render *df* as the LaTeX source of Table 9 (index column included)."""
    return df.to_latex(
        index=True,
        caption='Application-level source coverage by PII type.',
        label='tab:app_level_coverage',
        column_format=latex_column_format(df),
    )


if __name__ == "__main__":
    gt_data = load_data(GROUND_TRUTH_DIR)
    system_data = load_data(GPT4O_RESULTS_DIR)

    table_data = build_table_rows(gt_data, system_data)
    df = build_table(table_data)

    # Display the dataframe (a bare `df` expression only renders in a notebook).
    print(df)

    # Optional: Save to LaTeX
    latex_output = table_to_latex(df)
    print(latex_output)