mirror of
https://github.com/frankwxu/mobile-pii-discovery-agent.git
synced 2026-04-10 12:13:44 +00:00
add RQ3 table 9
This commit is contained in:
201
RQs/RQ3/RQ3_t9.ipynb
Normal file
201
RQs/RQ3/RQ3_t9.ipynb
Normal file
@@ -0,0 +1,201 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## RQ3: Table 9 - Application-level source coverage by PII type"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
"import glob\n",
|
||||
"import pandas as pd\n",
|
||||
"from collections import defaultdict\n",
|
||||
"import re"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# Directories scanned for the '*.jsonl' result files (relative paths).
GROUND_TRUTH_DIR = '../normalized_PII_results/ground_truth/db_level/'
GPT4O_RESULTS_DIR = '../normalized_PII_results/gpt4o/db_level/'

# App ID -> application display name. Insertion order is the row order of the table.
APP_MAPPING = dict([
    ('A1', 'WhatsApp'),
    ('A2', 'Snapchat'),
    ('A3', 'Telegram'),
    ('A4', 'Google Maps'),
    ('A5', 'Samsung Internet'),
    ('I1', 'WhatsApp (iOS)'),
    ('I2', 'Contacts'),
    ('I3', 'Apple Messages'),
    ('I4', 'Safari'),
    ('I5', 'Calendar'),
])

# PII type key (as stored in the result records) -> column header of the table.
COLUMN_MAPPING = dict([
    ('EMAIL', 'Email'),
    ('PHONE', 'Phone'),
    ('USERNAME', 'User Name'),
    ('PERSON_NAME', 'Person Name'),
    ('POSTAL_ADDRESS', 'Postal Address'),
])

# PII types in column order; derived from COLUMN_MAPPING so the two cannot drift apart.
PII_TYPES = list(COLUMN_MAPPING)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
def parse_filename(filepath):
    """Extract the app ID and database name from a PII result file path.

    Only the file name is inspected (the directory part is ignored). It must
    have the form ``PII_{APP_ID}_{DB_NAME}_{TIMESTAMP}.jsonl`` where ``APP_ID``
    consists of upper-case letters and digits and ``TIMESTAMP`` looks like
    ``20240131T235959Z``.

    Args:
        filepath: Path or bare file name of a result file.

    Returns:
        ``(app_id, db_name)`` when the name matches, otherwise ``(None, None)``.
    """
    base_name = os.path.basename(filepath)
    # fullmatch (not match): the whole name must fit the pattern, so names with
    # trailing text after ".jsonl" (e.g. "....jsonl.bak.jsonl") are rejected.
    # DB_NAME is matched greedily and may itself contain underscores or dots.
    match = re.fullmatch(r'PII_([A-Z0-9]+)_(.*)_\d{8}T\d{6}Z\.jsonl', base_name)
    if match is None:
        return None, None
    return match.group(1), match.group(2)
|
||||
"\n",
|
def load_data(path):
    """Load PII presence flags from a directory of ``*.jsonl`` result files.

    Each file name is decoded with ``parse_filename``; files whose name does
    not yield both an app ID and a database name are skipped. Every non-blank
    line is parsed as a JSON object with a ``PII_type`` and a ``PII`` entry. A
    PII type is flagged as present for the file's (app, database) as soon as
    one record of that type has a non-empty ``PII`` value. Files that map to
    the same (app, database) are merged.

    Args:
        path: Directory to scan (non-recursive).

    Returns:
        Nested ``defaultdict`` ``{app_id: {db_name: {pii_type: True}}}``. Only
        present types are stored; read it with ``.get`` so lookups of unknown
        keys do not create empty entries.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``PII_type`` or ``PII``.
    """
    # Structure: {app_id: {db_name: {pii_type: has_pii_bool}}}
    data = defaultdict(lambda: defaultdict(lambda: defaultdict(bool)))
    for f_path in glob.glob(os.path.join(path, '*.jsonl')):
        app_id, db_name = parse_filename(f_path)
        if not app_id or not db_name:
            continue
        # Explicit UTF-8 so non-ASCII PII values decode the same on every
        # platform instead of depending on the locale's default encoding.
        with open(f_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) are not JSON records.
                if not line.strip():
                    continue
                record = json.loads(line)
                pii_type = record['PII_type']
                # Truthiness instead of len(): a null/None PII value counts as
                # "no PII" rather than raising TypeError.
                if record['PII']:
                    data[app_id][db_name][pii_type] = True
    return data
|
||||
"\n",
|
# glob() on a missing directory just returns no files, which would silently
# yield a table of '-' / '0/N' cells. Fail fast instead; the paths are relative,
# so they depend on the working directory the notebook is run from.
for results_dir in (GROUND_TRUTH_DIR, GPT4O_RESULTS_DIR):
    assert os.path.isdir(results_dir), f"Results directory not found: {results_dir!r}"

# Presence maps {app_id: {db_name: {pii_type: True}}} for the ground truth
# and for the GPT-4o system output.
gt_data = load_data(GROUND_TRUTH_DIR)
system_data = load_data(GPT4O_RESULTS_DIR)

# Without any ground-truth entry every coverage cell would be '-'.
assert gt_data, f"No ground-truth PII records loaded from {GROUND_TRUTH_DIR!r}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
def _format_coverage(expected_dbs, found_dbs):
    """Render |expected ∩ found| / |expected| as 'covered/total'; '-' when nothing is expected."""
    if not expected_dbs:
        return '-'
    return f"{len(expected_dbs & found_dbs)}/{len(expected_dbs)}"


table_data = []

for app_id, app_name in APP_MAPPING.items():
    # .get() with a default so that looking up an app or database that is absent
    # does not insert an empty entry into the nested defaultdicts.
    gt_by_db = gt_data.get(app_id, {})
    system_by_db = system_data.get(app_id, {})

    row = {'ID': app_id, 'Application': app_name}

    # --- One column per PII type ---
    for pii_type in PII_TYPES:
        # DG(a,t): ground-truth databases of app a that contain PII type t.
        expected = {
            db for db, present in gt_by_db.items()
            if present.get(pii_type, False)
        }
        # DS(a,t): databases (restricted to those known to the ground truth)
        # for which the system output reports PII type t.
        found = {
            db for db in gt_by_db
            if system_by_db.get(db, {}).get(pii_type, False)
        }
        row[COLUMN_MAPPING[pii_type]] = _format_coverage(expected, found)

    # --- Aggregate column: databases holding *any* of the PII types ---
    expected_any = {
        db for db, present in gt_by_db.items()
        if any(present.get(pt, False) for pt in PII_TYPES)
    }
    found_any = {
        db for db in gt_by_db
        if any(system_by_db.get(db, {}).get(pt, False) for pt in PII_TYPES)
    }
    row['All PII'] = _format_coverage(expected_any, found_any)

    table_data.append(row)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# Column order of Table 9: ID, application name, one column per PII type, then the aggregate.
final_columns = ['ID', 'Application', *(COLUMN_MAPPING[pt] for pt in PII_TYPES), 'All PII']

# Build the frame from the row dicts, put the columns in table order and
# index the rows by app ID, all in one chain.
df = (
    pd.DataFrame(table_data)
    .loc[:, final_columns]
    .set_index('ID')
)

# Last expression of the cell: rendered as the cell output.
df
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# Optional: render the table as LaTeX. The result is only printed for
# copy/paste; nothing is written to disk.
# Column spec: 'l' for the ID index, 'l' for 'Application', then one 'c' per
# coverage column. 'Application' is itself one of df.columns, hence the -1;
# without it the spec would declare one column more than the table has.
latex_output = df.to_latex(
    index=True,
    caption='Application-level source coverage by PII type.',
    label='tab:app_level_coverage',
    column_format='ll' + 'c' * (len(df.columns) - 1),
)
print(latex_output)
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
Reference in New Issue
Block a user