diff --git a/RQs/RQ3/RQ3_t9.ipynb b/RQs/RQ3/RQ3_t9.ipynb index 4516462..1405386 100644 --- a/RQs/RQ3/RQ3_t9.ipynb +++ b/RQs/RQ3/RQ3_t9.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -23,40 +23,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "GPT4O_RESULTS_DIR = '../normalized_PII_results/gpt4o/db_level/'\n", - "GROUND_TRUTH_DIR = '../normalized_PII_results/ground_truth/db_level/'\n", - "\n", - "PII_TYPES = ['EMAIL', 'PHONE', 'USERNAME', 'PERSON_NAME', 'POSTAL_ADDRESS']\n", - "\n", - "APP_MAPPING = {\n", - " 'A1': 'WhatsApp',\n", - " 'A2': 'Snapchat',\n", - " 'A3': 'Telegram',\n", - " 'A4': 'Google Maps',\n", - " 'A5': 'Samsung Internet',\n", - " 'I1': 'WhatsApp (iOS)',\n", - " 'I2': 'Contacts',\n", - " 'I3': 'Apple Messages',\n", - " 'I4': 'Safari',\n", - " 'I5': 'Calendar'\n", - "}\n", - "\n", - "COLUMN_MAPPING = {\n", - " 'EMAIL': 'Email',\n", - " 'PHONE': 'Phone',\n", - " 'USERNAME': 'User Name',\n", - " 'PERSON_NAME': 'Person Name',\n", - " 'POSTAL_ADDRESS': 'Postal Address'\n", - "}" + "import sys\n", + "import os\n", + "# Add the parent directory (RQs) to the path to find the config file\n", + "if '..' not in sys.path:\n", + " sys.path.insert(1, os.path.abspath('..'))\n", + "import config" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -88,26 +69,26 @@ " data[app_id][db_name][pii_type] = True\n", " return data\n", "\n", - "gt_data = load_data(GROUND_TRUTH_DIR)\n", - "system_data = load_data(GPT4O_RESULTS_DIR)" + "gt_data = load_data(os.path.join('..', config.GROUND_TRUTH_DIR))\n", + "system_data = load_data(os.path.join('..', config.GPT4O_RESULTS_DIR))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "table_data = []\n", "\n", - "for app_id, app_name in APP_MAPPING.items():\n", + "for app_id, app_name in config.APP_MAPPING.items():\n", " row = {'ID': app_id, 'Application': app_name}\n", " \n", " app_dbs_in_gt = gt_data.get(app_id, {}).keys()\n", "\n", " # --- Per-PII Type Calculation ---\n", - " for pii_type in PII_TYPES:\n", - " col_name = COLUMN_MAPPING[pii_type]\n", + " for pii_type in config.PII_TYPES:\n", + " col_name = config.COLUMN_MAPPING[pii_type]\n", " \n", " # DG(a,t): set of databases for app 'a' that contain pii_type 't' in ground truth\n", " gt_dbs_with_pii = {db for db in app_dbs_in_gt if gt_data.get(app_id, {}).get(db, {}).get(pii_type, False)}\n", @@ -120,7 +101,7 @@ " if gt_count == 0:\n", " row[col_name] = '-'\n", " else:\n", - " # covered = |DG(a,t) \u2229 DS(a,t)|\n", + " # covered = |DG(a,t) ∩ DS(a,t)|\n", " covered_count = len(gt_dbs_with_pii.intersection(system_dbs_with_pii))\n", " row[col_name] = f\"{covered_count}/{gt_count}\"\n", "\n", @@ -128,13 +109,13 @@ " # Databases in GT for this app that have *any* PII type\n", " gt_dbs_with_any_pii = {\n", " db for db in app_dbs_in_gt \n", - " if any(gt_data.get(app_id, {}).get(db, {}).get(pt, False) for pt in PII_TYPES)\n", + " if any(gt_data.get(app_id, {}).get(db, {}).get(pt, False) for pt in config.PII_TYPES)\n", " }\n", " \n", " # Databases in system output for this app that have *any* PII type\n", " system_dbs_with_any_pii = {\n", " db for db in app_dbs_in_gt\n", - " if any(system_data.get(app_id, {}).get(db, {}).get(pt, False) for pt in PII_TYPES)\n", + " if any(system_data.get(app_id, {}).get(db, {}).get(pt, False) for pt in config.PII_TYPES)\n", " }\n", "\n", " all_gt_count = len(gt_dbs_with_any_pii)\n", @@ -149,14 +130,179 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | Application | \n", + "Phone | \n", + "User Name | \n", + "Person Name | \n", + "Postal Address | \n", + "All PII | \n", + "|
|---|---|---|---|---|---|---|---|
| ID | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| A1 | \n", + "- | \n", + "2/2 | \n", + "1/1 | \n", + "2/2 | \n", + "- | \n", + "2/2 | \n", + "|
| A2 | \n", + "Snapchat | \n", + "1/1 | \n", + "2/2 | \n", + "1/1 | \n", + "2/2 | \n", + "- | \n", + "2/2 | \n", + "
| A3 | \n", + "Telegram | \n", + "- | \n", + "- | \n", + "- | \n", + "- | \n", + "- | \n", + "- | \n", + "
| A4 | \n", + "Google Maps | \n", + "1/1 | \n", + "- | \n", + "1/1 | \n", + "- | \n", + "- | \n", + "1/1 | \n", + "
| A5 | \n", + "Samsung Internet | \n", + "1/1 | \n", + "- | \n", + "1/1 | \n", + "- | \n", + "- | \n", + "1/1 | \n", + "
| I1 | \n", + "WhatsApp (iOS) | \n", + "- | \n", + "- | \n", + "- | \n", + "1/1 | \n", + "1/1 | \n", + "1/1 | \n", + "
| I2 | \n", + "Contacts | \n", + "1/1 | \n", + "1/1 | \n", + "- | \n", + "1/1 | \n", + "- | \n", + "1/1 | \n", + "
| I3 | \n", + "Apple Messages | \n", + "1/1 | \n", + "0/1 | \n", + "0/1 | \n", + "1/1 | \n", + "- | \n", + "1/1 | \n", + "
| I4 | \n", + "Safari | \n", + "- | \n", + "- | \n", + "1/1 | \n", + "- | \n", + "- | \n", + "1/1 | \n", + "
| I5 | \n", + "Calendar | \n", + "1/1 | \n", + "- | \n", + "- | \n", + "- | \n", + "- | \n", + "1/1 | \n", + "