diff --git a/RQs/RQ3/RQ3_t9.ipynb b/RQs/RQ3/RQ3_t9.ipynb index 4516462..1405386 100644 --- a/RQs/RQ3/RQ3_t9.ipynb +++ b/RQs/RQ3/RQ3_t9.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -23,40 +23,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "GPT4O_RESULTS_DIR = '../normalized_PII_results/gpt4o/db_level/'\n", - "GROUND_TRUTH_DIR = '../normalized_PII_results/ground_truth/db_level/'\n", - "\n", - "PII_TYPES = ['EMAIL', 'PHONE', 'USERNAME', 'PERSON_NAME', 'POSTAL_ADDRESS']\n", - "\n", - "APP_MAPPING = {\n", - " 'A1': 'WhatsApp',\n", - " 'A2': 'Snapchat',\n", - " 'A3': 'Telegram',\n", - " 'A4': 'Google Maps',\n", - " 'A5': 'Samsung Internet',\n", - " 'I1': 'WhatsApp (iOS)',\n", - " 'I2': 'Contacts',\n", - " 'I3': 'Apple Messages',\n", - " 'I4': 'Safari',\n", - " 'I5': 'Calendar'\n", - "}\n", - "\n", - "COLUMN_MAPPING = {\n", - " 'EMAIL': 'Email',\n", - " 'PHONE': 'Phone',\n", - " 'USERNAME': 'User Name',\n", - " 'PERSON_NAME': 'Person Name',\n", - " 'POSTAL_ADDRESS': 'Postal Address'\n", - "}" + "import sys\n", + "import os\n", + "# Add the parent directory (RQs) to the path to find the config file\n", + "if '..' not in sys.path:\n", + " sys.path.insert(1, os.path.abspath('..'))\n", + "import config" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -88,26 +69,26 @@ " data[app_id][db_name][pii_type] = True\n", " return data\n", "\n", - "gt_data = load_data(GROUND_TRUTH_DIR)\n", - "system_data = load_data(GPT4O_RESULTS_DIR)" + "gt_data = load_data(os.path.join('..', config.GROUND_TRUTH_DIR))\n", + "system_data = load_data(os.path.join('..', config.GPT4O_RESULTS_DIR))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "table_data = []\n", "\n", - "for app_id, app_name in APP_MAPPING.items():\n", + "for app_id, app_name in config.APP_MAPPING.items():\n", " row = {'ID': app_id, 'Application': app_name}\n", " \n", " app_dbs_in_gt = gt_data.get(app_id, {}).keys()\n", "\n", " # --- Per-PII Type Calculation ---\n", - " for pii_type in PII_TYPES:\n", - " col_name = COLUMN_MAPPING[pii_type]\n", + " for pii_type in config.PII_TYPES:\n", + " col_name = config.COLUMN_MAPPING[pii_type]\n", " \n", " # DG(a,t): set of databases for app 'a' that contain pii_type 't' in ground truth\n", " gt_dbs_with_pii = {db for db in app_dbs_in_gt if gt_data.get(app_id, {}).get(db, {}).get(pii_type, False)}\n", @@ -120,7 +101,7 @@ " if gt_count == 0:\n", " row[col_name] = '-'\n", " else:\n", - " # covered = |DG(a,t) \u2229 DS(a,t)|\n", + " # covered = |DG(a,t) ∩ DS(a,t)|\n", " covered_count = len(gt_dbs_with_pii.intersection(system_dbs_with_pii))\n", " row[col_name] = f\"{covered_count}/{gt_count}\"\n", "\n", @@ -128,13 +109,13 @@ " # Databases in GT for this app that have *any* PII type\n", " gt_dbs_with_any_pii = {\n", " db for db in app_dbs_in_gt \n", - " if any(gt_data.get(app_id, {}).get(db, {}).get(pt, False) for pt in PII_TYPES)\n", + " if any(gt_data.get(app_id, {}).get(db, {}).get(pt, False) for pt in config.PII_TYPES)\n", " }\n", " \n", " # Databases in system output for this app that have *any* PII type\n", " system_dbs_with_any_pii = {\n", " db for db in app_dbs_in_gt\n", - " if any(system_data.get(app_id, {}).get(db, {}).get(pt, False) for pt in PII_TYPES)\n", + " if any(system_data.get(app_id, {}).get(db, {}).get(pt, False) for pt in config.PII_TYPES)\n", " }\n", "\n", " all_gt_count = len(gt_dbs_with_any_pii)\n", @@ -149,14 +130,179 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ApplicationEmailPhoneUser NamePerson NamePostal AddressAll PII
ID
A1WhatsApp-2/21/12/2-2/2
A2Snapchat1/12/21/12/2-2/2
A3Telegram------
A4Google Maps1/1-1/1--1/1
A5Samsung Internet1/1-1/1--1/1
I1WhatsApp (iOS)---1/11/11/1
I2Contacts1/11/1-1/1-1/1
I3Apple Messages1/10/10/11/1-1/1
I4Safari--1/1--1/1
I5Calendar1/1----1/1
\n", + "
" + ], + "text/plain": [ + " Application Email Phone User Name Person Name Postal Address All PII\n", + "ID \n", + "A1 WhatsApp - 2/2 1/1 2/2 - 2/2\n", + "A2 Snapchat 1/1 2/2 1/1 2/2 - 2/2\n", + "A3 Telegram - - - - - -\n", + "A4 Google Maps 1/1 - 1/1 - - 1/1\n", + "A5 Samsung Internet 1/1 - 1/1 - - 1/1\n", + "I1 WhatsApp (iOS) - - - 1/1 1/1 1/1\n", + "I2 Contacts 1/1 1/1 - 1/1 - 1/1\n", + "I3 Apple Messages 1/1 0/1 0/1 1/1 - 1/1\n", + "I4 Safari - - 1/1 - - 1/1\n", + "I5 Calendar 1/1 - - - - 1/1" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df = pd.DataFrame(table_data)\n", "\n", "# Reorder columns to match Table 9\n", - "final_columns = ['ID', 'Application'] + [COLUMN_MAPPING[pt] for pt in PII_TYPES] + ['All PII']\n", + "final_columns = ['ID', 'Application'] + [config.COLUMN_MAPPING[pt] for pt in config.PII_TYPES] + ['All PII']\n", "df = df[final_columns]\n", "\n", "df = df.set_index('ID')\n", @@ -167,9 +313,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\begin{table}\n", + "\\caption{Application-level source coverage by PII type.}\n", + "\\label{tab:app_level_coverage}\n", + "\\begin{tabular}{llccccccc}\n", + "\\toprule\n", + " & Application & Email & Phone & User Name & Person Name & Postal Address & All PII \\\\\n", + "ID & & & & & & & \\\\\n", + "\\midrule\n", + "A1 & WhatsApp & - & 2/2 & 1/1 & 2/2 & - & 2/2 \\\\\n", + "A2 & Snapchat & 1/1 & 2/2 & 1/1 & 2/2 & - & 2/2 \\\\\n", + "A3 & Telegram & - & - & - & - & - & - \\\\\n", + "A4 & Google Maps & 1/1 & - & 1/1 & - & - & 1/1 \\\\\n", + "A5 & Samsung Internet & 1/1 & - & 1/1 & - & - & 1/1 \\\\\n", + "I1 & WhatsApp (iOS) & - & - & - & 1/1 & 1/1 & 1/1 \\\\\n", + "I2 & Contacts & 1/1 & 1/1 & - & 1/1 & - & 1/1 \\\\\n", + "I3 & Apple Messages & 1/1 & 0/1 & 0/1 & 1/1 & - & 1/1 \\\\\n", + "I4 & Safari & - & - & 1/1 & - & - & 1/1 \\\\\n", + "I5 & Calendar & 1/1 & - & - & - & - & 1/1 \\\\\n", + "\\bottomrule\n", + "\\end{tabular}\n", + "\\end{table}\n", + "\n" + ] + } + ], "source": [ "# Optional: Save to LaTeX\n", "latex_output = df.to_latex(index=True, caption='Application-level source coverage by PII type.', label='tab:app_level_coverage', column_format='ll' + 'c' * (len(df.columns)))\n", @@ -193,9 +368,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.10.18" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +}