update RQ1

This commit is contained in:
Frank Xu
2026-01-31 20:55:46 -05:00
parent db04699615
commit 6006957a25
3 changed files with 149 additions and 99 deletions

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "e15e3ffe",
"metadata": {},
"outputs": [
@@ -13,43 +13,44 @@
"ID Application Database Email Phone UserName PersonName PostalAddr Total\n",
"-------------------------------------------------------------------------------------------------\n",
"A1 WhatsApp commerce.db 0 0 0 0 0 0\n",
"A1 WhatsApp msgstore.db 0 7 2 22 0 31\n",
"A1 WhatsApp wa.db 0 16 0 10 0 26\n",
"A2 Snapchat core.db 0 1 12 1 0 14\n",
"A1 WhatsApp msgstore.db 0 7 5 15 0 27\n",
"A1 WhatsApp wa.db 0 16 680 10 0 706\n",
"A2 Snapchat core.db 0 1 5 1 0 7\n",
"A2 Snapchat journal.db 0 0 0 0 0 0\n",
"A2 Snapchat main.db 1 13 0 12 0 26\n",
"A2 Snapchat main.db 1 13 0 11 0 25\n",
"A3 Telegram account1cache4.db 0 0 0 0 0 0\n",
"A3 Telegram account2cache4.db 0 0 0 0 0 0\n",
"A3 Telegram account3cache4.db 0 0 0 0 0 0\n",
"A4 Google Maps gmm_myplaces.db 0 0 0 0 0 0\n",
"A4 Google Maps gmm_storage.db 0 0 0 0 0 0\n",
"A4 Google Maps peopleCache_sh....db 2 0 2 0 0 4\n",
"A5 Samsung Internet SBrowser.db 1 0 4 0 0 5\n",
"A5 Samsung Internet SBrowser.db 1 0 5 0 0 6\n",
"A5 Samsung Internet SBrowser2.db 0 0 0 0 0 0\n",
"A5 Samsung Internet searchengine.db 0 0 0 0 0 0\n",
"I1 WhatsApp CallHistory.sqlite 0 0 0 0 0 0\n",
"A5 Samsung Internet searchengine.db 0 0 17 0 0 17\n",
"I1 WhatsApp CallHistory.sqlite 0 0 10 0 0 10\n",
"I1 WhatsApp ChatStorage.sqlite 0 0 0 14 2 16\n",
"I1 WhatsApp ContactsV2.sqlite 0 0 0 0 0 0\n",
"I2 Contacts AddressBook.sqlitedb 6 1012 0 865 0 1883\n",
"I1 WhatsApp ContactsV2.sqlite 0 655 0 0 0 655\n",
"I2 Contacts AddressBook.sqlitedb 6 548 1 748 0 1303\n",
"I2 Contacts AddressB....sqlitedb 0 0 0 0 0 0\n",
"I3 Apple Messages sms.db 1 20 2 10 0 33\n",
"I4 Safari CloudTabs.db 0 0 0 0 0 0\n",
"I4 Safari History.db 0 0 63 0 0 63\n",
"I3 Apple Messages sms.db 1 0 0 10 0 11\n",
"I4 Safari CloudTabs.db 0 0 25 0 0 25\n",
"I4 Safari History.db 0 0 4 0 0 4\n",
"I5 Calendar Calendar.sqlitedb 1 0 0 0 0 1\n",
"I5 Calendar Extras.db 0 0 0 0 0 0\n",
"\n",
"Wrote LaTeX: I:\\project2026\\llmagent\\RQs\\RQ1\\RQ1_t4.tex\n"
"Wrote LaTeX: I:\\project2026\\llmagent\\RQs\\RQ1\\RQ1_t4.tex\n",
"Wrote CSV: I:\\project2026\\llmagent\\RQs\\RQ1\\RQ1_t4_plain.csv\n"
]
}
],
"source": [
"import json\n",
"import csv\n",
"from pathlib import Path\n",
"from collections import defaultdict, OrderedDict\n",
"from typing import Dict, Tuple, List\n",
"\n",
"IN_DIR = Path(r\"..\\batch_results_gpt4o_normalized\")\n",
"OUT_TEX = Path(\"RQ1_t4.tex\")\n",
"\n",
"\n",
"APP_NAME = OrderedDict([\n",
" (\"A1\", \"WhatsApp\"),\n",
@@ -64,7 +65,6 @@
" (\"I5\", \"Calendar\"),\n",
"])\n",
"\n",
"# Plain-text screen display names (no LaTeX newlines)\n",
"APP_NAME_PLAIN = OrderedDict([\n",
" (\"A1\", \"WhatsApp\"),\n",
" (\"A2\", \"Snapchat\"),\n",
@@ -81,9 +81,9 @@
"PII_COLS = OrderedDict([\n",
" (\"EMAIL\", \"Email\"),\n",
" (\"PHONE\", \"Phone\"),\n",
" (\"USERNAME\", \"User Name\"),\n",
" (\"PERSON_NAME\", \"Person Name\"),\n",
" (\"POSTAL_ADDRESS\", \"Postal Address\"),\n",
" (\"USERNAME\", \"UserName\"),\n",
" (\"PERSON_NAME\", \"PersonName\"),\n",
" (\"POSTAL_ADDRESS\", \"PostalAddr\"),\n",
"])\n",
"\n",
"MAX_DB_NAME_LEN = 20\n",
@@ -151,6 +151,67 @@
" return counts\n",
"\n",
"\n",
"def build_rows_for_plain_and_csv(\n",
" counts: Dict[Tuple[str, str], Dict[str, int]],\n",
") -> List[List[str]]:\n",
" by_app: Dict[str, List[Tuple[str, Dict[str, int]]]] = defaultdict(list)\n",
" for (app, db_file), c in counts.items():\n",
" by_app[app].append((db_file, c))\n",
"\n",
" for app in by_app:\n",
" by_app[app].sort(key=lambda x: x[0].lower())\n",
"\n",
" header = [\"ID\", \"Application\", \"Database\"] + list(PII_COLS.values()) + [\"Total\"]\n",
" rows: List[List[str]] = [header]\n",
"\n",
" app_order = list(APP_NAME_PLAIN.keys()) + [a for a in sorted(by_app.keys()) if a not in APP_NAME_PLAIN]\n",
"\n",
" for app in app_order:\n",
" if app not in by_app:\n",
" continue\n",
" app_disp = APP_NAME_PLAIN.get(app, app)\n",
"\n",
" for db_file, cdict in by_app[app]:\n",
" vals = [int(cdict[k]) for k in PII_COLS.keys()]\n",
" total = sum(vals)\n",
" db_show = shorten_db_name(db_file, MAX_DB_NAME_LEN)\n",
"\n",
" rows.append(\n",
" [app, app_disp, db_show] + [str(v) for v in vals] + [str(total)]\n",
" )\n",
"\n",
" return rows\n",
"\n",
"\n",
"def build_plain_text_table_from_rows(rows: List[List[str]]) -> str:\n",
" widths = [0] * len(rows[0])\n",
" for r in rows:\n",
" for i, cell in enumerate(r):\n",
" widths[i] = max(widths[i], len(cell))\n",
"\n",
" def fmt_row(r: List[str]) -> str:\n",
" parts = []\n",
" for i, cell in enumerate(r):\n",
" # text columns\n",
" if i <= 2:\n",
" parts.append(cell.ljust(widths[i]))\n",
" else:\n",
" parts.append(cell.rjust(widths[i]))\n",
" return \" \".join(parts)\n",
"\n",
" lines = [fmt_row(rows[0])]\n",
" lines.append(\"-\" * len(lines[0]))\n",
" for r in rows[1:]:\n",
" lines.append(fmt_row(r))\n",
" return \"\\n\".join(lines)\n",
"\n",
"\n",
"def write_csv(rows: List[List[str]], out_csv: Path) -> None:\n",
" out_csv.parent.mkdir(parents=True, exist_ok=True)\n",
" with out_csv.open(\"w\", encoding=\"utf-8\", newline=\"\") as f:\n",
" csv.writer(f).writerows(rows)\n",
"\n",
"\n",
"def format_row_tex(\n",
" app_code: str,\n",
" app_display: str,\n",
@@ -159,12 +220,8 @@
" is_first_row_for_app: bool,\n",
" nrows_for_app: int,\n",
") -> str:\n",
" email = counts_for_db[\"EMAIL\"]\n",
" phone = counts_for_db[\"PHONE\"]\n",
" uname = counts_for_db[\"USERNAME\"]\n",
" pname = counts_for_db[\"PERSON_NAME\"]\n",
" addr = counts_for_db[\"POSTAL_ADDRESS\"]\n",
" total = email + phone + uname + pname + addr\n",
" vals = {k: int(counts_for_db[k]) for k in PII_COLS.keys()}\n",
" total = sum(vals.values())\n",
"\n",
" db_show = shorten_db_name(db_file, MAX_DB_NAME_LEN)\n",
" db_tex = latex_escape(db_show)\n",
@@ -173,10 +230,16 @@
" return (\n",
" rf\"\\multirow{{{nrows_for_app}}}{{*}}{{{app_code}}} & \"\n",
" rf\"\\multirow{{{nrows_for_app}}}{{*}}{{{app_display}}} & \"\n",
" rf\"{db_tex} & {email} & {phone} & {uname} & {pname} & {addr} & \\textbf{{{total}}} \\\\\"\n",
" rf\"{db_tex} & \"\n",
" + \" & \".join(str(vals[k]) for k in PII_COLS.keys())\n",
" + rf\" & \\textbf{{{total}}} \\\\\"\n",
" )\n",
"\n",
" return rf\" & & {db_tex} & {email} & {phone} & {uname} & {pname} & {addr} & \\textbf{{{total}}} \\\\\"\n",
" return (\n",
" rf\" & & {db_tex} & \"\n",
" + \" & \".join(str(vals[k]) for k in PII_COLS.keys())\n",
" + rf\" & \\textbf{{{total}}} \\\\\"\n",
" )\n",
"\n",
"\n",
"def build_table_tex(counts: Dict[Tuple[str, str], Dict[str, int]]) -> str:\n",
@@ -223,72 +286,33 @@
" return \"\\n\".join(lines)\n",
"\n",
"\n",
"def build_plain_text_table(counts: Dict[Tuple[str, str], Dict[str, int]]) -> str:\n",
" \"\"\"\n",
" Prints a simple readable table to the console (no LaTeX, no multirow).\n",
" One row per (app, db).\n",
" \"\"\"\n",
" by_app: Dict[str, List[Tuple[str, Dict[str, int]]]] = defaultdict(list)\n",
" for (app, db_file), c in counts.items():\n",
" by_app[app].append((db_file, c))\n",
"def generate_db_level_pii_tables(\n",
" in_dir: Path,\n",
" out_tex: Path,\n",
" out_csv: Path,\n",
") -> None:\n",
" counts = load_db_level_counts(in_dir)\n",
"\n",
" for app in by_app:\n",
" by_app[app].sort(key=lambda x: x[0].lower())\n",
" tex = build_table_tex(counts)\n",
" out_tex.parent.mkdir(parents=True, exist_ok=True)\n",
" out_tex.write_text(tex, encoding=\"utf-8\")\n",
"\n",
" rows: List[List[str]] = []\n",
" header = [\"ID\", \"Application\", \"Database\", \"Email\", \"Phone\", \"UserName\", \"PersonName\", \"PostalAddr\", \"Total\"]\n",
" rows.append(header)\n",
" rows = build_rows_for_plain_and_csv(counts)\n",
" write_csv(rows, out_csv)\n",
"\n",
" app_order = list(APP_NAME_PLAIN.keys()) + [a for a in sorted(by_app.keys()) if a not in APP_NAME_PLAIN]\n",
" print(build_plain_text_table_from_rows(rows))\n",
" print(f\"\\nWrote LaTeX: {out_tex.resolve()}\")\n",
" print(f\"Wrote CSV: {out_csv.resolve()}\")\n",
"\n",
" for app in app_order:\n",
" if app not in by_app:\n",
" continue\n",
" app_disp = APP_NAME_PLAIN.get(app, app)\n",
"\n",
" for db_file, cdict in by_app[app]:\n",
" email = cdict[\"EMAIL\"]\n",
" phone = cdict[\"PHONE\"]\n",
" uname = cdict[\"USERNAME\"]\n",
" pname = cdict[\"PERSON_NAME\"]\n",
" addr = cdict[\"POSTAL_ADDRESS\"]\n",
" total = email + phone + uname + pname + addr\n",
"\n",
" db_show = shorten_db_name(db_file, MAX_DB_NAME_LEN)\n",
" rows.append([app, app_disp, db_show, str(email), str(phone), str(uname), str(pname), str(addr), str(total)])\n",
"\n",
" # compute column widths\n",
" widths = [0] * len(rows[0])\n",
" for r in rows:\n",
" for i, cell in enumerate(r):\n",
" widths[i] = max(widths[i], len(cell))\n",
"\n",
" def fmt_row(r: List[str]) -> str:\n",
" parts = []\n",
" for i, cell in enumerate(r):\n",
" if i <= 2:\n",
" parts.append(cell.ljust(widths[i]))\n",
" else:\n",
" parts.append(cell.rjust(widths[i]))\n",
" return \" \".join(parts)\n",
"\n",
" lines = [fmt_row(rows[0])]\n",
" lines.append(\"-\" * len(lines[0]))\n",
" for r in rows[1:]:\n",
" lines.append(fmt_row(r))\n",
" return \"\\n\".join(lines)\n",
"\n",
"def main() -> None:\n",
" in_dir = Path(r\"..\\batch_results_gpt4o_normalized\")\n",
" out_tex = Path(\"RQ1_t4.tex\")\n",
" out_csv = Path(\"RQ1_t4_plain.csv\")\n",
" generate_db_level_pii_tables(in_dir, out_tex, out_csv)\n",
"\n",
"if __name__ == \"__main__\":\n",
" counts = load_db_level_counts(IN_DIR)\n",
"\n",
" # Write LaTeX\n",
" tex = build_table_tex(counts)\n",
" OUT_TEX.write_text(tex, encoding=\"utf-8\")\n",
"\n",
" # Print a plain-text preview to screen\n",
" print(build_plain_text_table(counts))\n",
" print(f\"\\nWrote LaTeX: {OUT_TEX.resolve()}\")\n"
" main()\n"
]
}
],

View File

@@ -9,15 +9,15 @@
\hline
\multirow{3}{*}{A1} & \multirow{3}{*}{WhatsApp} & commerce.db & 0 & 0 & 0 & 0 & 0 & \textbf{0} \\
\cline{3-9}
& & msgstore.db & 0 & 7 & 2 & 22 & 0 & \textbf{31} \\
& & msgstore.db & 0 & 7 & 5 & 15 & 0 & \textbf{27} \\
\cline{3-9}
& & wa.db & 0 & 16 & 0 & 10 & 0 & \textbf{26} \\
& & wa.db & 0 & 16 & 680 & 10 & 0 & \textbf{706} \\
\hline
\multirow{3}{*}{A2} & \multirow{3}{*}{Snapchat} & core.db & 0 & 1 & 12 & 1 & 0 & \textbf{14} \\
\multirow{3}{*}{A2} & \multirow{3}{*}{Snapchat} & core.db & 0 & 1 & 5 & 1 & 0 & \textbf{7} \\
\cline{3-9}
& & journal.db & 0 & 0 & 0 & 0 & 0 & \textbf{0} \\
\cline{3-9}
& & main.db & 1 & 13 & 0 & 12 & 0 & \textbf{26} \\
& & main.db & 1 & 13 & 0 & 11 & 0 & \textbf{25} \\
\hline
\multirow{3}{*}{A3} & \multirow{3}{*}{Telegram} & account1cache4.db & 0 & 0 & 0 & 0 & 0 & \textbf{0} \\
\cline{3-9}
@@ -31,27 +31,27 @@
\cline{3-9}
& & peopleCache\_sh....db & 2 & 0 & 2 & 0 & 0 & \textbf{4} \\
\hline
\multirow{3}{*}{A5} & \multirow{3}{*}{\begin{tabular}[c]{@{}l@{}}Samsung \\Internet\end{tabular}} & SBrowser.db & 1 & 0 & 4 & 0 & 0 & \textbf{5} \\
\multirow{3}{*}{A5} & \multirow{3}{*}{\begin{tabular}[c]{@{}l@{}}Samsung \\Internet\end{tabular}} & SBrowser.db & 1 & 0 & 5 & 0 & 0 & \textbf{6} \\
\cline{3-9}
& & SBrowser2.db & 0 & 0 & 0 & 0 & 0 & \textbf{0} \\
\cline{3-9}
& & searchengine.db & 0 & 0 & 0 & 0 & 0 & \textbf{0} \\
& & searchengine.db & 0 & 0 & 17 & 0 & 0 & \textbf{17} \\
\hline
\multirow{3}{*}{I1} & \multirow{3}{*}{WhatsApp} & CallHistory.sqlite & 0 & 0 & 0 & 0 & 0 & \textbf{0} \\
\multirow{3}{*}{I1} & \multirow{3}{*}{WhatsApp} & CallHistory.sqlite & 0 & 0 & 10 & 0 & 0 & \textbf{10} \\
\cline{3-9}
& & ChatStorage.sqlite & 0 & 0 & 0 & 14 & 2 & \textbf{16} \\
\cline{3-9}
& & ContactsV2.sqlite & 0 & 0 & 0 & 0 & 0 & \textbf{0} \\
& & ContactsV2.sqlite & 0 & 655 & 0 & 0 & 0 & \textbf{655} \\
\hline
\multirow{2}{*}{I2} & \multirow{2}{*}{Contacts} & AddressBook.sqlitedb & 6 & 1012 & 0 & 865 & 0 & \textbf{1883} \\
\multirow{2}{*}{I2} & \multirow{2}{*}{Contacts} & AddressBook.sqlitedb & 6 & 548 & 1 & 748 & 0 & \textbf{1303} \\
\cline{3-9}
& & AddressB....sqlitedb & 0 & 0 & 0 & 0 & 0 & \textbf{0} \\
\hline
\multirow{1}{*}{I3} & \multirow{1}{*}{\begin{tabular}[c]{@{}l@{}}Apple \\Messages\end{tabular}} & sms.db & 1 & 20 & 2 & 10 & 0 & \textbf{33} \\
\multirow{1}{*}{I3} & \multirow{1}{*}{\begin{tabular}[c]{@{}l@{}}Apple \\Messages\end{tabular}} & sms.db & 1 & 0 & 0 & 10 & 0 & \textbf{11} \\
\hline
\multirow{2}{*}{I4} & \multirow{2}{*}{Safari} & CloudTabs.db & 0 & 0 & 0 & 0 & 0 & \textbf{0} \\
\multirow{2}{*}{I4} & \multirow{2}{*}{Safari} & CloudTabs.db & 0 & 0 & 25 & 0 & 0 & \textbf{25} \\
\cline{3-9}
& & History.db & 0 & 0 & 63 & 0 & 0 & \textbf{63} \\
& & History.db & 0 & 0 & 4 & 0 & 0 & \textbf{4} \\
\hline
\multirow{2}{*}{I5} & \multirow{2}{*}{Calendar} & Calendar.sqlitedb & 1 & 0 & 0 & 0 & 0 & \textbf{1} \\
\cline{3-9}

26
RQs/RQ1/RQ1_t4_plain.csv Normal file
View File

@@ -0,0 +1,26 @@
ID,Application,Database,Email,Phone,UserName,PersonName,PostalAddr,Total
A1,WhatsApp,commerce.db,0,0,0,0,0,0
A1,WhatsApp,msgstore.db,0,7,5,15,0,27
A1,WhatsApp,wa.db,0,16,680,10,0,706
A2,Snapchat,core.db,0,1,5,1,0,7
A2,Snapchat,journal.db,0,0,0,0,0,0
A2,Snapchat,main.db,1,13,0,11,0,25
A3,Telegram,account1cache4.db,0,0,0,0,0,0
A3,Telegram,account2cache4.db,0,0,0,0,0,0
A3,Telegram,account3cache4.db,0,0,0,0,0,0
A4,Google Maps,gmm_myplaces.db,0,0,0,0,0,0
A4,Google Maps,gmm_storage.db,0,0,0,0,0,0
A4,Google Maps,peopleCache_sh....db,2,0,2,0,0,4
A5,Samsung Internet,SBrowser.db,1,0,5,0,0,6
A5,Samsung Internet,SBrowser2.db,0,0,0,0,0,0
A5,Samsung Internet,searchengine.db,0,0,17,0,0,17
I1,WhatsApp,CallHistory.sqlite,0,0,10,0,0,10
I1,WhatsApp,ChatStorage.sqlite,0,0,0,14,2,16
I1,WhatsApp,ContactsV2.sqlite,0,655,0,0,0,655
I2,Contacts,AddressBook.sqlitedb,6,548,1,748,0,1303
I2,Contacts,AddressB....sqlitedb,0,0,0,0,0,0
I3,Apple Messages,sms.db,1,0,0,10,0,11
I4,Safari,CloudTabs.db,0,0,25,0,0,25
I4,Safari,History.db,0,0,4,0,0,4
I5,Calendar,Calendar.sqlitedb,1,0,0,0,0,1
I5,Calendar,Extras.db,0,0,0,0,0,0
1 ID Application Database Email Phone UserName PersonName PostalAddr Total
2 A1 WhatsApp commerce.db 0 0 0 0 0 0
3 A1 WhatsApp msgstore.db 0 7 5 15 0 27
4 A1 WhatsApp wa.db 0 16 680 10 0 706
5 A2 Snapchat core.db 0 1 5 1 0 7
6 A2 Snapchat journal.db 0 0 0 0 0 0
7 A2 Snapchat main.db 1 13 0 11 0 25
8 A3 Telegram account1cache4.db 0 0 0 0 0 0
9 A3 Telegram account2cache4.db 0 0 0 0 0 0
10 A3 Telegram account3cache4.db 0 0 0 0 0 0
11 A4 Google Maps gmm_myplaces.db 0 0 0 0 0 0
12 A4 Google Maps gmm_storage.db 0 0 0 0 0 0
13 A4 Google Maps peopleCache_sh....db 2 0 2 0 0 4
14 A5 Samsung Internet SBrowser.db 1 0 5 0 0 6
15 A5 Samsung Internet SBrowser2.db 0 0 0 0 0 0
16 A5 Samsung Internet searchengine.db 0 0 17 0 0 17
17 I1 WhatsApp CallHistory.sqlite 0 0 10 0 0 10
18 I1 WhatsApp ChatStorage.sqlite 0 0 0 14 2 16
19 I1 WhatsApp ContactsV2.sqlite 0 655 0 0 0 655
20 I2 Contacts AddressBook.sqlitedb 6 548 1 748 0 1303
21 I2 Contacts AddressB....sqlitedb 0 0 0 0 0 0
22 I3 Apple Messages sms.db 1 0 0 10 0 11
23 I4 Safari CloudTabs.db 0 0 25 0 0 25
24 I4 Safari History.db 0 0 4 0 0 4
25 I5 Calendar Calendar.sqlitedb 1 0 0 0 0 1
26 I5 Calendar Extras.db 0 0 0 0 0 0