diff --git a/agent_evidence_discovery.ipynb b/agent_evidence_discovery.ipynb index 75fad39..a40fd1c 100644 --- a/agent_evidence_discovery.ipynb +++ b/agent_evidence_discovery.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "a10c9a6a", "metadata": {}, "outputs": [ @@ -21,6 +21,7 @@ "from langchain_core.messages import HumanMessage\n", "from sql_utils import *\n", "from datetime import datetime, timezone\n", + "from pathlib import Path\n", "\n", "load_dotenv() # This looks for the .env file and loads it into os.environ\n", "\n", @@ -593,6 +594,7 @@ "name": "stdout", "output_type": "stream", "text": [ + "Will process 1 databases (from db_files list).\n", "\n", "Processing: selectedDBs\\test2.db\n", " Processing: EMAIL\n", @@ -1015,7 +1017,7 @@ "extraction_sql : None\n", "rows_count : 20\n", "rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n", - "classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple usernames that are likely application-specific login usernames created by users for login purposes.'}\n", + "classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple entries that resemble usernames, including both simple usernames and email addresses, which are commonly used for login purposes.'}\n", "evidence_count : 0\n", "evidence_sample : []\n", "source_columns : []\n", @@ -1048,7 +1050,7 @@ "SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n", "rows_count : 20\n", "rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n", - "classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple usernames that are likely application-specific login usernames created by users for login purposes.'}\n", + "classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple entries that resemble usernames, including both simple usernames and email addresses, which are commonly used for login purposes.'}\n", "evidence_count : 0\n", "evidence_sample : []\n", "source_columns : []\n", @@ -1083,39 +1085,6 @@ "SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n", "rows_count : 20\n", "rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n", - "classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple usernames that are likely application-specific login usernames created by users for login purposes.'}\n", - "evidence_count : 0\n", - "evidence_sample : []\n", - "source_columns : ['users.username', 'users.email']\n", - "\n", - "--- END METADATA ---\n", - "\n", - "=== STATE SNAPSHOT ===\n", - "\n", - "--- MESSAGES ---\n", - "0: HUMAN -> Find application-specific login usernames created by users for login purposes in the database\n", - "1: AI -> SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \n", - "UNION ALL \n", - "SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n", - "2: AI -> Retrieved 20 rows\n", - "3: AI -> SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\n", - "UNION ALL\n", - "SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n", - "4: AI -> Retrieved 20 rows\n", - "\n", - "--- BEGIN METADATA ---\n", - "attempt : 2\n", - "max_attempts : 2\n", - "phase : extraction\n", - "PII type : username\n", - "exploration_sql : SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \n", - "UNION ALL \n", - "SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n", - "extraction_sql : SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\n", - "UNION ALL\n", - "SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n", - "rows_count : 20\n", - "rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n", "classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple entries that resemble usernames, including both simple usernames and email addresses, which are commonly used for login purposes.'}\n", "evidence_count : 0\n", "evidence_sample : []\n", @@ -1149,7 +1118,40 @@ "SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n", "rows_count : 20\n", "rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n", - "classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple entries that resemble usernames, including both simple usernames and email addresses, which are commonly used for login purposes.'}\n", + "classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple usernames that appear to be application-specific login usernames created by users for login purposes.'}\n", + "evidence_count : 0\n", + "evidence_sample : []\n", + "source_columns : ['users.username', 'users.email']\n", + "\n", + "--- END METADATA ---\n", + "\n", + "=== STATE SNAPSHOT ===\n", + "\n", + "--- MESSAGES ---\n", + "0: HUMAN -> Find application-specific login usernames created by users for login purposes in the database\n", + "1: AI -> SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \n", + "UNION ALL \n", + "SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n", + "2: AI -> Retrieved 20 rows\n", + "3: AI -> SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\n", + "UNION ALL\n", + "SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n", + "4: AI -> Retrieved 20 rows\n", + "\n", + "--- BEGIN METADATA ---\n", + "attempt : 2\n", + "max_attempts : 2\n", + "phase : extraction\n", + "PII type : username\n", + "exploration_sql : SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b' \n", + "UNION ALL \n", + "SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n", + "extraction_sql : SELECT username FROM users WHERE username REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b'\n", + "UNION ALL\n", + "SELECT email FROM users WHERE email REGEXP '\\b[a-zA-Z][a-zA-Z0-9._-]{2,51}\\b';\n", + "rows_count : 20\n", + "rows_sample : [('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',), ('alice.johnson@example.com',), ('brian.smith@example.com',), ('carol.davis@example.com',), ('david.miller@example.com',), ('emma.wilson@example.com',), ('frank.brown@example.com',), ('grace.taylor@example.com',), ('henry.anderson@example.com',), ('irene.thomas@example.com',), ('jack.moore@example.com',)]\n", + "classification : {'found': True, 'confidence': 95, 'reason': 'The text contains multiple usernames that appear to be application-specific login usernames created by users for login purposes.'}\n", "evidence_count : 10\n", "evidence_sample : ['ajohnson', 'bsmith', 'cdavis', 'dmiller', 'ewilson', 'fbrown', 'gtaylor', 'handerson', 'ithomas', 'jmoore']\n", "source_columns : ['users.username', 'users.email']\n", @@ -1161,22 +1163,18 @@ "\n", "--- MESSAGES ---\n", "0: HUMAN -> Find loosely structured human name-like strings in the database\n", - "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", + "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "\n", "--- BEGIN METADATA ---\n", "attempt : 2\n", "max_attempts : 2\n", "phase : exploration\n", "PII type : person name\n", - "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", + "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "extraction_sql : None\n", "rows_count : 0\n", "rows_sample : []\n", @@ -1192,11 +1190,9 @@ "\n", "--- MESSAGES ---\n", "0: HUMAN -> Find loosely structured human name-like strings in the database\n", - "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", + "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "2: AI -> Retrieved 30 rows\n", "\n", "--- BEGIN METADATA ---\n", @@ -1204,11 +1200,9 @@ "max_attempts : 2\n", "phase : exploration\n", "PII type : person name\n", - "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", + "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "extraction_sql : None\n", "rows_count : 30\n", "rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n", @@ -1223,11 +1217,9 @@ "\n", "--- MESSAGES ---\n", "0: HUMAN -> Find loosely structured human name-like strings in the database\n", - "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", + "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "2: AI -> Retrieved 30 rows\n", "\n", "--- BEGIN METADATA ---\n", @@ -1235,11 +1227,9 @@ "max_attempts : 2\n", "phase : exploration\n", "PII type : person name\n", - "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", + "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "extraction_sql : None\n", "rows_count : 30\n", "rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n", @@ -1255,33 +1245,25 @@ "\n", "--- MESSAGES ---\n", "0: HUMAN -> Find loosely structured human name-like strings in the database\n", - "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", + "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "2: AI -> Retrieved 30 rows\n", - "3: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", + "3: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n", + "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n", + "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "\n", "--- BEGIN METADATA ---\n", "attempt : 2\n", "max_attempts : 2\n", "phase : extraction\n", "PII type : person name\n", - "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", - "extraction_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", + "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", + "extraction_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n", + "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n", + "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "rows_count : 30\n", "rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n", "classification : {'found': True, 'confidence': 1.0, 'reason': 'The text contains multiple strings that are commonly recognized as person names.'}\n", @@ -1297,17 +1279,13 @@ "\n", "--- MESSAGES ---\n", "0: HUMAN -> Find loosely structured human name-like strings in the database\n", - "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", + "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "2: AI -> Retrieved 30 rows\n", - "3: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", + "3: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n", + "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n", + "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "4: AI -> Retrieved 30 rows\n", "\n", "--- BEGIN METADATA ---\n", @@ -1315,16 +1293,12 @@ "max_attempts : 2\n", "phase : extraction\n", "PII type : person name\n", - "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", - "extraction_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", + "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL \n", + "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", + "extraction_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n", + "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' UNION ALL\n", + "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", "rows_count : 30\n", "rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n", "classification : {'found': True, 'confidence': 1.0, 'reason': 'The text contains multiple strings that are commonly recognized as person names.'}\n", @@ -1332,90 +1306,7 @@ "evidence_sample : []\n", "source_columns : ['users.first_name', 'users.last_name', 'users.username']\n", "\n", - "--- END METADATA ---\n", - "\n", - "=== STATE SNAPSHOT ===\n", - "\n", - "--- MESSAGES ---\n", - "0: HUMAN -> Find loosely structured human name-like strings in the database\n", - "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", - "2: AI -> Retrieved 30 rows\n", - "3: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", - "4: AI -> Retrieved 30 rows\n", - "\n", - "--- BEGIN METADATA ---\n", - "attempt : 2\n", - "max_attempts : 2\n", - "phase : extraction\n", - "PII type : person name\n", - "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", - "extraction_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", - "rows_count : 30\n", - "rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n", - "classification : {'found': True, 'confidence': 1.0, 'reason': 'The text contains multiple strings that are commonly recognized as person names.'}\n", - "evidence_count : 0\n", - "evidence_sample : []\n", - "source_columns : ['users.first_name', 'users.last_name', 'users.username']\n", - "\n", - "--- END METADATA ---\n", - "\n", - "=== STATE SNAPSHOT ===\n", - "\n", - "--- MESSAGES ---\n", - "0: HUMAN -> Find loosely structured human name-like strings in the database\n", - "1: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", - "2: AI -> Retrieved 30 rows\n", - "3: AI -> SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", - "4: AI -> Retrieved 30 rows\n", - "\n", - "--- BEGIN METADATA ---\n", - "attempt : 2\n", - "max_attempts : 2\n", - "phase : extraction\n", - "PII type : person name\n", - "exploration_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}' \n", - "UNION ALL \n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", - "extraction_sql : SELECT first_name FROM users WHERE first_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT last_name FROM users WHERE last_name REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}'\n", - "UNION ALL\n", - "SELECT username FROM users WHERE username REGEXP '[A-Za-z][A-Za-z\\s\\.\\-]{1,50}';\n", - "rows_count : 30\n", - "rows_sample : [('Alice',), ('Brian',), ('Carol',), ('David',), ('Emma',), ('Frank',), ('Grace',), ('Henry',), ('Irene',), ('Jack',), ('Johnson',), ('Smith',), ('Davis',), ('Miller',), ('Wilson',), ('Brown',), ('Taylor',), ('Anderson',), ('Thomas',), ('Moore',), ('ajohnson',), ('bsmith',), ('cdavis',), ('dmiller',), ('ewilson',), ('fbrown',), ('gtaylor',), ('handerson',), ('ithomas',), ('jmoore',)]\n", - "classification : {'found': True, 'confidence': 1.0, 'reason': 'The text contains multiple strings that are commonly recognized as person names.'}\n", - "evidence_count : 30\n", - "evidence_sample : ['Alice', 'Brian', 'Carol', 'David', 'Emma', 'Frank', 'Grace', 'Henry', 'Irene', 'Jack']\n", - "source_columns : ['users.first_name', 'users.last_name', 'users.username']\n", - "\n", - "--- END METADATA ---\n", - "Wrote: I:\\project2026\\llmagent\\batch_results\\evidence_20260120T014007Z.jsonl\n" + "--- END METADATA ---\n" ] } ], @@ -1470,7 +1361,21 @@ "\n", " return all_results\n", "\n", - "def main():\n", + "def main(): \n", + " DB_DIR = Path(r\"selectedDBs\") # folder that contains the dbs\n", + " OUT_DIR = Path(\"batch_results\")\n", + " OUT_DIR.mkdir(exist_ok=True)\n", + "\n", + " PII_TARGETS = [\"EMAIL\", \"PHONE\", \"USERNAME\", \"PERSON_NAME\"]\n", + "\n", + " # --- usage ---\n", + " DB_FILES_PY = Path(\"db_files.py\")\n", + " db_files = load_db_files_list(DB_FILES_PY)\n", + "\n", + " db_paths, missing, not_sqlite = build_db_paths(DB_DIR, db_files, is_sqlite_file)\n", + " print_db_path_report(db_paths, missing, not_sqlite)\n", + "\n", + " \n", " all_results = run_batch(db_paths, PII_TARGETS, PII_CONFIG, app)\n", " save_jsonl(all_results, OUT_DIR)\n", "\n",