Use Outlook ids for classifier dedupe precedence

This commit is contained in:
Steve W
2026-04-09 18:26:37 +00:00
parent 1b2c7db924
commit c6ee735949
4 changed files with 127 additions and 79 deletions

View File

@@ -23,7 +23,8 @@ class DedupeStore:
"""
CREATE TABLE IF NOT EXISTS classification_dedupe (
id INTEGER PRIMARY KEY AUTOINCREMENT,
subject_key TEXT NOT NULL,
outlook_id TEXT,
conversation_id TEXT,
fingerprint TEXT NOT NULL,
result_hash TEXT NOT NULL,
request_payload TEXT NOT NULL,
@@ -34,32 +35,46 @@ class DedupeStore:
)
"""
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_subject_key ON classification_dedupe(subject_key)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_outlook_id ON classification_dedupe(outlook_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_conversation_id ON classification_dedupe(conversation_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_fingerprint ON classification_dedupe(fingerprint)")
def find_existing(self, *, subject_key: str, fingerprint: str) -> dict[str, Any] | None:
def find_existing(self, *, outlook_id: str | None, conversation_id: str | None, fingerprint: str) -> tuple[dict[str, Any] | None, str]:
with self._connect() as conn:
if outlook_id:
row = conn.execute(
"SELECT * FROM classification_dedupe WHERE outlook_id = ? ORDER BY id DESC LIMIT 1",
(outlook_id,),
).fetchone()
if row:
return self._decode(row), "id"
if conversation_id:
row = conn.execute(
"SELECT * FROM classification_dedupe WHERE conversation_id = ? ORDER BY id DESC LIMIT 1",
(conversation_id,),
).fetchone()
if row:
return self._decode(row), "conversation"
row = conn.execute(
"SELECT * FROM classification_dedupe WHERE fingerprint = ? ORDER BY id DESC LIMIT 1",
(fingerprint,),
).fetchone()
if row is None:
row = conn.execute(
"SELECT * FROM classification_dedupe WHERE subject_key = ? ORDER BY id DESC LIMIT 1",
(subject_key,),
).fetchone()
if not row:
return None
data = dict(row)
data["request_payload"] = json.loads(data["request_payload"])
data["result_payload"] = json.loads(data["result_payload"])
return data
if row:
return self._decode(row), "fingerprint"
return None, "none"
def _decode(self, row: sqlite3.Row) -> dict[str, Any]:
    """Copy ``row`` into a plain dict and parse its JSON payload columns.

    ``request_payload`` and ``result_payload`` are stored as JSON text; both
    are replaced by their ``json.loads`` result. All other columns are
    copied through unchanged.
    """
    decoded = dict(row)
    for column in ("request_payload", "result_payload"):
        decoded[column] = json.loads(decoded[column])
    return decoded
def insert_or_update(
self,
*,
existing_id: int | None,
subject_key: str,
outlook_id: str | None,
conversation_id: str | None,
fingerprint: str,
result_hash: str,
request_payload: dict[str, Any],
@@ -70,11 +85,12 @@ class DedupeStore:
if existing_id is None:
conn.execute(
"""
INSERT INTO classification_dedupe (subject_key, fingerprint, result_hash, request_payload, result_payload, seen_count)
VALUES (?, ?, ?, ?, ?, ?)
INSERT INTO classification_dedupe (outlook_id, conversation_id, fingerprint, result_hash, request_payload, result_payload, seen_count)
VALUES (?, ?, ?, ?, ?, ?, ?)
""",
(
subject_key,
outlook_id,
conversation_id,
fingerprint,
result_hash,
json.dumps(request_payload, sort_keys=True),
@@ -86,12 +102,13 @@ class DedupeStore:
conn.execute(
"""
UPDATE classification_dedupe
SET subject_key = ?, fingerprint = ?, result_hash = ?, request_payload = ?, result_payload = ?,
SET outlook_id = ?, conversation_id = ?, fingerprint = ?, result_hash = ?, request_payload = ?, result_payload = ?,
seen_count = ?, updated_at = CURRENT_TIMESTAMP
WHERE id = ?
""",
(
subject_key,
outlook_id,
conversation_id,
fingerprint,
result_hash,
json.dumps(request_payload, sort_keys=True),