Use Outlook ids for classifier dedupe precedence

2026-04-09 18:26:37 +00:00
parent 1b2c7db924
commit c6ee735949
4 changed files with 127 additions and 79 deletions
--- a/app/dedupe_store.py
+++ b/app/dedupe_store.py
@@ -23,7 +23,8 @@ class DedupeStore:
                """
                CREATE TABLE IF NOT EXISTS classification_dedupe (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
-                    subject_key TEXT NOT NULL,
+                    outlook_id TEXT,
+                    conversation_id TEXT,
                    fingerprint TEXT NOT NULL,
                    result_hash TEXT NOT NULL,
                    request_payload TEXT NOT NULL,
@@ -34,32 +35,46 @@ class DedupeStore:
                )
                """
            )
-            conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_subject_key ON classification_dedupe(subject_key)")
+            conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_outlook_id ON classification_dedupe(outlook_id)")
+            conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_conversation_id ON classification_dedupe(conversation_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_fingerprint ON classification_dedupe(fingerprint)")

-    def find_existing(self, *, subject_key: str, fingerprint: str) -> dict[str, Any] | None:
+    def find_existing(self, *, outlook_id: str | None, conversation_id: str | None, fingerprint: str) -> tuple[dict[str, Any] | None, str]:
        with self._connect() as conn:
+            if outlook_id:
+                row = conn.execute(
+                    "SELECT * FROM classification_dedupe WHERE outlook_id = ? ORDER BY id DESC LIMIT 1",
+                    (outlook_id,),
+                ).fetchone()
+                if row:
+                    return self._decode(row), "id"
+            if conversation_id:
+                row = conn.execute(
+                    "SELECT * FROM classification_dedupe WHERE conversation_id = ? ORDER BY id DESC LIMIT 1",
+                    (conversation_id,),
+                ).fetchone()
+                if row:
+                    return self._decode(row), "conversation"
            row = conn.execute(
                "SELECT * FROM classification_dedupe WHERE fingerprint = ? ORDER BY id DESC LIMIT 1",
                (fingerprint,),
            ).fetchone()
-            if row is None:
-                row = conn.execute(
-                    "SELECT * FROM classification_dedupe WHERE subject_key = ? ORDER BY id DESC LIMIT 1",
-                    (subject_key,),
-                ).fetchone()
-            if not row:
-                return None
-            data = dict(row)
-            data["request_payload"] = json.loads(data["request_payload"])
-            data["result_payload"] = json.loads(data["result_payload"])
-            return data
+            if row:
+                return self._decode(row), "fingerprint"
+            return None, "none"
+
+    def _decode(self, row: sqlite3.Row) -> dict[str, Any]:
+        data = dict(row)
+        data["request_payload"] = json.loads(data["request_payload"])
+        data["result_payload"] = json.loads(data["result_payload"])
+        return data

    def insert_or_update(
        self,
        *,
        existing_id: int | None,
-        subject_key: str,
+        outlook_id: str | None,
+        conversation_id: str | None,
        fingerprint: str,
        result_hash: str,
        request_payload: dict[str, Any],
@@ -70,11 +85,12 @@ class DedupeStore:
            if existing_id is None:
                conn.execute(
                    """
-                    INSERT INTO classification_dedupe (subject_key, fingerprint, result_hash, request_payload, result_payload, seen_count)
-                    VALUES (?, ?, ?, ?, ?, ?)
+                    INSERT INTO classification_dedupe (outlook_id, conversation_id, fingerprint, result_hash, request_payload, result_payload, seen_count)
+                    VALUES (?, ?, ?, ?, ?, ?, ?)
                    """,
                    (
-                        subject_key,
+                        outlook_id,
+                        conversation_id,
                        fingerprint,
                        result_hash,
                        json.dumps(request_payload, sort_keys=True),
@@ -86,12 +102,13 @@ class DedupeStore:
                conn.execute(
                    """
                    UPDATE classification_dedupe
-                    SET subject_key = ?, fingerprint = ?, result_hash = ?, request_payload = ?, result_payload = ?,
+                    SET outlook_id = ?, conversation_id = ?, fingerprint = ?, result_hash = ?, request_payload = ?, result_payload = ?,
                        seen_count = ?, updated_at = CURRENT_TIMESTAMP
                    WHERE id = ?
                    """,
                    (
-                        subject_key,
+                        outlook_id,
+                        conversation_id,
                        fingerprint,
                        result_hash,
                        json.dumps(request_payload, sort_keys=True),