Refocus classifier on rich extraction and local dedupe only

This commit is contained in:
Steve W
2026-04-09 18:18:35 +00:00
parent a1dcaf9a74
commit 1b2c7db924
7 changed files with 130 additions and 267 deletions

View File

@@ -21,86 +21,82 @@ class DedupeStore:
with self._connect() as conn:
conn.execute(
"""
CREATE TABLE IF NOT EXISTS todoist_sync (
CREATE TABLE IF NOT EXISTS classification_dedupe (
id INTEGER PRIMARY KEY AUTOINCREMENT,
message_id TEXT,
thread_id TEXT,
subject_key TEXT NOT NULL,
fingerprint TEXT NOT NULL,
todoist_task_id TEXT NOT NULL,
classification_hash TEXT NOT NULL,
source_payload TEXT NOT NULL,
last_result TEXT NOT NULL,
result_hash TEXT NOT NULL,
request_payload TEXT NOT NULL,
result_payload TEXT NOT NULL,
seen_count INTEGER NOT NULL DEFAULT 1,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
)
"""
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_sync_message_id ON todoist_sync(message_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_sync_thread_id ON todoist_sync(thread_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_sync_fingerprint ON todoist_sync(fingerprint)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_subject_key ON classification_dedupe(subject_key)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_fingerprint ON classification_dedupe(fingerprint)")
def find_existing(self, *, message_id: str | None, thread_id: str | None, fingerprint: str) -> dict[str, Any] | None:
queries = []
if message_id:
queries.append(("SELECT * FROM todoist_sync WHERE message_id = ? ORDER BY id DESC LIMIT 1", (message_id,)))
if thread_id:
queries.append(("SELECT * FROM todoist_sync WHERE thread_id = ? ORDER BY id DESC LIMIT 1", (thread_id,)))
queries.append(("SELECT * FROM todoist_sync WHERE fingerprint = ? ORDER BY id DESC LIMIT 1", (fingerprint,)))
def find_existing(self, *, subject_key: str, fingerprint: str) -> dict[str, Any] | None:
with self._connect() as conn:
for sql, params in queries:
row = conn.execute(sql, params).fetchone()
if row:
data = dict(row)
data["source_payload"] = json.loads(data["source_payload"])
data["last_result"] = json.loads(data["last_result"])
return data
return None
row = conn.execute(
"SELECT * FROM classification_dedupe WHERE fingerprint = ? ORDER BY id DESC LIMIT 1",
(fingerprint,),
).fetchone()
if row is None:
row = conn.execute(
"SELECT * FROM classification_dedupe WHERE subject_key = ? ORDER BY id DESC LIMIT 1",
(subject_key,),
).fetchone()
if not row:
return None
data = dict(row)
data["request_payload"] = json.loads(data["request_payload"])
data["result_payload"] = json.loads(data["result_payload"])
return data
def upsert(
def insert_or_update(
self,
*,
existing_id: int | None,
message_id: str | None,
thread_id: str | None,
subject_key: str,
fingerprint: str,
todoist_task_id: str,
classification_hash: str,
source_payload: dict[str, Any],
last_result: dict[str, Any],
result_hash: str,
request_payload: dict[str, Any],
result_payload: dict[str, Any],
seen_count: int,
) -> None:
with self._connect() as conn:
if existing_id is None:
conn.execute(
"""
INSERT INTO todoist_sync (message_id, thread_id, fingerprint, todoist_task_id, classification_hash, source_payload, last_result)
VALUES (?, ?, ?, ?, ?, ?, ?)
INSERT INTO classification_dedupe (subject_key, fingerprint, result_hash, request_payload, result_payload, seen_count)
VALUES (?, ?, ?, ?, ?, ?)
""",
(
message_id,
thread_id,
subject_key,
fingerprint,
todoist_task_id,
classification_hash,
json.dumps(source_payload, sort_keys=True),
json.dumps(last_result, sort_keys=True),
result_hash,
json.dumps(request_payload, sort_keys=True),
json.dumps(result_payload, sort_keys=True),
seen_count,
),
)
else:
conn.execute(
"""
UPDATE todoist_sync
SET message_id = ?, thread_id = ?, fingerprint = ?, todoist_task_id = ?, classification_hash = ?,
source_payload = ?, last_result = ?, updated_at = CURRENT_TIMESTAMP
UPDATE classification_dedupe
SET subject_key = ?, fingerprint = ?, result_hash = ?, request_payload = ?, result_payload = ?,
seen_count = ?, updated_at = CURRENT_TIMESTAMP
WHERE id = ?
""",
(
message_id,
thread_id,
subject_key,
fingerprint,
todoist_task_id,
classification_hash,
json.dumps(source_payload, sort_keys=True),
json.dumps(last_result, sort_keys=True),
result_hash,
json.dumps(request_payload, sort_keys=True),
json.dumps(result_payload, sort_keys=True),
seen_count,
existing_id,
),
)