Refocus classifier on rich extraction and local dedupe only
This commit is contained in:
@@ -21,86 +21,82 @@ class DedupeStore:
|
||||
with self._connect() as conn:
|
||||
conn.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS todoist_sync (
|
||||
CREATE TABLE IF NOT EXISTS classification_dedupe (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
message_id TEXT,
|
||||
thread_id TEXT,
|
||||
subject_key TEXT NOT NULL,
|
||||
fingerprint TEXT NOT NULL,
|
||||
todoist_task_id TEXT NOT NULL,
|
||||
classification_hash TEXT NOT NULL,
|
||||
source_payload TEXT NOT NULL,
|
||||
last_result TEXT NOT NULL,
|
||||
result_hash TEXT NOT NULL,
|
||||
request_payload TEXT NOT NULL,
|
||||
result_payload TEXT NOT NULL,
|
||||
seen_count INTEGER NOT NULL DEFAULT 1,
|
||||
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
|
||||
)
|
||||
"""
|
||||
)
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_sync_message_id ON todoist_sync(message_id)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_sync_thread_id ON todoist_sync(thread_id)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_sync_fingerprint ON todoist_sync(fingerprint)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_subject_key ON classification_dedupe(subject_key)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_fingerprint ON classification_dedupe(fingerprint)")
|
||||
|
||||
def find_existing(self, *, message_id: str | None, thread_id: str | None, fingerprint: str) -> dict[str, Any] | None:
|
||||
queries = []
|
||||
if message_id:
|
||||
queries.append(("SELECT * FROM todoist_sync WHERE message_id = ? ORDER BY id DESC LIMIT 1", (message_id,)))
|
||||
if thread_id:
|
||||
queries.append(("SELECT * FROM todoist_sync WHERE thread_id = ? ORDER BY id DESC LIMIT 1", (thread_id,)))
|
||||
queries.append(("SELECT * FROM todoist_sync WHERE fingerprint = ? ORDER BY id DESC LIMIT 1", (fingerprint,)))
|
||||
def find_existing(self, *, subject_key: str, fingerprint: str) -> dict[str, Any] | None:
|
||||
with self._connect() as conn:
|
||||
for sql, params in queries:
|
||||
row = conn.execute(sql, params).fetchone()
|
||||
if row:
|
||||
data = dict(row)
|
||||
data["source_payload"] = json.loads(data["source_payload"])
|
||||
data["last_result"] = json.loads(data["last_result"])
|
||||
return data
|
||||
return None
|
||||
row = conn.execute(
|
||||
"SELECT * FROM classification_dedupe WHERE fingerprint = ? ORDER BY id DESC LIMIT 1",
|
||||
(fingerprint,),
|
||||
).fetchone()
|
||||
if row is None:
|
||||
row = conn.execute(
|
||||
"SELECT * FROM classification_dedupe WHERE subject_key = ? ORDER BY id DESC LIMIT 1",
|
||||
(subject_key,),
|
||||
).fetchone()
|
||||
if not row:
|
||||
return None
|
||||
data = dict(row)
|
||||
data["request_payload"] = json.loads(data["request_payload"])
|
||||
data["result_payload"] = json.loads(data["result_payload"])
|
||||
return data
|
||||
|
||||
def upsert(
|
||||
def insert_or_update(
|
||||
self,
|
||||
*,
|
||||
existing_id: int | None,
|
||||
message_id: str | None,
|
||||
thread_id: str | None,
|
||||
subject_key: str,
|
||||
fingerprint: str,
|
||||
todoist_task_id: str,
|
||||
classification_hash: str,
|
||||
source_payload: dict[str, Any],
|
||||
last_result: dict[str, Any],
|
||||
result_hash: str,
|
||||
request_payload: dict[str, Any],
|
||||
result_payload: dict[str, Any],
|
||||
seen_count: int,
|
||||
) -> None:
|
||||
with self._connect() as conn:
|
||||
if existing_id is None:
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT INTO todoist_sync (message_id, thread_id, fingerprint, todoist_task_id, classification_hash, source_payload, last_result)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
INSERT INTO classification_dedupe (subject_key, fingerprint, result_hash, request_payload, result_payload, seen_count)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
""",
|
||||
(
|
||||
message_id,
|
||||
thread_id,
|
||||
subject_key,
|
||||
fingerprint,
|
||||
todoist_task_id,
|
||||
classification_hash,
|
||||
json.dumps(source_payload, sort_keys=True),
|
||||
json.dumps(last_result, sort_keys=True),
|
||||
result_hash,
|
||||
json.dumps(request_payload, sort_keys=True),
|
||||
json.dumps(result_payload, sort_keys=True),
|
||||
seen_count,
|
||||
),
|
||||
)
|
||||
else:
|
||||
conn.execute(
|
||||
"""
|
||||
UPDATE todoist_sync
|
||||
SET message_id = ?, thread_id = ?, fingerprint = ?, todoist_task_id = ?, classification_hash = ?,
|
||||
source_payload = ?, last_result = ?, updated_at = CURRENT_TIMESTAMP
|
||||
UPDATE classification_dedupe
|
||||
SET subject_key = ?, fingerprint = ?, result_hash = ?, request_payload = ?, result_payload = ?,
|
||||
seen_count = ?, updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = ?
|
||||
""",
|
||||
(
|
||||
message_id,
|
||||
thread_id,
|
||||
subject_key,
|
||||
fingerprint,
|
||||
todoist_task_id,
|
||||
classification_hash,
|
||||
json.dumps(source_payload, sort_keys=True),
|
||||
json.dumps(last_result, sort_keys=True),
|
||||
result_hash,
|
||||
json.dumps(request_payload, sort_keys=True),
|
||||
json.dumps(result_payload, sort_keys=True),
|
||||
seen_count,
|
||||
existing_id,
|
||||
),
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user