Refocus classifier on rich extraction and local dedupe only
This commit is contained in:
173
app/sync.py
173
app/sync.py
@@ -3,152 +3,73 @@ from __future__ import annotations
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
from typing import Any
|
||||
import re
|
||||
|
||||
from app.dedupe_store import DedupeStore
|
||||
from app.models import ClassificationDetails, ClassificationResult, ClassifyRequest, TodoistSyncResult
|
||||
from app.todoist import TodoistClient
|
||||
from app.models import ClassificationResult, ClassifyRequest, DedupeResult
|
||||
|
||||
|
||||
def normalize_subject(subject: str) -> str:
    """Return a canonical form of an email subject for use in dedupe keys.

    The subject is trimmed and lower-cased, every leading reply/forward
    marker (``re:``, ``fw:``, ``fwd:``) is removed, and each run of
    whitespace is collapsed to a single space.
    """
    value = subject.strip().lower()
    # Strip the whole stack of markers ("re: fwd: re: ..."), not only the
    # first one, so replies and forwards of one message share a subject.
    value = re.sub(r"^(?:(?:re|fw|fwd)\s*:\s*)+", "", value)
    value = re.sub(r"\s+", " ", value)
    return value
|
||||
|
||||
|
||||
def build_subject_key(request: ClassifyRequest) -> str:
    """Return a SHA-256 hex digest identifying the sender/subject pair.

    The key is derived from the trimmed, lower-cased ``from_address``
    (empty when absent) and the normalized subject, joined by a newline.
    """
    normalized = normalize_subject(request.email_data.subject)
    sender = (request.from_address or "").strip().lower()
    digest = hashlib.sha256()
    digest.update(f"{sender}\n{normalized}".encode())
    return digest.hexdigest()
|
||||
|
||||
|
||||
def build_fingerprint(request: ClassifyRequest) -> str:
    """Return a SHA-256 hex digest fingerprinting the message content.

    The seed combines ``from_address`` (empty when absent), the normalized
    subject, and the whitespace-collapsed, lower-cased body.
    """
    subject = normalize_subject(request.email_data.subject)
    # split()/join already drops leading and trailing whitespace.
    body = " ".join(request.email_data.body.split()).lower()
    # Only the first 2000 characters of the body contribute to the hash.
    seed = f"{request.from_address or ''}\n{subject}\n{body[:2000]}"
    return hashlib.sha256(seed.encode()).hexdigest()
|
||||
|
||||
|
||||
def build_classification_hash(result: ClassificationResult) -> str:
|
||||
payload = result.model_dump(exclude={"todoist"}, exclude_none=True)
|
||||
def build_result_hash(result: ClassificationResult) -> str:
    """Return a SHA-256 hex digest of the classification result.

    The result is dumped without its ``dedupe`` field and without ``None``
    values, then serialized as JSON with sorted keys before hashing.
    """
    payload = result.model_dump(exclude={"dedupe"}, exclude_none=True)
    canonical = json.dumps(payload, sort_keys=True)
    return hashlib.sha256(canonical.encode()).hexdigest()
|
||||
|
||||
|
||||
def render_task_content(result: ClassificationResult) -> str:
    """Pick the task title for a classification result.

    Returns the first non-empty value among the suggested title, the task
    description, and the summary, falling back to a fixed placeholder.
    """
    details = result.details or ClassificationDetails()
    candidates = (details.suggested_title, result.task_description, details.summary)
    for candidate in candidates:
        if candidate:
            return candidate
    return "Email follow-up"
|
||||
|
||||
|
||||
def render_task_description(request: ClassifyRequest, result: ClassificationResult) -> str:
    """Render the multi-section task description for a classified email.

    Sections are emitted in a fixed order and only when their value is
    non-empty: free-text sections, then bulleted list sections, then a
    "Source metadata" section built from the request. Sections are
    separated by a blank line.
    """
    details = result.details or ClassificationDetails()

    # Heading -> single text value, in output order.
    text_blocks = (
        ("Summary", details.summary),
        ("Action", result.task_description),
        ("Notes", details.suggested_notes),
        ("Deadline", details.deadline),
    )
    # Heading -> list of items rendered as "- " bullets, in output order.
    bullet_blocks = (
        ("People", details.people),
        ("Organizations", details.organizations),
        ("Attachments referenced", details.attachments_referenced),
        ("Next steps", details.next_steps),
        ("Key points", details.key_points),
    )
    # Label -> request field for the trailing metadata section.
    source_fields = (
        ("message_id", request.message_id),
        ("thread_id", request.thread_id),
        ("from", request.from_address),
        ("received_at", request.received_at),
    )

    sections: list[str] = [f"{heading}:\n{text}" for heading, text in text_blocks if text]
    sections.extend(
        f"{heading}:\n- " + "\n- ".join(items)
        for heading, items in bullet_blocks
        if items
    )

    metadata = [f"{label}: {value}" for label, value in source_fields if value]
    if metadata:
        sections.append("Source metadata:\n" + "\n".join(metadata))

    return "\n\n".join(sections).strip()
|
||||
|
||||
|
||||
async def sync_todoist(request: ClassifyRequest, result: ClassificationResult) -> TodoistSyncResult:
|
||||
if not result.needs_action:
|
||||
return TodoistSyncResult(status="skipped", message="No action required.")
|
||||
client = TodoistClient()
|
||||
if not client.enabled:
|
||||
return TodoistSyncResult(status="disabled", message="Todoist is not configured.")
|
||||
|
||||
def apply_dedupe(request: ClassifyRequest, result: ClassificationResult) -> DedupeResult:
|
||||
store = DedupeStore(os.getenv("EMAIL_CLASSIFIER_DB_PATH", ".data/email_classifier.db"))
|
||||
subject_key = build_subject_key(request)
|
||||
fingerprint = build_fingerprint(request)
|
||||
existing = store.find_existing(message_id=request.message_id, thread_id=request.thread_id, fingerprint=fingerprint)
|
||||
dedupe_match = "none"
|
||||
if existing:
|
||||
if request.message_id and existing.get("message_id") == request.message_id:
|
||||
dedupe_match = "message_id"
|
||||
elif request.thread_id and existing.get("thread_id") == request.thread_id:
|
||||
dedupe_match = "thread_id"
|
||||
else:
|
||||
dedupe_match = "fingerprint"
|
||||
|
||||
content = render_task_content(result)
|
||||
description = render_task_description(request, result)
|
||||
classification_hash = build_classification_hash(result)
|
||||
result_hash = build_result_hash(result)
|
||||
existing = store.find_existing(subject_key=subject_key, fingerprint=fingerprint)
|
||||
|
||||
if not existing:
|
||||
created = await client.create_task(content=content, description=description, due_string=(result.details.deadline if result.details else None))
|
||||
task_id = str(created.get("id"))
|
||||
store.upsert(
|
||||
store.insert_or_update(
|
||||
existing_id=None,
|
||||
message_id=request.message_id,
|
||||
thread_id=request.thread_id,
|
||||
subject_key=subject_key,
|
||||
fingerprint=fingerprint,
|
||||
todoist_task_id=task_id,
|
||||
classification_hash=classification_hash,
|
||||
source_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
|
||||
last_result=result.model_dump(exclude_none=True),
|
||||
result_hash=result_hash,
|
||||
request_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
|
||||
result_payload=result.model_dump(exclude={"dedupe"}, exclude_none=True),
|
||||
seen_count=1,
|
||||
)
|
||||
return TodoistSyncResult(status="created", task_id=task_id, dedupe_match=dedupe_match)
|
||||
return DedupeResult(status="new", seen_count=1, matched_on="none", subject_key=subject_key, fingerprint=fingerprint)
|
||||
|
||||
task_id = str(existing["todoist_task_id"])
|
||||
if existing.get("classification_hash") == classification_hash:
|
||||
store.upsert(
|
||||
existing_id=existing["id"],
|
||||
message_id=request.message_id,
|
||||
thread_id=request.thread_id,
|
||||
fingerprint=fingerprint,
|
||||
todoist_task_id=task_id,
|
||||
classification_hash=classification_hash,
|
||||
source_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
|
||||
last_result=result.model_dump(exclude_none=True),
|
||||
)
|
||||
return TodoistSyncResult(status="unchanged", task_id=task_id, dedupe_match=dedupe_match, message="Existing task already reflects this classification.")
|
||||
|
||||
await client.update_task(task_id, content=content, description=description, due_string=(result.details.deadline if result.details else None))
|
||||
comment_added = False
|
||||
previous_details = (existing.get("last_result") or {}).get("details") or {}
|
||||
current_details = (result.details.model_dump(exclude_none=True) if result.details else {})
|
||||
if _material_context_changed(previous_details, current_details):
|
||||
await client.add_comment(task_id, _build_update_comment(result))
|
||||
comment_added = True
|
||||
|
||||
store.upsert(
|
||||
matched_on = "fingerprint" if existing.get("fingerprint") == fingerprint else "subject"
|
||||
previous_hash = existing.get("result_hash")
|
||||
seen_count = int(existing.get("seen_count", 1)) + 1
|
||||
status = "duplicate" if previous_hash == result_hash else "updated"
|
||||
store.insert_or_update(
|
||||
existing_id=existing["id"],
|
||||
message_id=request.message_id,
|
||||
thread_id=request.thread_id,
|
||||
subject_key=subject_key,
|
||||
fingerprint=fingerprint,
|
||||
result_hash=result_hash,
|
||||
request_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
|
||||
result_payload=result.model_dump(exclude={"dedupe"}, exclude_none=True),
|
||||
seen_count=seen_count,
|
||||
)
|
||||
return DedupeResult(
|
||||
status=status,
|
||||
seen_count=seen_count,
|
||||
matched_on=matched_on,
|
||||
subject_key=subject_key,
|
||||
fingerprint=fingerprint,
|
||||
todoist_task_id=task_id,
|
||||
classification_hash=classification_hash,
|
||||
source_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
|
||||
last_result=result.model_dump(exclude_none=True),
|
||||
)
|
||||
return TodoistSyncResult(status="updated", task_id=task_id, comment_added=comment_added, dedupe_match=dedupe_match)
|
||||
|
||||
|
||||
def _material_context_changed(previous: dict[str, Any], current: dict[str, Any]) -> bool:
|
||||
keys = {"summary", "deadline", "attachments_referenced", "next_steps", "key_points", "people"}
|
||||
return any(previous.get(k) != current.get(k) for k in keys)
|
||||
|
||||
|
||||
def _build_update_comment(result: ClassificationResult) -> str:
    """Build the newline-joined comment text describing an updated result.

    Starts with a fixed header line, then adds one line per non-empty
    field: summary, deadline, next steps, and key points.
    """
    details = result.details or ClassificationDetails()
    lines = ["Email classifier update:"]

    for label, text in (("Summary", details.summary), ("Deadline", details.deadline)):
        if text:
            lines.append(f"{label}: {text}")

    if details.next_steps:
        lines.append("Next steps: " + "; ".join(details.next_steps))
    if details.key_points:
        # Only the first four key points are included in the comment.
        lines.append("Key points: " + "; ".join(details.key_points[:4]))

    return "\n".join(lines)
|
||||
|
||||
Reference in New Issue
Block a user