Use Outlook ids for classifier dedupe precedence
This commit is contained in:
36
app/sync.py
36
app/sync.py
@@ -16,16 +16,12 @@ def normalize_subject(subject: str) -> str:
|
||||
return value
|
||||
|
||||
|
||||
def build_subject_key(request: ClassifyRequest) -> str:
    """Return a SHA-256 hex digest identifying a sender/subject pair.

    The hashed text is the sender address (trimmed and lower-cased, with a
    missing address treated as an empty string), a newline, and the result
    of ``normalize_subject`` applied to the request's email subject.
    """
    normalized_subject = normalize_subject(request.email_data.subject)
    raw_sender = request.from_address or ""
    normalized_sender = raw_sender.strip().lower()
    key_material = f"{normalized_sender}\n{normalized_subject}"
    digest = hashlib.sha256()
    digest.update(key_material.encode())
    return digest.hexdigest()
|
||||
|
||||
|
||||
def build_fingerprint(request: ClassifyRequest) -> str:
    """Return a SHA-256 hex digest fingerprinting the message content.

    The hashed seed is four newline-separated fields, in this order:

    1. the sender address, trimmed and lower-cased;
    2. the subject, passed through ``normalize_subject``;
    3. the body preview, whitespace-collapsed and lower-cased;
    4. the first 2000 characters of the whitespace-collapsed, lower-cased body.

    A missing (falsy) sender or body preview contributes an empty string.
    """
    subject = normalize_subject(request.email_data.subject)
    # " ".join(text.split()) collapses every whitespace run to one space and
    # already drops leading/trailing whitespace, so no extra strip() is needed.
    body = " ".join(request.email_data.body.split()).lower()
    preview = " ".join((request.bodyPreview or "").split()).lower()
    sender = (request.from_address or "").strip().lower()
    # Single seed assignment: an earlier seed built from the un-normalized
    # sender was immediately overwritten and never used, so it is removed.
    seed = f"{sender}\n{subject}\n{preview}\n{body[:2000]}"
    return hashlib.sha256(seed.encode()).hexdigest()
|
||||
|
||||
|
||||
@@ -36,30 +32,41 @@ def build_result_hash(result: ClassificationResult) -> str:
|
||||
|
||||
def apply_dedupe(request: ClassifyRequest, result: ClassificationResult) -> DedupeResult:
|
||||
store = DedupeStore(os.getenv("EMAIL_CLASSIFIER_DB_PATH", ".data/email_classifier.db"))
|
||||
subject_key = build_subject_key(request)
|
||||
fingerprint = build_fingerprint(request)
|
||||
existing, matched_on = store.find_existing(
|
||||
outlook_id=request.id,
|
||||
conversation_id=request.conversationId,
|
||||
fingerprint=fingerprint,
|
||||
)
|
||||
result_hash = build_result_hash(result)
|
||||
existing = store.find_existing(subject_key=subject_key, fingerprint=fingerprint)
|
||||
|
||||
if not existing:
|
||||
store.insert_or_update(
|
||||
existing_id=None,
|
||||
subject_key=subject_key,
|
||||
outlook_id=request.id,
|
||||
conversation_id=request.conversationId,
|
||||
fingerprint=fingerprint,
|
||||
result_hash=result_hash,
|
||||
request_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
|
||||
result_payload=result.model_dump(exclude={"dedupe"}, exclude_none=True),
|
||||
seen_count=1,
|
||||
)
|
||||
return DedupeResult(status="new", seen_count=1, matched_on="none", subject_key=subject_key, fingerprint=fingerprint)
|
||||
return DedupeResult(
|
||||
status="new",
|
||||
seen_count=1,
|
||||
matched_on="none",
|
||||
message_id=request.id,
|
||||
conversation_id=request.conversationId,
|
||||
fingerprint=fingerprint,
|
||||
)
|
||||
|
||||
matched_on = "fingerprint" if existing.get("fingerprint") == fingerprint else "subject"
|
||||
previous_hash = existing.get("result_hash")
|
||||
seen_count = int(existing.get("seen_count", 1)) + 1
|
||||
status = "duplicate" if previous_hash == result_hash else "updated"
|
||||
store.insert_or_update(
|
||||
existing_id=existing["id"],
|
||||
subject_key=subject_key,
|
||||
outlook_id=request.id or existing.get("outlook_id"),
|
||||
conversation_id=request.conversationId or existing.get("conversation_id"),
|
||||
fingerprint=fingerprint,
|
||||
result_hash=result_hash,
|
||||
request_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
|
||||
@@ -70,6 +77,7 @@ def apply_dedupe(request: ClassifyRequest, result: ClassificationResult) -> Dedu
|
||||
status=status,
|
||||
seen_count=seen_count,
|
||||
matched_on=matched_on,
|
||||
subject_key=subject_key,
|
||||
message_id=request.id or existing.get("outlook_id"),
|
||||
conversation_id=request.conversationId or existing.get("conversation_id"),
|
||||
fingerprint=fingerprint,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user