"""Dedupe helpers for classified e-mails.

Derives stable keys from a classify request (a sender+subject key and a
sender+subject+body fingerprint), hashes the classification result, and
records each request in a ``DedupeStore`` so repeated e-mails can be
reported as ``new``, ``duplicate`` or ``updated``.
"""

from __future__ import annotations

import hashlib
import json
import os
import re

from app.dedupe_store import DedupeStore
from app.models import ClassificationResult, ClassifyRequest, DedupeResult

# Environment variable naming the store path, and the path used when unset.
_DB_PATH_ENV = "EMAIL_CLASSIFIER_DB_PATH"
_DEFAULT_DB_PATH = ".data/email_classifier.db"

# One or more stacked reply/forward markers at the start of a subject,
# e.g. "re: fwd: re:". Applied to text that is already lower-cased.
_REPLY_PREFIX_RE = re.compile(r"^(?:(?:re|fw|fwd)\s*:\s*)+")
_WHITESPACE_RE = re.compile(r"\s+")

# Only this many characters of the normalised body feed the fingerprint.
_FINGERPRINT_BODY_CHARS = 2000


def _normalize_sender(address: str | None) -> str:
    """Return *address* stripped and lower-cased; ``None`` becomes ``""``."""
    return (address or "").strip().lower()


def _sha256_hex(text: str) -> str:
    """Return the hex SHA-256 digest of *text* encoded with the default codec."""
    return hashlib.sha256(text.encode()).hexdigest()


def normalize_subject(subject: str) -> str:
    """Return a canonical form of an e-mail subject line.

    The subject is stripped and lower-cased, every leading ``re:`` / ``fw:`` /
    ``fwd:`` marker is removed, and runs of whitespace collapse to one space.

    Stacked markers are removed as a group, so ``"Re: Fwd: Re: hello"``
    normalises to ``"hello"`` rather than ``"fwd: re: hello"``.
    """
    value = subject.strip().lower()
    value = _REPLY_PREFIX_RE.sub("", value)
    return _WHITESPACE_RE.sub(" ", value)


def build_subject_key(request: ClassifyRequest) -> str:
    """Return the SHA-256 hex digest of the normalised sender and subject."""
    subject = normalize_subject(request.email_data.subject)
    sender = _normalize_sender(request.from_address)
    return _sha256_hex(f"{sender}\n{subject}")


def build_fingerprint(request: ClassifyRequest) -> str:
    """Return the SHA-256 hex digest of sender, subject and leading body text.

    The sender is normalised exactly as in ``build_subject_key`` so the same
    address written with different case or surrounding whitespace does not
    produce a different fingerprint. The body is whitespace-collapsed and
    lower-cased, and only its first ``_FINGERPRINT_BODY_CHARS`` characters
    are hashed.
    """
    subject = normalize_subject(request.email_data.subject)
    sender = _normalize_sender(request.from_address)
    # split()/join already drops leading and trailing whitespace.
    body = " ".join(request.email_data.body.split()).lower()
    return _sha256_hex(f"{sender}\n{subject}\n{body[:_FINGERPRINT_BODY_CHARS]}")


def build_result_hash(result: ClassificationResult) -> str:
    """Return the SHA-256 hex digest of *result* without its ``dedupe`` field.

    ``None`` fields are left out of the dump and keys are sorted, so the hash
    does not depend on key order.
    """
    payload = result.model_dump(exclude={"dedupe"}, exclude_none=True)
    # NOTE(review): json.dumps raises TypeError for values it cannot
    # serialise; assumes the dump is JSON-compatible -- TODO confirm.
    return _sha256_hex(json.dumps(payload, sort_keys=True))


def apply_dedupe(request: ClassifyRequest, result: ClassificationResult) -> DedupeResult:
    """Record *request* and *result* in the store and report the dedupe status.

    The store path is read from ``EMAIL_CLASSIFIER_DB_PATH`` on each call and
    falls back to ``.data/email_classifier.db``.

    Returns:
        A ``DedupeResult`` whose ``status`` is ``"new"`` when the store has no
        existing record, ``"duplicate"`` when the stored result hash equals
        the new one, and ``"updated"`` otherwise. ``matched_on`` is
        ``"none"``, ``"fingerprint"`` or ``"subject"``.
    """
    store = DedupeStore(os.getenv(_DB_PATH_ENV, _DEFAULT_DB_PATH))
    subject_key = build_subject_key(request)
    fingerprint = build_fingerprint(request)
    result_hash = build_result_hash(result)

    existing = store.find_existing(subject_key=subject_key, fingerprint=fingerprint)

    if existing:
        existing_id = existing["id"]
        fingerprint_matches = existing.get("fingerprint") == fingerprint
        matched_on = "fingerprint" if fingerprint_matches else "subject"
        # A stored NULL/None count is treated as a single prior sighting
        # instead of crashing in int(None).
        previous_seen = existing.get("seen_count")
        seen_count = (1 if previous_seen is None else int(previous_seen)) + 1
        status = "duplicate" if existing.get("result_hash") == result_hash else "updated"
    else:
        existing_id = None
        matched_on = "none"
        seen_count = 1
        status = "new"

    store.insert_or_update(
        existing_id=existing_id,
        subject_key=subject_key,
        fingerprint=fingerprint,
        result_hash=result_hash,
        # The API key is excluded from the payload handed to the store.
        request_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
        result_payload=result.model_dump(exclude={"dedupe"}, exclude_none=True),
        seen_count=seen_count,
    )
    return DedupeResult(
        status=status,
        seen_count=seen_count,
        matched_on=matched_on,
        subject_key=subject_key,
        fingerprint=fingerprint,
    )