76 lines
2.8 KiB
Python
76 lines
2.8 KiB
Python
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import re
|
|
|
|
from app.dedupe_store import DedupeStore
|
|
from app.models import ClassificationResult, ClassifyRequest, DedupeResult
|
|
|
|
|
|
def normalize_subject(subject: str) -> str:
    """Return a canonical form of an email subject for duplicate detection.

    The subject is trimmed and lower-cased, every leading reply/forward
    marker (``re:``, ``fw:``, ``fwd:``) is removed, and each run of
    whitespace is collapsed to a single space.

    Args:
        subject: Raw subject line.

    Returns:
        The normalized subject; an empty string when nothing but markers
        and whitespace was present.
    """
    value = subject.strip().lower()
    # Threads accumulate stacked markers ("Re: Fwd: Re: ..."); strip the whole
    # run so every message of a thread normalizes to the same subject.
    value = re.sub(r"^(?:(?:re|fw|fwd)\s*:\s*)+", "", value)
    value = re.sub(r"\s+", " ", value)
    return value
|
|
|
|
|
|
def build_subject_key(request: ClassifyRequest) -> str:
    """Return a SHA-256 hex digest identifying the sender/subject pair.

    The digest covers the trimmed, lower-cased ``request.from_address``
    (an empty string when it is falsy) and the normalized subject of
    ``request.email_data``, joined by a newline.
    """
    normalized_subject = normalize_subject(request.email_data.subject)
    raw_sender = request.from_address or ""
    sender = raw_sender.strip().lower()

    hasher = hashlib.sha256()
    hasher.update(f"{sender}\n{normalized_subject}".encode())
    return hasher.hexdigest()
|
|
|
|
|
|
def build_fingerprint(request: ClassifyRequest) -> str:
    """Return a SHA-256 hex digest of the sender, subject and body prefix.

    The seed is built from the trimmed, lower-cased ``request.from_address``
    (an empty string when it is falsy), the normalized subject, and the first
    2000 characters of the whitespace-collapsed, lower-cased body, each
    separated by a newline.
    """
    subject = normalize_subject(request.email_data.subject)
    # Normalize the sender the same way build_subject_key does; otherwise two
    # messages that differ only in address casing or padding share a subject
    # key but can never share a fingerprint.
    sender = (request.from_address or "").strip().lower()
    # split()/join collapses every whitespace run and already drops leading
    # and trailing whitespace, so no extra strip() is needed.
    body = " ".join(request.email_data.body.split()).lower()
    # Only a bounded prefix of the body contributes to the fingerprint.
    seed = f"{sender}\n{subject}\n{body[:2000]}"
    return hashlib.sha256(seed.encode()).hexdigest()
|
|
|
|
|
|
def build_result_hash(result: ClassificationResult) -> str:
    """Return a SHA-256 hex digest of the classification result's content.

    The result is dumped without its ``dedupe`` field and without ``None``
    values, serialized to JSON with sorted keys so the digest does not
    depend on key order, and then hashed.
    """
    fields = result.model_dump(exclude={"dedupe"}, exclude_none=True)
    canonical = json.dumps(fields, sort_keys=True)

    hasher = hashlib.sha256()
    hasher.update(canonical.encode())
    return hasher.hexdigest()
|
|
|
|
|
|
def apply_dedupe(request: ClassifyRequest, result: ClassificationResult) -> DedupeResult:
    """Compare a classification against the dedupe store and record it.

    A ``DedupeStore`` is constructed with the path from the
    ``EMAIL_CLASSIFIER_DB_PATH`` environment variable (default
    ``.data/email_classifier.db``). The store is asked for a record matching
    the request's subject key or fingerprint, and the current request and
    result are then written back through ``insert_or_update``.

    Returns:
        A ``DedupeResult`` whose ``status`` is ``"new"`` when no record was
        found, ``"duplicate"`` when the stored result hash equals the current
        one, and ``"updated"`` otherwise. ``matched_on`` is ``"none"``,
        ``"fingerprint"`` or ``"subject"``, and ``seen_count`` is 1 for a new
        record or the stored count plus one.
    """
    db_path = os.getenv("EMAIL_CLASSIFIER_DB_PATH", ".data/email_classifier.db")
    store = DedupeStore(db_path)

    subject_key = build_subject_key(request)
    fingerprint = build_fingerprint(request)
    result_hash = build_result_hash(result)

    prior = store.find_existing(subject_key=subject_key, fingerprint=fingerprint)

    if prior:
        # A record can be returned for either key; report the stronger match
        # (fingerprint) when the stored fingerprint is the same as ours.
        if prior.get("fingerprint") == fingerprint:
            matched_on = "fingerprint"
        else:
            matched_on = "subject"
        stored_hash = prior.get("result_hash")
        seen_count = int(prior.get("seen_count", 1)) + 1
        # Same message seen again: did the classification change?
        if stored_hash == result_hash:
            status = "duplicate"
        else:
            status = "updated"
        record_id = prior["id"]
    else:
        matched_on = "none"
        seen_count = 1
        status = "new"
        record_id = None

    # Single write path: record_id is None for a first sighting and the
    # matched record's id otherwise.
    store.insert_or_update(
        existing_id=record_id,
        subject_key=subject_key,
        fingerprint=fingerprint,
        result_hash=result_hash,
        request_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
        result_payload=result.model_dump(exclude={"dedupe"}, exclude_none=True),
        seen_count=seen_count,
    )

    return DedupeResult(
        status=status,
        seen_count=seen_count,
        matched_on=matched_on,
        subject_key=subject_key,
        fingerprint=fingerprint,
    )
|