Use Outlook ids for classifier dedupe precedence
This commit is contained in:
97
README.md
97
README.md
@@ -1,6 +1,6 @@
|
|||||||
# email-classifier
|
# email-classifier
|
||||||
|
|
||||||
FastAPI service that classifies email using a configurable LLM backend, returns richer structured extraction, and tracks duplicate classifications using fingerprint-based dedupe.
|
FastAPI service that classifies email using a configurable LLM backend, returns richer structured extraction, and tracks duplicate classifications using Outlook-aware dedupe.
|
||||||
|
|
||||||
## Environment configuration
|
## Environment configuration
|
||||||
|
|
||||||
@@ -31,88 +31,101 @@ Optional local dedupe store path:
|
|||||||
export EMAIL_CLASSIFIER_DB_PATH=.data/email_classifier.db
|
export EMAIL_CLASSIFIER_DB_PATH=.data/email_classifier.db
|
||||||
```
|
```
|
||||||
|
|
||||||
## API
|
## Input shape
|
||||||
|
|
||||||
### POST /classify
|
Designed around real Outlook message payloads. Relevant fields:
|
||||||
|
|
||||||
This overhaul is intended to return richer extraction. Top-level compatibility is not required.
|
|
||||||
|
|
||||||
Request example:
|
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
|
"id": "AAMk...",
|
||||||
|
"internetMessageId": "<...@...>",
|
||||||
|
"conversationId": "AAQk...",
|
||||||
|
"subject": "MB Printer",
|
||||||
|
"bodyPreview": "Good morning, ...",
|
||||||
|
"receivedDateTime": "2026-02-19T15:27:35Z",
|
||||||
|
"sentDateTime": "2026-02-19T15:27:32Z",
|
||||||
|
"hasAttachments": false,
|
||||||
|
"importance": "normal",
|
||||||
|
"isRead": false,
|
||||||
|
"body": {
|
||||||
|
"contentType": "html",
|
||||||
|
"content": "..."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
API request example:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id": "AAMk...",
|
||||||
|
"internetMessageId": "<...@...>",
|
||||||
|
"conversationId": "AAQk...",
|
||||||
|
"bodyPreview": "Good morning, ...",
|
||||||
|
"receivedDateTime": "2026-02-19T15:27:35Z",
|
||||||
|
"sentDateTime": "2026-02-19T15:27:32Z",
|
||||||
|
"hasAttachments": false,
|
||||||
|
"importance": "normal",
|
||||||
|
"isRead": false,
|
||||||
"email_data": {
|
"email_data": {
|
||||||
"subject": "Can you review this by Friday?",
|
"subject": "MB Printer",
|
||||||
"body": "Hi Daniel, please review the attached budget proposal."
|
"body": "<html>...</html>"
|
||||||
},
|
},
|
||||||
"from_address": "sender@example.com",
|
|
||||||
"received_at": "2026-04-09T12:55:00Z",
|
|
||||||
"provider": "anthropic",
|
"provider": "anthropic",
|
||||||
"base_url": "https://api.minimax.io/anthropic",
|
"base_url": "https://api.minimax.io/anthropic",
|
||||||
"model": "MiniMax-M2.7"
|
"model": "MiniMax-M2.7"
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Response example:
|
## Response example
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"needs_action": true,
|
"needs_action": true,
|
||||||
"category": "question",
|
"category": "question",
|
||||||
"priority": "high",
|
"priority": "high",
|
||||||
"task_description": "Review the budget proposal and respond by Friday",
|
"task_description": "Investigate MB Printer issue and reply",
|
||||||
"reasoning": "Direct request with a deadline requires follow-up",
|
"reasoning": "The email appears to describe an issue requiring action.",
|
||||||
"confidence": 0.91,
|
"confidence": 0.91,
|
||||||
"details": {
|
"details": {
|
||||||
"summary": "Budget proposal review requested with Friday deadline.",
|
"summary": "Printer issue reported in the MB area.",
|
||||||
"suggested_title": "Review budget proposal and respond by Friday",
|
"suggested_title": "Handle MB Printer issue",
|
||||||
"suggested_notes": "Requester asked for feedback on attached budget proposal before Friday.",
|
"suggested_notes": "Review the printer problem, identify urgency, and reply with next steps.",
|
||||||
"deadline": "Friday",
|
"deadline": null,
|
||||||
"people": ["Daniel"],
|
"people": [],
|
||||||
"organizations": [],
|
"organizations": [],
|
||||||
"attachments_referenced": ["budget proposal"],
|
"attachments_referenced": [],
|
||||||
"next_steps": ["Review attachment", "Reply with feedback"],
|
"next_steps": ["Review issue", "Respond to sender"],
|
||||||
"key_points": ["Deadline is Friday"],
|
"key_points": ["Printer issue reported"],
|
||||||
"source_signals": ["request", "deadline"],
|
"source_signals": ["request"],
|
||||||
"dedupe_key": "..."
|
"dedupe_key": "..."
|
||||||
},
|
},
|
||||||
"dedupe": {
|
"dedupe": {
|
||||||
"status": "new",
|
"status": "new",
|
||||||
"seen_count": 1,
|
"seen_count": 1,
|
||||||
"matched_on": "none",
|
"matched_on": "none",
|
||||||
"subject_key": "...",
|
"message_id": "AAMk...",
|
||||||
|
"conversation_id": "AAQk...",
|
||||||
"fingerprint": "..."
|
"fingerprint": "..."
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
## Dedupe behavior
|
## Dedupe precedence
|
||||||
|
|
||||||
The API does not create or update Todoist tasks.
|
1. `id` for exact Outlook message match
|
||||||
It only returns richer extraction and local dedupe metadata for downstream automation like n8n.
|
2. `conversationId` for thread grouping
|
||||||
|
3. normalized subject + preview/body fingerprint fallback
|
||||||
Matching strategy:
|
|
||||||
- normalized subject plus sender-derived `subject_key`
|
|
||||||
- full content fingerprint fallback based on sender + normalized subject + cleaned body
|
|
||||||
|
|
||||||
Statuses:
|
Statuses:
|
||||||
- `new`: no prior similar email seen
|
- `new`: no prior similar email seen
|
||||||
- `duplicate`: same dedupe target and same extracted result as before
|
- `duplicate`: same dedupe target and same extracted result as before
|
||||||
- `updated`: matched prior email, but extracted result changed
|
- `updated`: matched prior email, but extracted result changed
|
||||||
|
|
||||||
This is intentionally heuristic, not perfect.
|
This is intentionally heuristic for the fallback path.
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
- `app/classifier.py`: classification orchestration and dedupe handoff
|
|
||||||
- `app/prompts.py`: richer extraction prompt
|
|
||||||
- `app/sync.py`: subject normalization, fingerprinting, dedupe application
|
|
||||||
- `app/dedupe_store.py`: SQLite-backed dedupe store
|
|
||||||
- `app/llm_adapters.py`: provider adapters
|
|
||||||
- `app/config.py`: LLM settings
|
|
||||||
|
|
||||||
## Notes
|
## Notes
|
||||||
|
|
||||||
- No Todoist integration lives in this API.
|
- No Todoist integration lives in this API.
|
||||||
- Dedupe is best-effort and designed to help downstream workflows avoid obvious duplicates.
|
- Dedupe is local and intended to help downstream workflows avoid obvious duplicates.
|
||||||
- SQLite is used for lightweight local dedupe tracking.
|
- SQLite is used for lightweight local dedupe tracking.
|
||||||
|
|||||||
@@ -23,7 +23,8 @@ class DedupeStore:
|
|||||||
"""
|
"""
|
||||||
CREATE TABLE IF NOT EXISTS classification_dedupe (
|
CREATE TABLE IF NOT EXISTS classification_dedupe (
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
subject_key TEXT NOT NULL,
|
outlook_id TEXT,
|
||||||
|
conversation_id TEXT,
|
||||||
fingerprint TEXT NOT NULL,
|
fingerprint TEXT NOT NULL,
|
||||||
result_hash TEXT NOT NULL,
|
result_hash TEXT NOT NULL,
|
||||||
request_payload TEXT NOT NULL,
|
request_payload TEXT NOT NULL,
|
||||||
@@ -34,22 +35,35 @@ class DedupeStore:
|
|||||||
)
|
)
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_subject_key ON classification_dedupe(subject_key)")
|
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_outlook_id ON classification_dedupe(outlook_id)")
|
||||||
|
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_conversation_id ON classification_dedupe(conversation_id)")
|
||||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_fingerprint ON classification_dedupe(fingerprint)")
|
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_fingerprint ON classification_dedupe(fingerprint)")
|
||||||
|
|
||||||
def find_existing(self, *, subject_key: str, fingerprint: str) -> dict[str, Any] | None:
|
def find_existing(self, *, outlook_id: str | None, conversation_id: str | None, fingerprint: str) -> tuple[dict[str, Any] | None, str]:
|
||||||
with self._connect() as conn:
|
with self._connect() as conn:
|
||||||
|
if outlook_id:
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT * FROM classification_dedupe WHERE outlook_id = ? ORDER BY id DESC LIMIT 1",
|
||||||
|
(outlook_id,),
|
||||||
|
).fetchone()
|
||||||
|
if row:
|
||||||
|
return self._decode(row), "id"
|
||||||
|
if conversation_id:
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT * FROM classification_dedupe WHERE conversation_id = ? ORDER BY id DESC LIMIT 1",
|
||||||
|
(conversation_id,),
|
||||||
|
).fetchone()
|
||||||
|
if row:
|
||||||
|
return self._decode(row), "conversation"
|
||||||
row = conn.execute(
|
row = conn.execute(
|
||||||
"SELECT * FROM classification_dedupe WHERE fingerprint = ? ORDER BY id DESC LIMIT 1",
|
"SELECT * FROM classification_dedupe WHERE fingerprint = ? ORDER BY id DESC LIMIT 1",
|
||||||
(fingerprint,),
|
(fingerprint,),
|
||||||
).fetchone()
|
).fetchone()
|
||||||
if row is None:
|
if row:
|
||||||
row = conn.execute(
|
return self._decode(row), "fingerprint"
|
||||||
"SELECT * FROM classification_dedupe WHERE subject_key = ? ORDER BY id DESC LIMIT 1",
|
return None, "none"
|
||||||
(subject_key,),
|
|
||||||
).fetchone()
|
def _decode(self, row: sqlite3.Row) -> dict[str, Any]:
|
||||||
if not row:
|
|
||||||
return None
|
|
||||||
data = dict(row)
|
data = dict(row)
|
||||||
data["request_payload"] = json.loads(data["request_payload"])
|
data["request_payload"] = json.loads(data["request_payload"])
|
||||||
data["result_payload"] = json.loads(data["result_payload"])
|
data["result_payload"] = json.loads(data["result_payload"])
|
||||||
@@ -59,7 +73,8 @@ class DedupeStore:
|
|||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
existing_id: int | None,
|
existing_id: int | None,
|
||||||
subject_key: str,
|
outlook_id: str | None,
|
||||||
|
conversation_id: str | None,
|
||||||
fingerprint: str,
|
fingerprint: str,
|
||||||
result_hash: str,
|
result_hash: str,
|
||||||
request_payload: dict[str, Any],
|
request_payload: dict[str, Any],
|
||||||
@@ -70,11 +85,12 @@ class DedupeStore:
|
|||||||
if existing_id is None:
|
if existing_id is None:
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"""
|
"""
|
||||||
INSERT INTO classification_dedupe (subject_key, fingerprint, result_hash, request_payload, result_payload, seen_count)
|
INSERT INTO classification_dedupe (outlook_id, conversation_id, fingerprint, result_hash, request_payload, result_payload, seen_count)
|
||||||
VALUES (?, ?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||||
""",
|
""",
|
||||||
(
|
(
|
||||||
subject_key,
|
outlook_id,
|
||||||
|
conversation_id,
|
||||||
fingerprint,
|
fingerprint,
|
||||||
result_hash,
|
result_hash,
|
||||||
json.dumps(request_payload, sort_keys=True),
|
json.dumps(request_payload, sort_keys=True),
|
||||||
@@ -86,12 +102,13 @@ class DedupeStore:
|
|||||||
conn.execute(
|
conn.execute(
|
||||||
"""
|
"""
|
||||||
UPDATE classification_dedupe
|
UPDATE classification_dedupe
|
||||||
SET subject_key = ?, fingerprint = ?, result_hash = ?, request_payload = ?, result_payload = ?,
|
SET outlook_id = ?, conversation_id = ?, fingerprint = ?, result_hash = ?, request_payload = ?, result_payload = ?,
|
||||||
seen_count = ?, updated_at = CURRENT_TIMESTAMP
|
seen_count = ?, updated_at = CURRENT_TIMESTAMP
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
""",
|
""",
|
||||||
(
|
(
|
||||||
subject_key,
|
outlook_id,
|
||||||
|
conversation_id,
|
||||||
fingerprint,
|
fingerprint,
|
||||||
result_hash,
|
result_hash,
|
||||||
json.dumps(request_payload, sort_keys=True),
|
json.dumps(request_payload, sort_keys=True),
|
||||||
|
|||||||
@@ -17,8 +17,17 @@ class ClassifyRequest(BaseModel):
|
|||||||
base_url: str | None = None
|
base_url: str | None = None
|
||||||
api_key: str | None = Field(default=None, exclude=True)
|
api_key: str | None = Field(default=None, exclude=True)
|
||||||
temperature: float | None = None
|
temperature: float | None = None
|
||||||
|
|
||||||
|
id: str | None = None
|
||||||
|
internetMessageId: str | None = None
|
||||||
|
conversationId: str | None = None
|
||||||
|
bodyPreview: str | None = None
|
||||||
|
receivedDateTime: str | None = None
|
||||||
|
sentDateTime: str | None = None
|
||||||
|
hasAttachments: bool | None = None
|
||||||
|
importance: str | None = None
|
||||||
|
isRead: bool | None = None
|
||||||
from_address: str | None = None
|
from_address: str | None = None
|
||||||
received_at: str | None = None
|
|
||||||
|
|
||||||
|
|
||||||
class ClassificationDetails(BaseModel):
|
class ClassificationDetails(BaseModel):
|
||||||
@@ -38,8 +47,9 @@ class ClassificationDetails(BaseModel):
|
|||||||
class DedupeResult(BaseModel):
|
class DedupeResult(BaseModel):
|
||||||
status: Literal["new", "duplicate", "updated"]
|
status: Literal["new", "duplicate", "updated"]
|
||||||
seen_count: int = 1
|
seen_count: int = 1
|
||||||
matched_on: Literal["none", "subject", "fingerprint"] = "none"
|
matched_on: Literal["none", "id", "conversation", "fingerprint"] = "none"
|
||||||
subject_key: str
|
message_id: str | None = None
|
||||||
|
conversation_id: str | None = None
|
||||||
fingerprint: str
|
fingerprint: str
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
36
app/sync.py
36
app/sync.py
@@ -16,16 +16,12 @@ def normalize_subject(subject: str) -> str:
|
|||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
def build_subject_key(request: ClassifyRequest) -> str:
|
|
||||||
subject = normalize_subject(request.email_data.subject)
|
|
||||||
sender = (request.from_address or "").strip().lower()
|
|
||||||
return hashlib.sha256(f"{sender}\n{subject}".encode()).hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def build_fingerprint(request: ClassifyRequest) -> str:
|
def build_fingerprint(request: ClassifyRequest) -> str:
|
||||||
subject = normalize_subject(request.email_data.subject)
|
subject = normalize_subject(request.email_data.subject)
|
||||||
body = " ".join(request.email_data.body.split()).strip().lower()
|
body = " ".join(request.email_data.body.split()).strip().lower()
|
||||||
seed = f"{request.from_address or ''}\n{subject}\n{body[:2000]}"
|
preview = " ".join((request.bodyPreview or "").split()).strip().lower()
|
||||||
|
sender = (request.from_address or "").strip().lower()
|
||||||
|
seed = f"{sender}\n{subject}\n{preview}\n{body[:2000]}"
|
||||||
return hashlib.sha256(seed.encode()).hexdigest()
|
return hashlib.sha256(seed.encode()).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
@@ -36,30 +32,41 @@ def build_result_hash(result: ClassificationResult) -> str:
|
|||||||
|
|
||||||
def apply_dedupe(request: ClassifyRequest, result: ClassificationResult) -> DedupeResult:
|
def apply_dedupe(request: ClassifyRequest, result: ClassificationResult) -> DedupeResult:
|
||||||
store = DedupeStore(os.getenv("EMAIL_CLASSIFIER_DB_PATH", ".data/email_classifier.db"))
|
store = DedupeStore(os.getenv("EMAIL_CLASSIFIER_DB_PATH", ".data/email_classifier.db"))
|
||||||
subject_key = build_subject_key(request)
|
|
||||||
fingerprint = build_fingerprint(request)
|
fingerprint = build_fingerprint(request)
|
||||||
|
existing, matched_on = store.find_existing(
|
||||||
|
outlook_id=request.id,
|
||||||
|
conversation_id=request.conversationId,
|
||||||
|
fingerprint=fingerprint,
|
||||||
|
)
|
||||||
result_hash = build_result_hash(result)
|
result_hash = build_result_hash(result)
|
||||||
existing = store.find_existing(subject_key=subject_key, fingerprint=fingerprint)
|
|
||||||
|
|
||||||
if not existing:
|
if not existing:
|
||||||
store.insert_or_update(
|
store.insert_or_update(
|
||||||
existing_id=None,
|
existing_id=None,
|
||||||
subject_key=subject_key,
|
outlook_id=request.id,
|
||||||
|
conversation_id=request.conversationId,
|
||||||
fingerprint=fingerprint,
|
fingerprint=fingerprint,
|
||||||
result_hash=result_hash,
|
result_hash=result_hash,
|
||||||
request_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
|
request_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
|
||||||
result_payload=result.model_dump(exclude={"dedupe"}, exclude_none=True),
|
result_payload=result.model_dump(exclude={"dedupe"}, exclude_none=True),
|
||||||
seen_count=1,
|
seen_count=1,
|
||||||
)
|
)
|
||||||
return DedupeResult(status="new", seen_count=1, matched_on="none", subject_key=subject_key, fingerprint=fingerprint)
|
return DedupeResult(
|
||||||
|
status="new",
|
||||||
|
seen_count=1,
|
||||||
|
matched_on="none",
|
||||||
|
message_id=request.id,
|
||||||
|
conversation_id=request.conversationId,
|
||||||
|
fingerprint=fingerprint,
|
||||||
|
)
|
||||||
|
|
||||||
matched_on = "fingerprint" if existing.get("fingerprint") == fingerprint else "subject"
|
|
||||||
previous_hash = existing.get("result_hash")
|
previous_hash = existing.get("result_hash")
|
||||||
seen_count = int(existing.get("seen_count", 1)) + 1
|
seen_count = int(existing.get("seen_count", 1)) + 1
|
||||||
status = "duplicate" if previous_hash == result_hash else "updated"
|
status = "duplicate" if previous_hash == result_hash else "updated"
|
||||||
store.insert_or_update(
|
store.insert_or_update(
|
||||||
existing_id=existing["id"],
|
existing_id=existing["id"],
|
||||||
subject_key=subject_key,
|
outlook_id=request.id or existing.get("outlook_id"),
|
||||||
|
conversation_id=request.conversationId or existing.get("conversation_id"),
|
||||||
fingerprint=fingerprint,
|
fingerprint=fingerprint,
|
||||||
result_hash=result_hash,
|
result_hash=result_hash,
|
||||||
request_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
|
request_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
|
||||||
@@ -70,6 +77,7 @@ def apply_dedupe(request: ClassifyRequest, result: ClassificationResult) -> Dedu
|
|||||||
status=status,
|
status=status,
|
||||||
seen_count=seen_count,
|
seen_count=seen_count,
|
||||||
matched_on=matched_on,
|
matched_on=matched_on,
|
||||||
subject_key=subject_key,
|
message_id=request.id or existing.get("outlook_id"),
|
||||||
|
conversation_id=request.conversationId or existing.get("conversation_id"),
|
||||||
fingerprint=fingerprint,
|
fingerprint=fingerprint,
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user