Compare commits
14 Commits
cb4eb43209
...
V1.0.0
| Author | SHA1 | Date | |
|---|---|---|---|
| c0970c066e | |||
|
|
97abc74297 | ||
| ab14d55824 | |||
|
|
3e9904576f | ||
| 9f259a299f | |||
|
|
8d1109c309 | ||
|
|
39c0d787fc | ||
|
|
bcf660f222 | ||
|
|
760b56bfd6 | ||
| 17191fc489 | |||
|
|
8c49ce21e0 | ||
|
|
c6ee735949 | ||
|
|
1b2c7db924 | ||
|
|
a1dcaf9a74 |
76
.github/workflows/build-publish.yaml
vendored
76
.github/workflows/build-publish.yaml
vendored
@@ -3,11 +3,38 @@ name: Build and Publish Docker Image
|
|||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- main # Trigger on pushes to main
|
- '**'
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened]
|
||||||
|
create:
|
||||||
|
refs/tags/v*
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build-and-push:
|
build-only:
|
||||||
runs-on: ubuntu-latest # Ensure your Gitea runner has this label
|
runs-on: ubuntu-latest
|
||||||
|
# All branches, all PRs, and anything that's not a push to main or a version tag
|
||||||
|
if: github.event_name != 'push' || (github.event_name == 'push' && !startsWith(gitea.ref, 'refs/tags/v') && gitea.ref != 'refs/heads/main')
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
|
- name: Build (no push)
|
||||||
|
uses: docker/build-push-action@v5
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: Dockerfile
|
||||||
|
push: false
|
||||||
|
tags: |
|
||||||
|
${{ secrets.DOCKER_REGISTRY }}/${{ secrets.DOCKER_USERNAME }}/email-classifier:build-test
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
|
|
||||||
|
build-and-push-main:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
if: github.event_name == 'push' && gitea.ref == 'refs/heads/main'
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -15,25 +42,56 @@ jobs:
|
|||||||
- name: Set up Docker Buildx
|
- name: Set up Docker Buildx
|
||||||
uses: docker/setup-buildx-action@v3
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
# Login to your registry (Docker Hub, Gitea Package Registry, or Harbor)
|
|
||||||
- name: Login to Docker Registry
|
- name: Login to Docker Registry
|
||||||
uses: docker/login-action@v3
|
uses: docker/login-action@v3
|
||||||
with:
|
with:
|
||||||
registry: ${{ secrets.DOCKER_REGISTRY }} # Remove if using Docker Hub
|
registry: ${{ secrets.DOCKER_REGISTRY }}
|
||||||
username: ${{ secrets.DOCKER_USERNAME }}
|
username: ${{ secrets.DOCKER_USERNAME }}
|
||||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||||
|
|
||||||
- name: Build and push
|
- name: Build and push (main branch)
|
||||||
uses: docker/build-push-action@v5
|
uses: docker/build-push-action@v5
|
||||||
with:
|
with:
|
||||||
context: .
|
context: .
|
||||||
file: Dockerfile
|
file: Dockerfile
|
||||||
push: true
|
push: true
|
||||||
# Tags the image as 'latest' and also uses the git SHA for versioning
|
|
||||||
tags: |
|
tags: |
|
||||||
${{ secrets.DOCKER_REGISTRY }}/${{ secrets.DOCKER_USERNAME }}/email-classifier:${{ gitea.sha }}
|
${{ secrets.DOCKER_REGISTRY }}/${{ secrets.DOCKER_USERNAME }}/email-classifier:main
|
||||||
${{ secrets.DOCKER_REGISTRY }}/${{ secrets.DOCKER_USERNAME }}/email-classifier:latest
|
${{ secrets.DOCKER_REGISTRY }}/${{ secrets.DOCKER_USERNAME }}/email-classifier:latest
|
||||||
# Caching speeds up builds by reusing layers (crucial for 'uv' installs)
|
${{ secrets.DOCKER_REGISTRY }}/${{ secrets.DOCKER_USERNAME }}/email-classifier:${{ gitea.sha }}
|
||||||
|
labels: |
|
||||||
|
org.opencontainers.image.source=${{ gitea.server_url }}/${{ gitea.repository }}
|
||||||
|
org.opencontainers.image.description=Email Classifier Service
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
|
|
||||||
|
build-and-push-tag:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
if: github.event_name == 'push' && startsWith(gitea.ref, 'refs/tags/v')
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
|
- name: Login to Docker Registry
|
||||||
|
uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
registry: ${{ secrets.DOCKER_REGISTRY }}
|
||||||
|
username: ${{ secrets.DOCKER_USERNAME }}
|
||||||
|
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||||
|
|
||||||
|
- name: Build and push (tagged release)
|
||||||
|
uses: docker/build-push-action@v5
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: Dockerfile
|
||||||
|
push: true
|
||||||
|
tags: |
|
||||||
|
${{ secrets.DOCKER_REGISTRY }}/${{ secrets.DOCKER_USERNAME }}/email-classifier:${{ gitea.ref_name }}
|
||||||
|
${{ secrets.DOCKER_REGISTRY }}/${{ secrets.DOCKER_USERNAME }}/email-classifier:latest
|
||||||
|
${{ secrets.DOCKER_REGISTRY }}/${{ secrets.DOCKER_USERNAME }}/email-classifier:${{ gitea.sha }}
|
||||||
labels: |
|
labels: |
|
||||||
org.opencontainers.image.source=${{ gitea.server_url }}/${{ gitea.repository }}
|
org.opencontainers.image.source=${{ gitea.server_url }}/${{ gitea.repository }}
|
||||||
org.opencontainers.image.description=Email Classifier Service
|
org.opencontainers.image.description=Email Classifier Service
|
||||||
|
|||||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -8,3 +8,6 @@ wheels/
|
|||||||
|
|
||||||
# Virtual environments
|
# Virtual environments
|
||||||
.venv
|
.venv
|
||||||
|
venv/
|
||||||
|
docs/.venv/
|
||||||
|
docs/venv/
|
||||||
|
|||||||
146
README.md
146
README.md
@@ -1,21 +1,10 @@
|
|||||||
# email-classifier
|
# email-classifier
|
||||||
|
|
||||||
FastAPI service that classifies email using a configurable LLM backend.
|
FastAPI service that classifies email using a configurable LLM backend, returns richer structured extraction, and tracks duplicate classifications using Outlook-aware dedupe.
|
||||||
|
|
||||||
## What changed
|
|
||||||
|
|
||||||
The classifier no longer hardcodes a single Ollama + OpenAI-compatible endpoint.
|
|
||||||
It now supports:
|
|
||||||
- OpenAI-compatible APIs
|
|
||||||
- Anthropic-compatible APIs
|
|
||||||
- per-request overrides for provider, model, endpoint, and temperature
|
|
||||||
- global defaults through environment variables
|
|
||||||
|
|
||||||
This makes it suitable for local Ollama, hosted OpenAI-compatible services, and MiniMax's recommended Anthropic-compatible API.
|
|
||||||
|
|
||||||
## Environment configuration
|
## Environment configuration
|
||||||
|
|
||||||
Defaults are loaded from environment variables:
|
LLM defaults:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export LLM_PROVIDER=openai
|
export LLM_PROVIDER=openai
|
||||||
@@ -27,9 +16,7 @@ export LLM_TIMEOUT_SECONDS=60
|
|||||||
export LLM_MAX_RETRIES=3
|
export LLM_MAX_RETRIES=3
|
||||||
```
|
```
|
||||||
|
|
||||||
### MiniMax example
|
MiniMax via Anthropic-compatible API:
|
||||||
|
|
||||||
MiniMax recommends Anthropic-compatible integration.
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export LLM_PROVIDER=anthropic
|
export LLM_PROVIDER=anthropic
|
||||||
@@ -38,51 +25,128 @@ export LLM_API_KEY=your_minimax_key
|
|||||||
export LLM_MODEL=MiniMax-M2.7
|
export LLM_MODEL=MiniMax-M2.7
|
||||||
```
|
```
|
||||||
|
|
||||||
## API
|
Optional local dedupe store path:
|
||||||
|
|
||||||
### POST /classify
|
```bash
|
||||||
|
export EMAIL_CLASSIFIER_DB_PATH=.data/email_classifier.db
|
||||||
|
```
|
||||||
|
|
||||||
Request body:
|
## Input shape
|
||||||
|
|
||||||
|
The request model accepts either:
|
||||||
|
- simplified input via `email_data`
|
||||||
|
- or native Outlook-style fields directly
|
||||||
|
|
||||||
|
Full Outlook-shaped example:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id": "AAMk...",
|
||||||
|
"internetMessageId": "<...@...>",
|
||||||
|
"conversationId": "AAQk...",
|
||||||
|
"subject": "MB Printer",
|
||||||
|
"bodyPreview": "Good morning, ...",
|
||||||
|
"body": {
|
||||||
|
"contentType": "html",
|
||||||
|
"content": "<html>...(full HTML body)</html>"
|
||||||
|
},
|
||||||
|
"sender": {
|
||||||
|
"emailAddress": {
|
||||||
|
"name": "Bobbi Johnson",
|
||||||
|
"address": "bobbi.johnson@grandportage.com"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"from": {
|
||||||
|
"emailAddress": {
|
||||||
|
"name": "Bobbi Johnson",
|
||||||
|
"address": "bobbi.johnson@grandportage.com"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"toRecipients": [
|
||||||
|
{
|
||||||
|
"emailAddress": {
|
||||||
|
"name": "IT Helpdesk Mail",
|
||||||
|
"address": "helpdeskmail@grandportage.com"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"ccRecipients": [],
|
||||||
|
"bccRecipients": [],
|
||||||
|
"replyTo": [],
|
||||||
|
"receivedDateTime": "2026-02-19T15:27:35Z",
|
||||||
|
"sentDateTime": "2026-02-19T15:27:32Z",
|
||||||
|
"hasAttachments": false,
|
||||||
|
"importance": "normal",
|
||||||
|
"isRead": false,
|
||||||
|
"flag": { "flagStatus": "notFlagged" },
|
||||||
|
"provider": "anthropic",
|
||||||
|
"base_url": "https://api.minimax.io/anthropic",
|
||||||
|
"model": "MiniMax-M2.7"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Simplified request example:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"email_data": {
|
"email_data": {
|
||||||
"subject": "Can you review this by Friday?",
|
"subject": "MB Printer",
|
||||||
"body": "Hi Daniel, please review the attached budget proposal."
|
"body": "<html>...</html>"
|
||||||
},
|
},
|
||||||
"provider": "anthropic",
|
"id": "AAMk...",
|
||||||
"base_url": "https://api.minimax.io/anthropic",
|
"conversationId": "AAQk..."
|
||||||
"model": "MiniMax-M2.7",
|
|
||||||
"temperature": 0.1
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
All override fields are optional. If omitted, the service uses the global env config.
|
## Response example
|
||||||
|
|
||||||
Response shape:
|
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"needs_action": true,
|
"needs_action": true,
|
||||||
"category": "question",
|
"category": "question",
|
||||||
"priority": "high",
|
"priority": "high",
|
||||||
"task_description": "Review the budget proposal and respond by Friday",
|
"task_description": "Investigate MB Printer issue and reply",
|
||||||
"reasoning": "Direct request with a deadline requires follow-up",
|
"reasoning": "The email appears to describe an issue requiring action.",
|
||||||
"confidence": 0.91
|
"confidence": 0.91,
|
||||||
|
"details": {
|
||||||
|
"summary": "Printer issue reported in the MB area.",
|
||||||
|
"suggested_title": "Handle MB Printer issue",
|
||||||
|
"suggested_notes": "Review the printer problem, identify urgency, and reply with next steps.",
|
||||||
|
"deadline": null,
|
||||||
|
"people": [],
|
||||||
|
"organizations": [],
|
||||||
|
"attachments_referenced": [],
|
||||||
|
"next_steps": ["Review issue", "Respond to sender"],
|
||||||
|
"key_points": ["Printer issue reported"],
|
||||||
|
"source_signals": ["request"],
|
||||||
|
"dedupe_key": "..."
|
||||||
|
},
|
||||||
|
"dedupe": {
|
||||||
|
"status": "new",
|
||||||
|
"seen_count": 1,
|
||||||
|
"matched_on": "none",
|
||||||
|
"message_id": "AAMk...",
|
||||||
|
"conversation_id": "AAQk...",
|
||||||
|
"fingerprint": "..."
|
||||||
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
## Architecture
|
## Dedupe precedence
|
||||||
|
|
||||||
- `app/config.py`: global and per-request LLM settings
|
1. `id` for exact Outlook message match
|
||||||
- `app/llm_adapters.py`: provider adapters
|
2. `conversationId` for thread grouping
|
||||||
- `app/classifier.py`: classification orchestration, retries, normalization
|
3. normalized subject + preview/body fingerprint fallback
|
||||||
- `app/prompts.py`: system prompt
|
|
||||||
- `app/routers/classify_email.py`: thin API route
|
Statuses:
|
||||||
|
- `new`: no prior similar email seen
|
||||||
|
- `duplicate`: same dedupe target and same extracted result as before
|
||||||
|
- `updated`: matched prior email, but extracted result changed
|
||||||
|
|
||||||
|
This is intentionally heuristic for the fallback path.
|
||||||
|
|
||||||
## Notes
|
## Notes
|
||||||
|
|
||||||
- OpenAI-compatible providers use the OpenAI SDK.
|
- No Todoist integration lives in this API.
|
||||||
- Anthropic-compatible providers use the Anthropic SDK.
|
- Dedupe is local and intended to help downstream workflows avoid obvious duplicates.
|
||||||
- Per-request `api_key` is supported, but excluded from response serialization.
|
- SQLite is used for lightweight local dedupe tracking.
|
||||||
- The service normalizes malformed model output and falls back safely after retry exhaustion.
|
|
||||||
|
|||||||
@@ -5,7 +5,8 @@ from typing import Any
|
|||||||
|
|
||||||
from app.config import get_request_settings
|
from app.config import get_request_settings
|
||||||
from app.llm_adapters import build_adapter, coerce_json_text
|
from app.llm_adapters import build_adapter, coerce_json_text
|
||||||
from app.models import ClassificationResult, ClassifyRequest, EmailData
|
from app.models import ClassificationDetails, ClassificationResult, ClassifyRequest, EmailData
|
||||||
|
from app.sync import apply_dedupe, build_fingerprint
|
||||||
|
|
||||||
VALID_CATEGORIES = {
|
VALID_CATEGORIES = {
|
||||||
"action_required",
|
"action_required",
|
||||||
@@ -21,7 +22,7 @@ VALID_PRIORITIES = {"high", "medium", "low"}
|
|||||||
|
|
||||||
|
|
||||||
async def classify_email(request: ClassifyRequest) -> ClassificationResult:
|
async def classify_email(request: ClassifyRequest) -> ClassificationResult:
|
||||||
clean_email = _clean_email(request.email_data)
|
clean_email = _clean_email(request)
|
||||||
settings = get_request_settings(
|
settings = get_request_settings(
|
||||||
provider=request.provider,
|
provider=request.provider,
|
||||||
model=request.model,
|
model=request.model,
|
||||||
@@ -32,40 +33,50 @@ async def classify_email(request: ClassifyRequest) -> ClassificationResult:
|
|||||||
adapter = build_adapter(settings)
|
adapter = build_adapter(settings)
|
||||||
|
|
||||||
attempts = 0
|
attempts = 0
|
||||||
|
result: ClassificationResult | None = None
|
||||||
while attempts < settings.max_retries:
|
while attempts < settings.max_retries:
|
||||||
raw_response = await adapter.classify(clean_email)
|
raw_response = await adapter.classify(clean_email.email_data)
|
||||||
try:
|
try:
|
||||||
payload = json.loads(coerce_json_text(raw_response))
|
payload = json.loads(coerce_json_text(raw_response))
|
||||||
result = _normalize_result(payload)
|
result = _normalize_result(payload, clean_email)
|
||||||
if result.needs_action and not result.task_description:
|
if result.needs_action and not result.task_description:
|
||||||
attempts += 1
|
attempts += 1
|
||||||
continue
|
continue
|
||||||
return result
|
break
|
||||||
except (json.JSONDecodeError, ValueError, TypeError):
|
except (json.JSONDecodeError, ValueError, TypeError):
|
||||||
attempts += 1
|
attempts += 1
|
||||||
|
|
||||||
return ClassificationResult(
|
if result is None:
|
||||||
needs_action=False,
|
result = ClassificationResult(
|
||||||
category="uncategorized",
|
needs_action=False,
|
||||||
priority="low",
|
category="uncategorized",
|
||||||
task_description=None,
|
priority="low",
|
||||||
reasoning="System failed to classify after multiple attempts.",
|
task_description=None,
|
||||||
confidence=0.0,
|
reasoning="System failed to classify after multiple attempts.",
|
||||||
)
|
confidence=0.0,
|
||||||
|
details=ClassificationDetails(dedupe_key=build_fingerprint(clean_email)),
|
||||||
|
)
|
||||||
|
|
||||||
|
result.dedupe = apply_dedupe(clean_email, result)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
def _clean_email(email: EmailData) -> EmailData:
|
def _clean_email(request: ClassifyRequest) -> ClassifyRequest:
|
||||||
from app.helpers.clean_email_html import clean_email_html
|
from app.helpers.clean_email_html import clean_email_html
|
||||||
from app.helpers.extract_latest_message import extract_latest_message
|
from app.helpers.extract_latest_message import extract_latest_message
|
||||||
from app.helpers.remove_disclaimer import remove_disclaimer
|
from app.helpers.remove_disclaimer import remove_disclaimer
|
||||||
|
|
||||||
return EmailData(
|
return request.model_copy(
|
||||||
subject=email.subject,
|
update={
|
||||||
body=remove_disclaimer(clean_email_html(extract_latest_message(email.body))),
|
"email_data": EmailData(
|
||||||
|
subject=request.email_data.subject,
|
||||||
|
body=remove_disclaimer(clean_email_html(extract_latest_message(request.email_data.body))),
|
||||||
|
)
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _normalize_result(data: dict[str, Any]) -> ClassificationResult:
|
def _normalize_result(data: dict[str, Any], request: ClassifyRequest) -> ClassificationResult:
|
||||||
needs_action = bool(data.get("needs_action", False))
|
needs_action = bool(data.get("needs_action", False))
|
||||||
category = str(data.get("category", "uncategorized") or "uncategorized").lower()
|
category = str(data.get("category", "uncategorized") or "uncategorized").lower()
|
||||||
if category not in VALID_CATEGORIES:
|
if category not in VALID_CATEGORIES:
|
||||||
@@ -81,6 +92,24 @@ def _normalize_result(data: dict[str, Any]) -> ClassificationResult:
|
|||||||
reasoning = str(data.get("reasoning", "") or "").strip() or "No reasoning provided."
|
reasoning = str(data.get("reasoning", "") or "").strip() or "No reasoning provided."
|
||||||
confidence_raw = data.get("confidence", 0.0)
|
confidence_raw = data.get("confidence", 0.0)
|
||||||
confidence = max(0.0, min(1.0, float(confidence_raw)))
|
confidence = max(0.0, min(1.0, float(confidence_raw)))
|
||||||
|
details_payload = data.get("details") or {}
|
||||||
|
details = ClassificationDetails(
|
||||||
|
summary=_clean_text(details_payload.get("summary")),
|
||||||
|
suggested_title=_clean_text(details_payload.get("suggested_title")),
|
||||||
|
suggested_notes=_clean_text(details_payload.get("suggested_notes")),
|
||||||
|
deadline=_clean_text(details_payload.get("deadline")),
|
||||||
|
people=_string_list(details_payload.get("people")),
|
||||||
|
organizations=_string_list(details_payload.get("organizations")),
|
||||||
|
attachments_referenced=_string_list(details_payload.get("attachments_referenced")),
|
||||||
|
next_steps=_string_list(details_payload.get("next_steps")),
|
||||||
|
key_points=_string_list(details_payload.get("key_points")),
|
||||||
|
source_signals=_string_list(details_payload.get("source_signals")),
|
||||||
|
dedupe_key=build_fingerprint(request),
|
||||||
|
)
|
||||||
|
if needs_action and not details.suggested_title:
|
||||||
|
details.suggested_title = task_description
|
||||||
|
if not details.summary:
|
||||||
|
details.summary = reasoning
|
||||||
return ClassificationResult(
|
return ClassificationResult(
|
||||||
needs_action=needs_action,
|
needs_action=needs_action,
|
||||||
category=category,
|
category=category,
|
||||||
@@ -88,4 +117,27 @@ def _normalize_result(data: dict[str, Any]) -> ClassificationResult:
|
|||||||
task_description=task_description,
|
task_description=task_description,
|
||||||
reasoning=reasoning,
|
reasoning=reasoning,
|
||||||
confidence=confidence,
|
confidence=confidence,
|
||||||
|
details=details,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_text(value: Any) -> str | None:
|
||||||
|
if value is None:
|
||||||
|
return None
|
||||||
|
text = str(value).strip()
|
||||||
|
return text or None
|
||||||
|
|
||||||
|
|
||||||
|
def _string_list(value: Any) -> list[str]:
|
||||||
|
if not value:
|
||||||
|
return []
|
||||||
|
if isinstance(value, list):
|
||||||
|
items = value
|
||||||
|
else:
|
||||||
|
items = [value]
|
||||||
|
output = []
|
||||||
|
for item in items:
|
||||||
|
text = str(item).strip()
|
||||||
|
if text and text not in output:
|
||||||
|
output.append(text)
|
||||||
|
return output
|
||||||
|
|||||||
@@ -2,26 +2,66 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from typing import Literal
|
from pathlib import Path
|
||||||
|
from typing import Any, Literal
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
import yaml
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
Provider = Literal["openai", "anthropic"]
|
Provider = Literal["openai", "anthropic"]
|
||||||
|
DEFAULT_CONFIG_PATHS = ["config.yml", "config.yaml", "/config/config.yml", "/config/config.yaml"]
|
||||||
|
|
||||||
|
|
||||||
class LLMSettings(BaseModel):
|
class LLMSettings(BaseModel):
|
||||||
provider: Provider = Field(default=os.getenv("LLM_PROVIDER", "openai"))
|
provider: Provider = "openai"
|
||||||
api_key: str = Field(default=os.getenv("LLM_API_KEY", "none"))
|
api_key: str = "none"
|
||||||
model: str = Field(default=os.getenv("LLM_MODEL", "qwen2.5-7b-instruct.q4_k_m"))
|
model: str = "qwen2.5-7b-instruct.q4_k_m"
|
||||||
base_url: str = Field(default=os.getenv("LLM_BASE_URL", "http://ollama.internal.henryhosted.com:9292/v1"))
|
base_url: str = "http://ollama.internal.henryhosted.com:9292/v1"
|
||||||
temperature: float = Field(default=float(os.getenv("LLM_TEMPERATURE", "0.1")))
|
temperature: float = 0.1
|
||||||
timeout_seconds: float = Field(default=float(os.getenv("LLM_TIMEOUT_SECONDS", "60")))
|
timeout_seconds: float = 60
|
||||||
max_retries: int = Field(default=int(os.getenv("LLM_MAX_RETRIES", "3")))
|
max_retries: int = 3
|
||||||
|
|
||||||
|
|
||||||
|
def _load_yaml_config() -> dict[str, Any]:
|
||||||
|
explicit = os.getenv("EMAIL_CLASSIFIER_CONFIG") or os.getenv("APP_CONFIG_FILE")
|
||||||
|
candidates = [explicit] if explicit else DEFAULT_CONFIG_PATHS
|
||||||
|
for candidate in candidates:
|
||||||
|
if not candidate:
|
||||||
|
continue
|
||||||
|
path = Path(candidate)
|
||||||
|
if not path.exists() or not path.is_file():
|
||||||
|
continue
|
||||||
|
data = yaml.safe_load(path.read_text()) or {}
|
||||||
|
if not isinstance(data, dict):
|
||||||
|
raise ValueError(f"Config file must contain a mapping/object: {path}")
|
||||||
|
llm = data.get("llm", data)
|
||||||
|
if not isinstance(llm, dict):
|
||||||
|
raise ValueError(f"LLM config must be a mapping/object: {path}")
|
||||||
|
return llm
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _env_or_yaml(env_name: str, yaml_data: dict[str, Any], yaml_key: str, default: Any) -> Any:
|
||||||
|
value = os.getenv(env_name)
|
||||||
|
if value is not None:
|
||||||
|
return value
|
||||||
|
if yaml_key in yaml_data and yaml_data[yaml_key] is not None:
|
||||||
|
return yaml_data[yaml_key]
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=1)
|
@lru_cache(maxsize=1)
|
||||||
def get_settings() -> LLMSettings:
|
def get_settings() -> LLMSettings:
|
||||||
return LLMSettings()
|
yaml_data = _load_yaml_config()
|
||||||
|
return LLMSettings(
|
||||||
|
provider=_env_or_yaml("LLM_PROVIDER", yaml_data, "provider", "openai"),
|
||||||
|
api_key=_env_or_yaml("LLM_API_KEY", yaml_data, "api_key", "none"),
|
||||||
|
model=_env_or_yaml("LLM_MODEL", yaml_data, "model", "qwen2.5-7b-instruct.q4_k_m"),
|
||||||
|
base_url=_env_or_yaml("LLM_BASE_URL", yaml_data, "base_url", "http://ollama.internal.henryhosted.com:9292/v1"),
|
||||||
|
temperature=float(_env_or_yaml("LLM_TEMPERATURE", yaml_data, "temperature", 0.1)),
|
||||||
|
timeout_seconds=float(_env_or_yaml("LLM_TIMEOUT_SECONDS", yaml_data, "timeout_seconds", 60)),
|
||||||
|
max_retries=int(_env_or_yaml("LLM_MAX_RETRIES", yaml_data, "max_retries", 3)),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_request_settings(
|
def get_request_settings(
|
||||||
|
|||||||
119
app/dedupe_store.py
Normal file
119
app/dedupe_store.py
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
class DedupeStore:
|
||||||
|
def __init__(self, db_path: str = ".data/email_classifier.db"):
|
||||||
|
self.db_path = Path(db_path)
|
||||||
|
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
self._init_db()
|
||||||
|
|
||||||
|
def _connect(self) -> sqlite3.Connection:
|
||||||
|
conn = sqlite3.connect(self.db_path)
|
||||||
|
conn.row_factory = sqlite3.Row
|
||||||
|
return conn
|
||||||
|
|
||||||
|
def _init_db(self) -> None:
|
||||||
|
with self._connect() as conn:
|
||||||
|
conn.execute(
|
||||||
|
"""
|
||||||
|
CREATE TABLE IF NOT EXISTS classification_dedupe (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
outlook_id TEXT,
|
||||||
|
conversation_id TEXT,
|
||||||
|
fingerprint TEXT NOT NULL,
|
||||||
|
result_hash TEXT NOT NULL,
|
||||||
|
request_payload TEXT NOT NULL,
|
||||||
|
result_payload TEXT NOT NULL,
|
||||||
|
seen_count INTEGER NOT NULL DEFAULT 1,
|
||||||
|
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_outlook_id ON classification_dedupe(outlook_id)")
|
||||||
|
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_conversation_id ON classification_dedupe(conversation_id)")
|
||||||
|
conn.execute("CREATE INDEX IF NOT EXISTS idx_dedupe_fingerprint ON classification_dedupe(fingerprint)")
|
||||||
|
|
||||||
|
def find_existing(self, *, outlook_id: str | None, conversation_id: str | None, fingerprint: str) -> tuple[dict[str, Any] | None, str]:
|
||||||
|
with self._connect() as conn:
|
||||||
|
if outlook_id:
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT * FROM classification_dedupe WHERE outlook_id = ? ORDER BY id DESC LIMIT 1",
|
||||||
|
(outlook_id,),
|
||||||
|
).fetchone()
|
||||||
|
if row:
|
||||||
|
return self._decode(row), "id"
|
||||||
|
if conversation_id:
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT * FROM classification_dedupe WHERE conversation_id = ? ORDER BY id DESC LIMIT 1",
|
||||||
|
(conversation_id,),
|
||||||
|
).fetchone()
|
||||||
|
if row:
|
||||||
|
return self._decode(row), "conversation"
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT * FROM classification_dedupe WHERE fingerprint = ? ORDER BY id DESC LIMIT 1",
|
||||||
|
(fingerprint,),
|
||||||
|
).fetchone()
|
||||||
|
if row:
|
||||||
|
return self._decode(row), "fingerprint"
|
||||||
|
return None, "none"
|
||||||
|
|
||||||
|
def _decode(self, row: sqlite3.Row) -> dict[str, Any]:
|
||||||
|
data = dict(row)
|
||||||
|
data["request_payload"] = json.loads(data["request_payload"])
|
||||||
|
data["result_payload"] = json.loads(data["result_payload"])
|
||||||
|
return data
|
||||||
|
|
||||||
|
def insert_or_update(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
existing_id: int | None,
|
||||||
|
outlook_id: str | None,
|
||||||
|
conversation_id: str | None,
|
||||||
|
fingerprint: str,
|
||||||
|
result_hash: str,
|
||||||
|
request_payload: dict[str, Any],
|
||||||
|
result_payload: dict[str, Any],
|
||||||
|
seen_count: int,
|
||||||
|
) -> None:
|
||||||
|
with self._connect() as conn:
|
||||||
|
if existing_id is None:
|
||||||
|
conn.execute(
|
||||||
|
"""
|
||||||
|
INSERT INTO classification_dedupe (outlook_id, conversation_id, fingerprint, result_hash, request_payload, result_payload, seen_count)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||||
|
""",
|
||||||
|
(
|
||||||
|
outlook_id,
|
||||||
|
conversation_id,
|
||||||
|
fingerprint,
|
||||||
|
result_hash,
|
||||||
|
json.dumps(request_payload, sort_keys=True),
|
||||||
|
json.dumps(result_payload, sort_keys=True),
|
||||||
|
seen_count,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
conn.execute(
|
||||||
|
"""
|
||||||
|
UPDATE classification_dedupe
|
||||||
|
SET outlook_id = ?, conversation_id = ?, fingerprint = ?, result_hash = ?, request_payload = ?, result_payload = ?,
|
||||||
|
seen_count = ?, updated_at = CURRENT_TIMESTAMP
|
||||||
|
WHERE id = ?
|
||||||
|
""",
|
||||||
|
(
|
||||||
|
outlook_id,
|
||||||
|
conversation_id,
|
||||||
|
fingerprint,
|
||||||
|
result_hash,
|
||||||
|
json.dumps(request_payload, sort_keys=True),
|
||||||
|
json.dumps(result_payload, sort_keys=True),
|
||||||
|
seen_count,
|
||||||
|
existing_id,
|
||||||
|
),
|
||||||
|
)
|
||||||
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field, model_validator
|
||||||
|
|
||||||
|
|
||||||
class EmailData(BaseModel):
|
class EmailData(BaseModel):
|
||||||
@@ -10,14 +10,91 @@ class EmailData(BaseModel):
|
|||||||
body: str
|
body: str
|
||||||
|
|
||||||
|
|
||||||
|
class EmailAddress(BaseModel):
|
||||||
|
name: str | None = None
|
||||||
|
address: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class Recipient(BaseModel):
|
||||||
|
emailAddress: EmailAddress | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class EmailBody(BaseModel):
|
||||||
|
contentType: str | None = None
|
||||||
|
content: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class Flag(BaseModel):
|
||||||
|
flagStatus: str | None = None
|
||||||
|
|
||||||
|
|
||||||
class ClassifyRequest(BaseModel):
|
class ClassifyRequest(BaseModel):
|
||||||
email_data: EmailData
|
email_data: EmailData | None = None
|
||||||
provider: Literal["openai", "anthropic"] | None = None
|
provider: Literal["openai", "anthropic"] | None = None
|
||||||
model: str | None = None
|
model: str | None = None
|
||||||
base_url: str | None = None
|
base_url: str | None = None
|
||||||
api_key: str | None = Field(default=None, exclude=True)
|
api_key: str | None = Field(default=None, exclude=True)
|
||||||
temperature: float | None = None
|
temperature: float | None = None
|
||||||
|
|
||||||
|
id: str | None = None
|
||||||
|
internetMessageId: str | None = None
|
||||||
|
conversationId: str | None = None
|
||||||
|
subject: str | None = None
|
||||||
|
bodyPreview: str | None = None
|
||||||
|
body: EmailBody | None = None
|
||||||
|
sender: Recipient | None = None
|
||||||
|
from_: Recipient | None = Field(default=None, alias="from")
|
||||||
|
toRecipients: list[Recipient] = Field(default_factory=list)
|
||||||
|
ccRecipients: list[Recipient] = Field(default_factory=list)
|
||||||
|
bccRecipients: list[Recipient] = Field(default_factory=list)
|
||||||
|
replyTo: list[Recipient] = Field(default_factory=list)
|
||||||
|
receivedDateTime: str | None = None
|
||||||
|
sentDateTime: str | None = None
|
||||||
|
hasAttachments: bool | None = None
|
||||||
|
importance: str | None = None
|
||||||
|
isRead: bool | None = None
|
||||||
|
flag: Flag | None = None
|
||||||
|
from_address: str | None = None
|
||||||
|
|
||||||
|
@model_validator(mode="after")
|
||||||
|
def populate_email_data(self) -> "ClassifyRequest":
|
||||||
|
subject = self.email_data.subject if self.email_data else self.subject
|
||||||
|
body = self.email_data.body if self.email_data else (self.body.content if self.body and self.body.content else None)
|
||||||
|
if not subject or not body:
|
||||||
|
raise ValueError("Request must include either email_data or Outlook subject/body.content fields")
|
||||||
|
self.email_data = EmailData(subject=subject, body=body)
|
||||||
|
if not self.from_address:
|
||||||
|
self.from_address = (
|
||||||
|
(self.from_.emailAddress.address if self.from_ and self.from_.emailAddress else None)
|
||||||
|
or (self.sender.emailAddress.address if self.sender and self.sender.emailAddress else None)
|
||||||
|
)
|
||||||
|
return self
|
||||||
|
|
||||||
|
model_config = {"populate_by_name": True}
|
||||||
|
|
||||||
|
|
||||||
|
class ClassificationDetails(BaseModel):
|
||||||
|
summary: str | None = None
|
||||||
|
suggested_title: str | None = None
|
||||||
|
suggested_notes: str | None = None
|
||||||
|
deadline: str | None = None
|
||||||
|
people: list[str] = Field(default_factory=list)
|
||||||
|
organizations: list[str] = Field(default_factory=list)
|
||||||
|
attachments_referenced: list[str] = Field(default_factory=list)
|
||||||
|
next_steps: list[str] = Field(default_factory=list)
|
||||||
|
key_points: list[str] = Field(default_factory=list)
|
||||||
|
source_signals: list[str] = Field(default_factory=list)
|
||||||
|
dedupe_key: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class DedupeResult(BaseModel):
|
||||||
|
status: Literal["new", "duplicate", "updated"]
|
||||||
|
seen_count: int = 1
|
||||||
|
matched_on: Literal["none", "id", "conversation", "fingerprint"] = "none"
|
||||||
|
message_id: str | None = None
|
||||||
|
conversation_id: str | None = None
|
||||||
|
fingerprint: str
|
||||||
|
|
||||||
|
|
||||||
class ClassificationResult(BaseModel):
|
class ClassificationResult(BaseModel):
|
||||||
needs_action: bool
|
needs_action: bool
|
||||||
@@ -26,3 +103,5 @@ class ClassificationResult(BaseModel):
|
|||||||
task_description: str | None = None
|
task_description: str | None = None
|
||||||
reasoning: str
|
reasoning: str
|
||||||
confidence: float
|
confidence: float
|
||||||
|
details: ClassificationDetails | None = None
|
||||||
|
dedupe: DedupeResult | None = None
|
||||||
|
|||||||
@@ -1,58 +1,38 @@
|
|||||||
SYSTEM_PROMPT = """You are an email classification assistant. Your job is to analyze emails and determine if they need the user's attention and action. The user works in the I.T. department of the Grand Portage tribal government.
|
SYSTEM_PROMPT = """You are an email classification assistant. Your job is to analyze emails and determine if they need the user's attention and action. The user works in the I.T. department of the Grand Portage tribal government.
|
||||||
|
|
||||||
|
Return valid JSON only.
|
||||||
|
|
||||||
CLASSIFICATION RULES:
|
CLASSIFICATION RULES:
|
||||||
|
1. NEEDS ATTENTION if the email asks a direct question, requests action, contains a deadline, reports a relevant problem, proposes times needing confirmation, or is a relevant I.T. alert.
|
||||||
|
2. DOES NOT NEED ATTENTION if the email is marketing, newsletter, sales outreach, bulk/promotional, or simple acknowledgment with no response needed.
|
||||||
|
3. Scheduling questions and unresolved thread questions always need attention.
|
||||||
|
|
||||||
1. NEEDS ATTENTION (create todo) if the email:
|
OUTPUT JSON SCHEMA:
|
||||||
- Asks a direct question that requires a response
|
|
||||||
- Contains scheduling questions like \"Does [day/time] work?\", \"Are you available?\", \"When can we meet?\"
|
|
||||||
- Requests the user to do something (review, approve, provide info, attend meeting)
|
|
||||||
- Contains a deadline or time-sensitive request
|
|
||||||
- Is from a colleague/client discussing active work
|
|
||||||
- Reports an issue or problem that needs addressing
|
|
||||||
- Proposes specific dates/times and needs confirmation
|
|
||||||
- Is an automated alert from a system relevant to I.T.
|
|
||||||
|
|
||||||
2. DOES NOT NEED ATTENTION (skip) if the email:
|
|
||||||
- Is a newsletter, marketing email, or webinar invitation
|
|
||||||
- Is from a person and is an FYI/informational with no action required
|
|
||||||
- Is promotional content or sales outreach
|
|
||||||
- Contains unsubscribe links or bulk sender indicators
|
|
||||||
- Is a simple acknowledgment (\"got it\", \"thanks\", \"sounds good\") with no questions
|
|
||||||
|
|
||||||
3. SPECIAL CASES:
|
|
||||||
- Even if an email says \"working on that\" or similar, if it ALSO contains a question or proposal that needs response, mark as needs_action=true
|
|
||||||
- \"Does [X] work?\" or \"When can you...?\" ALWAYS needs a response, regardless of other content
|
|
||||||
- RE: threads can still need action if they contain unanswered questions
|
|
||||||
|
|
||||||
OUTPUT FORMAT:
|
|
||||||
You must respond with valid JSON only, no other text:
|
|
||||||
{
|
{
|
||||||
\"needs_action\": true or false,
|
"needs_action": true or false,
|
||||||
\"category\": \"action_required\" | \"question\" | \"fyi\" | \"newsletter\" | \"promotional\" | \"automated\" | \"alert\" | \"uncategorized\",
|
"category": "action_required" | "question" | "fyi" | "newsletter" | "promotional" | "automated" | "alert" | "uncategorized",
|
||||||
\"priority\": \"high\" | \"medium\" | \"low\",
|
"priority": "high" | "medium" | "low",
|
||||||
\"task_description\": \"Brief description of what to do (only if needs_action is true)\",
|
"task_description": "short action-oriented description or null",
|
||||||
\"reasoning\": \"One sentence explaining your decision\",
|
"reasoning": "one sentence",
|
||||||
\"confidence\": A number from 0 to 1 indicating how confident you are
|
"confidence": 0.0 to 1.0,
|
||||||
|
"details": {
|
||||||
|
"summary": "brief human-readable summary",
|
||||||
|
"suggested_title": "good Todoist/task title",
|
||||||
|
"suggested_notes": "useful multiline notes for a human reviewing or creating a ticket",
|
||||||
|
"deadline": "deadline/date/time if present, else null",
|
||||||
|
"people": ["people involved or referenced"],
|
||||||
|
"organizations": ["organizations, departments, vendors, teams"],
|
||||||
|
"attachments_referenced": ["attachment names or referenced docs if mentioned"],
|
||||||
|
"next_steps": ["specific next actions"],
|
||||||
|
"key_points": ["important context bullets"],
|
||||||
|
"source_signals": ["question", "deadline", "request", "alert", "followup", "attachment", "scheduling"]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
EXAMPLES:
|
Rules for details:
|
||||||
|
- Be concrete and extract as much useful context as possible.
|
||||||
Email: \"Subject: Q4 Budget Review\nHi Daniel, can you review the attached budget proposal and let me know your thoughts by Friday?\"
|
- suggested_notes should help a human create a ticket later.
|
||||||
Output: {\"needs_action\": true, \"category\": \"question\", \"priority\": \"high\", \"task_description\": \"Review Q4 budget proposal and respond by Friday\", \"reasoning\": \"Direct request with deadline\", \"confidence\": 0.91}
|
- If a field is unknown, use null or empty list.
|
||||||
|
- Do not invent attachment names, people, or deadlines.
|
||||||
Email: \"Subject: RE: Meeting\nWorking on that. Does Tuesday or Wednesday work for you?\"
|
- If needs_action is true, task_description and suggested_title should be useful and specific.
|
||||||
Output: {\"needs_action\": true, \"category\": \"question\", \"priority\": \"medium\", \"task_description\": \"Respond with availability for Tuesday or Wednesday\", \"reasoning\": \"Scheduling question requires response\", \"confidence\": 0.85}
|
"""
|
||||||
|
|
||||||
Email: \"Subject: RE: Issue\nThanks, I'll look into it and get back to you.\"
|
|
||||||
Output: {\"needs_action\": false, \"category\": \"fyi\", \"priority\": \"low\", \"task_description\": null, \"reasoning\": \"Status update with no questions or action needed\", \"confidence\": 0.77}
|
|
||||||
|
|
||||||
Email: \"Subject: Join us for our exclusive webinar on cloud security\nRegister now for our upcoming webinar series...\"
|
|
||||||
Output: {\"needs_action\": false, \"category\": \"promotional\", \"priority\": \"low\", \"task_description\": null, \"reasoning\": \"Marketing webinar invitation\", \"confidence\": 0.81}
|
|
||||||
|
|
||||||
Email: \"Subject: Your order has shipped\nYour order #12345 has been dispatched and will arrive in 3-5 days.\"
|
|
||||||
Output: {\"needs_action\": false, \"category\": \"automated\", \"priority\": \"low\", \"task_description\": null, \"reasoning\": \"Automated shipping notification\", \"confidence\": 0.72}
|
|
||||||
|
|
||||||
Email: \"Subject: Disk at 95 percent on hvs-internal-01\nThe hard disk on server hvs-internal-01 is at a critical level.\"
|
|
||||||
Output: {\"needs_action\": true, \"category\": \"alert\", \"priority\": \"medium\", \"task_description\": \"Investigate critical disk usage alert on hvs-internal-01\", \"reasoning\": \"Internal I.T. system alert requires follow-up\", \"confidence\": 0.91}
|
|
||||||
|
|
||||||
Now classify the following email:"""
|
|
||||||
|
|||||||
83
app/sync.py
Normal file
83
app/sync.py
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
from app.dedupe_store import DedupeStore
|
||||||
|
from app.models import ClassificationResult, ClassifyRequest, DedupeResult
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_subject(subject: str) -> str:
|
||||||
|
value = subject.strip().lower()
|
||||||
|
value = re.sub(r"^(re|fw|fwd)\s*:\s*", "", value)
|
||||||
|
value = re.sub(r"\s+", " ", value)
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
def build_fingerprint(request: ClassifyRequest) -> str:
|
||||||
|
subject = normalize_subject(request.email_data.subject)
|
||||||
|
body = " ".join(request.email_data.body.split()).strip().lower()
|
||||||
|
preview = " ".join((request.bodyPreview or "").split()).strip().lower()
|
||||||
|
sender = (request.from_address or "").strip().lower()
|
||||||
|
seed = f"{sender}\n{subject}\n{preview}\n{body[:2000]}"
|
||||||
|
return hashlib.sha256(seed.encode()).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def build_result_hash(result: ClassificationResult) -> str:
|
||||||
|
payload = result.model_dump(exclude={"dedupe"}, exclude_none=True)
|
||||||
|
return hashlib.sha256(json.dumps(payload, sort_keys=True).encode()).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def apply_dedupe(request: ClassifyRequest, result: ClassificationResult) -> DedupeResult:
|
||||||
|
store = DedupeStore(os.getenv("EMAIL_CLASSIFIER_DB_PATH", ".data/email_classifier.db"))
|
||||||
|
fingerprint = build_fingerprint(request)
|
||||||
|
existing, matched_on = store.find_existing(
|
||||||
|
outlook_id=request.id,
|
||||||
|
conversation_id=request.conversationId,
|
||||||
|
fingerprint=fingerprint,
|
||||||
|
)
|
||||||
|
result_hash = build_result_hash(result)
|
||||||
|
|
||||||
|
if not existing:
|
||||||
|
store.insert_or_update(
|
||||||
|
existing_id=None,
|
||||||
|
outlook_id=request.id,
|
||||||
|
conversation_id=request.conversationId,
|
||||||
|
fingerprint=fingerprint,
|
||||||
|
result_hash=result_hash,
|
||||||
|
request_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
|
||||||
|
result_payload=result.model_dump(exclude={"dedupe"}, exclude_none=True),
|
||||||
|
seen_count=1,
|
||||||
|
)
|
||||||
|
return DedupeResult(
|
||||||
|
status="new",
|
||||||
|
seen_count=1,
|
||||||
|
matched_on="none",
|
||||||
|
message_id=request.id,
|
||||||
|
conversation_id=request.conversationId,
|
||||||
|
fingerprint=fingerprint,
|
||||||
|
)
|
||||||
|
|
||||||
|
previous_hash = existing.get("result_hash")
|
||||||
|
seen_count = int(existing.get("seen_count", 1)) + 1
|
||||||
|
status = "duplicate" if previous_hash == result_hash else "updated"
|
||||||
|
store.insert_or_update(
|
||||||
|
existing_id=existing["id"],
|
||||||
|
outlook_id=request.id or existing.get("outlook_id"),
|
||||||
|
conversation_id=request.conversationId or existing.get("conversation_id"),
|
||||||
|
fingerprint=fingerprint,
|
||||||
|
result_hash=result_hash,
|
||||||
|
request_payload=request.model_dump(exclude={"api_key"}, exclude_none=True),
|
||||||
|
result_payload=result.model_dump(exclude={"dedupe"}, exclude_none=True),
|
||||||
|
seen_count=seen_count,
|
||||||
|
)
|
||||||
|
return DedupeResult(
|
||||||
|
status=status,
|
||||||
|
seen_count=seen_count,
|
||||||
|
matched_on=matched_on,
|
||||||
|
message_id=request.id or existing.get("outlook_id"),
|
||||||
|
conversation_id=request.conversationId or existing.get("conversation_id"),
|
||||||
|
fingerprint=fingerprint,
|
||||||
|
)
|
||||||
172
docs/api.md
Normal file
172
docs/api.md
Normal file
@@ -0,0 +1,172 @@
|
|||||||
|
# API Reference
|
||||||
|
|
||||||
|
## `POST /classify`
|
||||||
|
|
||||||
|
Classifies a single email and returns structured extraction results.
|
||||||
|
|
||||||
|
**Endpoint:** `POST /classify`
|
||||||
|
|
||||||
|
**Content-Type:** `application/json`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Request
|
||||||
|
|
||||||
|
The endpoint accepts **two input shapes**: a full Outlook-shaped payload (native Microsoft Graph API format) or a simplified `email_data` object.
|
||||||
|
|
||||||
|
### Simplified Shape
|
||||||
|
|
||||||
|
Use this for lightweight clients or testing:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"email_data": {
|
||||||
|
"subject": "Printer issue in MB",
|
||||||
|
"body": "<html>...</html>"
|
||||||
|
},
|
||||||
|
"id": "AAMk...",
|
||||||
|
"conversationId": "AAQk..."
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Full Outlook Shape
|
||||||
|
|
||||||
|
Pass through an email directly from Microsoft Graph API:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id": "AAMk...",
|
||||||
|
"internetMessageId": "<abc123@mail.example.com>",
|
||||||
|
"conversationId": "AAQk...",
|
||||||
|
"subject": "MB Printer",
|
||||||
|
"bodyPreview": "Good morning, ...",
|
||||||
|
"body": {
|
||||||
|
"contentType": "html",
|
||||||
|
"content": "<html>...(full HTML body)</html>"
|
||||||
|
},
|
||||||
|
"sender": {
|
||||||
|
"emailAddress": {
|
||||||
|
"name": "Bobbi Johnson",
|
||||||
|
"address": "bobbi.johnson@grandportage.com"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"from": {
|
||||||
|
"emailAddress": {
|
||||||
|
"name": "Bobbi Johnson",
|
||||||
|
"address": "bobbi.johnson@grandportage.com"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"toRecipients": [
|
||||||
|
{
|
||||||
|
"emailAddress": {
|
||||||
|
"name": "IT Helpdesk Mail",
|
||||||
|
"address": "helpdeskmail@grandportage.com"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"ccRecipients": [],
|
||||||
|
"bccRecipients": [],
|
||||||
|
"replyTo": [],
|
||||||
|
"receivedDateTime": "2026-02-19T15:27:35Z",
|
||||||
|
"sentDateTime": "2026-02-19T15:27:32Z",
|
||||||
|
"hasAttachments": false,
|
||||||
|
"importance": "normal",
|
||||||
|
"isRead": false,
|
||||||
|
"flag": { "flagStatus": "notFlagged" }
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Per-Request LLM Overrides
|
||||||
|
|
||||||
|
You can override the global LLM settings for individual requests:
|
||||||
|
|
||||||
|
| Field | Type | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| `provider` | `openai` | `anthropic` | Override the global LLM provider |
|
||||||
|
| `model` | `string` | Override the model name |
|
||||||
|
| `base_url` | `string` | Override the API base URL |
|
||||||
|
| `api_key` | `string` | Override the API key (excluded from logs) |
|
||||||
|
| `temperature` | `float` | Override the temperature (0.0–1.0) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"needs_action": true,
|
||||||
|
"category": "action_required",
|
||||||
|
"priority": "high",
|
||||||
|
"task_description": "Investigate MB Printer issue and reply",
|
||||||
|
"reasoning": "The email describes an active problem requiring I.T. attention.",
|
||||||
|
"confidence": 0.91,
|
||||||
|
"details": {
|
||||||
|
"summary": "Printer issue reported in the MB area requiring investigation.",
|
||||||
|
"suggested_title": "Handle MB Printer issue",
|
||||||
|
"suggested_notes": "Review the printer problem, identify urgency, and reply with next steps.",
|
||||||
|
"deadline": null,
|
||||||
|
"people": ["Bobbi Johnson"],
|
||||||
|
"organizations": ["Grand Portage"],
|
||||||
|
"attachments_referenced": [],
|
||||||
|
"next_steps": ["Review printer status", "Reply to Bobbi Johnson"],
|
||||||
|
"key_points": ["Printer issue in MB", "Needs on-site investigation"],
|
||||||
|
"source_signals": ["request", "problem_report"]
|
||||||
|
},
|
||||||
|
"dedupe": {
|
||||||
|
"status": "new",
|
||||||
|
"seen_count": 1,
|
||||||
|
"matched_on": "none",
|
||||||
|
"message_id": "AAMk...",
|
||||||
|
"conversation_id": "AAQk...",
|
||||||
|
"fingerprint": "a3f8b..."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Response Fields
|
||||||
|
|
||||||
|
| Field | Type | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| `needs_action` | `bool` | Whether the email requires user action |
|
||||||
|
| `category` | `string` | One of the 8 classification categories |
|
||||||
|
| `priority` | `string` | `high`, `medium`, or `low` |
|
||||||
|
| `task_description` | `string|null` | Short action-oriented description |
|
||||||
|
| `reasoning` | `string` | One-sentence explanation of the classification |
|
||||||
|
| `confidence` | `float` | Model confidence score (0.0–1.0) |
|
||||||
|
| `details` | `object` | Structured extraction (see below) |
|
||||||
|
| `dedupe` | `object` | Deduplication result (see below) |
|
||||||
|
|
||||||
|
### `details` Object
|
||||||
|
|
||||||
|
| Field | Type | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| `summary` | `string|null` | Brief human-readable summary |
|
||||||
|
| `suggested_title` | `string|null` | Good task/Todoist title |
|
||||||
|
| `suggested_notes` | `string|null` | Multiline notes for a human reviewer |
|
||||||
|
| `deadline` | `string|null` | Any date/time deadline mentioned |
|
||||||
|
| `people` | `string[]` | People involved or referenced |
|
||||||
|
| `organizations` | `string[]` | Organizations, departments, vendors, teams |
|
||||||
|
| `attachments_referenced` | `string[]` | Attachment names mentioned in the email |
|
||||||
|
| `next_steps` | `string[]` | Specific recommended next actions |
|
||||||
|
| `key_points` | `string[]` | Important context bullets |
|
||||||
|
| `source_signals` | `string[]` | Signals that triggered the classification |
|
||||||
|
| `dedupe_key` | `string|null` | Content fingerprint (SHA-256) |
|
||||||
|
|
||||||
|
### `dedupe` Object
|
||||||
|
|
||||||
|
| Field | Type | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| `status` | `new | duplicate | updated` | Whether this is new, a duplicate, or updated |
|
||||||
|
| `seen_count` | `int` | Number of times this email thread has been seen |
|
||||||
|
| `matched_on` | `none | id | conversation | fingerprint` | Which dedupe mechanism matched |
|
||||||
|
| `message_id` | `string|null` | Outlook `id` field if available |
|
||||||
|
| `conversation_id` | `string|null` | Outlook `conversationId` if available |
|
||||||
|
| `fingerprint` | `string` | SHA-256 content fingerprint |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Error Responses
|
||||||
|
|
||||||
|
If the request is missing both `email_data` and Outlook body fields, the API returns a `422 Unprocessable Entity` with a validation error.
|
||||||
|
|
||||||
|
If classification fails after all retries, the service returns a `200` with an `uncategorized` result and `confidence: 0.0`.
|
||||||
108
docs/configuration.md
Normal file
108
docs/configuration.md
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
# Configuration
|
||||||
|
|
||||||
|
All configuration is driven by environment variables. There are no config files.
|
||||||
|
|
||||||
|
## LLM Provider Settings
|
||||||
|
|
||||||
|
### `LLM_PROVIDER`
|
||||||
|
|
||||||
|
- **Values:** `openai` | `anthropic`
|
||||||
|
- **Default:** `openai`
|
||||||
|
- Determines which adapter to use for API calls. Use `openai` for Ollama, LM Studio, and any OpenAI-compatible API. Use `anthropic` for MiniMax or any Anthropic-compatible API.
|
||||||
|
|
||||||
|
### `LLM_BASE_URL`
|
||||||
|
|
||||||
|
- **Default:** `http://ollama.internal.henryhosted.com:9292/v1`
|
||||||
|
- The base URL for the LLM API. Must include the `/v1` (OpenAI format) or `/anthropic` (Anthropic format) suffix as appropriate.
|
||||||
|
|
||||||
|
### `LLM_API_KEY`
|
||||||
|
|
||||||
|
- **Default:** `none`
|
||||||
|
- API key for the LLM provider. Set to `none` for local Ollama instances that don't require authentication.
|
||||||
|
|
||||||
|
### `LLM_MODEL`
|
||||||
|
|
||||||
|
- **Default:** `qwen2.5-7b-instruct.q4_k_m`
|
||||||
|
- The model name. Must match a model available on the target LLM backend.
|
||||||
|
|
||||||
|
### `LLM_TEMPERATURE`
|
||||||
|
|
||||||
|
- **Default:** `0.1`
|
||||||
|
- Sampling temperature (0.0–1.0). Lower values produce more deterministic outputs. A value around `0.1` is recommended for classification tasks.
|
||||||
|
|
||||||
|
### `LLM_TIMEOUT_SECONDS`
|
||||||
|
|
||||||
|
- **Default:** `60`
|
||||||
|
- Request timeout in seconds.
|
||||||
|
|
||||||
|
### `LLM_MAX_RETRIES`
|
||||||
|
|
||||||
|
- **Default:** `3`
|
||||||
|
- Maximum number of retries when a classification attempt fails to parse or returns an invalid result.
|
||||||
|
|
||||||
|
## Deduplication Settings
|
||||||
|
|
||||||
|
### `EMAIL_CLASSIFIER_DB_PATH`
|
||||||
|
|
||||||
|
- **Default:** `.data/email_classifier.db`
|
||||||
|
- Path to the SQLite database used for deduplication tracking. The directory will be created automatically.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Provider-Specific Examples
|
||||||
|
|
||||||
|
### Ollama (local, OpenAI-compatible)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export LLM_PROVIDER=openai
|
||||||
|
export LLM_BASE_URL=http://localhost:11434/v1
|
||||||
|
export LLM_API_KEY=none
|
||||||
|
export LLM_MODEL=qwen2.5-7b-instruct.q4_k_m
|
||||||
|
export LLM_TEMPERATURE=0.1
|
||||||
|
```
|
||||||
|
|
||||||
|
### MiniMax (Anthropic-compatible API)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export LLM_PROVIDER=anthropic
|
||||||
|
export LLM_BASE_URL=https://api.minimax.io/anthropic
|
||||||
|
export LLM_API_KEY=your_minimax_key
|
||||||
|
export LLM_MODEL=MiniMax-M2.7
|
||||||
|
export LLM_TEMPERATURE=0.1
|
||||||
|
```
|
||||||
|
|
||||||
|
### LM Studio (local, OpenAI-compatible)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export LLM_PROVIDER=openai
|
||||||
|
export LLM_BASE_URL=http://localhost:1234/v1
|
||||||
|
export LLM_API_KEY=none
|
||||||
|
export LLM_MODEL=your-loaded-model-name
|
||||||
|
export LLM_TEMPERATURE=0.1
|
||||||
|
```
|
||||||
|
|
||||||
|
### OpenAI (cloud)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export LLM_PROVIDER=openai
|
||||||
|
export LLM_BASE_URL=https://api.openai.com/v1
|
||||||
|
export LLM_API_KEY=sk-...
|
||||||
|
export LLM_MODEL=gpt-4o-mini
|
||||||
|
export LLM_TEMPERATURE=0.1
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Per-Request Overrides
|
||||||
|
|
||||||
|
Any LLM setting can be overridden per-request by passing the field in the request body. This is useful when a single client needs to route to different providers dynamically (e.g., different email accounts with different LLM backends).
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"email_data": { "subject": "...", "body": "..." },
|
||||||
|
"provider": "anthropic",
|
||||||
|
"base_url": "https://api.minimax.io/anthropic",
|
||||||
|
"api_key": "minimax_key_here",
|
||||||
|
"model": "MiniMax-M2.7"
|
||||||
|
}
|
||||||
|
```
|
||||||
140
docs/deployment.md
Normal file
140
docs/deployment.md
Normal file
@@ -0,0 +1,140 @@
|
|||||||
|
# Deployment
|
||||||
|
|
||||||
|
## Docker
|
||||||
|
|
||||||
|
The service ships with a `Dockerfile` based on `python:3.12-slim-bookworm` using [uv](https://astral.sh/uv/) for fast dependency installation.
|
||||||
|
|
||||||
|
### Configuration sources
|
||||||
|
|
||||||
|
The application now supports two configuration sources:
|
||||||
|
- environment variables
|
||||||
|
- a YAML config file
|
||||||
|
|
||||||
|
Load order:
|
||||||
|
1. per-request overrides
|
||||||
|
2. environment variables
|
||||||
|
3. YAML config file
|
||||||
|
4. built-in defaults
|
||||||
|
|
||||||
|
Supported config file locations:
|
||||||
|
- `config.yml`
|
||||||
|
- `config.yaml`
|
||||||
|
- `/config/config.yml`
|
||||||
|
- `/config/config.yaml`
|
||||||
|
|
||||||
|
You can also set an explicit config path with:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export EMAIL_CLASSIFIER_CONFIG=/path/to/config.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
Example `config.yml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
llm:
|
||||||
|
provider: anthropic
|
||||||
|
base_url: https://api.minimax.io/anthropic
|
||||||
|
api_key: your_api_key_here
|
||||||
|
model: MiniMax-M2.7
|
||||||
|
temperature: 0.1
|
||||||
|
timeout_seconds: 60
|
||||||
|
max_retries: 3
|
||||||
|
```
|
||||||
|
|
||||||
|
### Building
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker build -t email-classifier .
|
||||||
|
```
|
||||||
|
|
||||||
|
### Running
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -d --name email-classifier \
|
||||||
|
-p 7999:7999 \
|
||||||
|
-e EMAIL_CLASSIFIER_CONFIG=/config/config.yml \
|
||||||
|
-e EMAIL_CLASSIFIER_DB_PATH=/data/email_classifier.db \
|
||||||
|
-v /path/to/config.yml:/config/config.yml:ro \
|
||||||
|
-v /path/to/local/data:/data \
|
||||||
|
email-classifier
|
||||||
|
```
|
||||||
|
|
||||||
|
Mount a persistent volume for `/data` (or wherever `EMAIL_CLASSIFIER_DB_PATH` points) to preserve the dedupe database across container restarts.
|
||||||
|
|
||||||
|
Environment variables still override file-based config, so you can keep most settings in YAML and override just one or two values at deploy time.
|
||||||
|
|
||||||
|
## Docker Compose example
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
email-classifier:
|
||||||
|
image: your-registry.example.com/your-org/email-classifier:latest
|
||||||
|
container_name: email-classifier
|
||||||
|
ports:
|
||||||
|
- "7999:7999"
|
||||||
|
environment:
|
||||||
|
EMAIL_CLASSIFIER_CONFIG: /config/config.yml
|
||||||
|
EMAIL_CLASSIFIER_DB_PATH: /data/email_classifier.db
|
||||||
|
# Optional overrides. Env vars win over YAML values.
|
||||||
|
# LLM_MODEL: MiniMax-M2.7
|
||||||
|
# LLM_TIMEOUT_SECONDS: "90"
|
||||||
|
volumes:
|
||||||
|
- ./config.yml:/config/config.yml:ro
|
||||||
|
- ./data:/data
|
||||||
|
restart: unless-stopped
|
||||||
|
# If your LLM backend runs on the Docker host, one option is:
|
||||||
|
# extra_hosts:
|
||||||
|
# - "host.docker.internal:host-gateway"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Compose notes
|
||||||
|
|
||||||
|
- Mount the YAML config read-only into the container, typically at `/config/config.yml`
|
||||||
|
- Mount a writable volume for `/data` so dedupe state survives restarts
|
||||||
|
- Override specific values with environment variables when needed
|
||||||
|
- If the LLM backend is another container on the same Compose network, use its service name in `base_url`
|
||||||
|
- If the LLM backend runs on the host, use `host.docker.internal` or a host-gateway mapping where appropriate
|
||||||
|
|
||||||
|
## Building for a Remote Registry
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker build -t \
|
||||||
|
your-registry.example.com/your-org/email-classifier:latest \
|
||||||
|
.
|
||||||
|
|
||||||
|
docker push your-registry.example.com/your-org/email-classifier:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
## GitHub Actions CI/CD
|
||||||
|
|
||||||
|
The repository includes a workflow at `.github/workflows/build-publish.yaml` that builds and pushes a Docker image on every push to `main`.
|
||||||
|
|
||||||
|
### Required Secrets
|
||||||
|
|
||||||
|
Configure these in your GitHub/Gitea Actions secrets:
|
||||||
|
|
||||||
|
| Secret | Description |
|
||||||
|
|---|---|
|
||||||
|
| `DOCKER_REGISTRY` | Registry hostname (e.g., `ghcr.io` or your custom registry) |
|
||||||
|
| `DOCKER_USERNAME` | Registry username |
|
||||||
|
| `DOCKER_PASSWORD` | Registry password or access token |
|
||||||
|
|
||||||
|
The workflow tags the image as:
|
||||||
|
- `:latest` — always points to the latest commit on `main`
|
||||||
|
- `:<sha>` — the full git SHA of the triggering commit (useful for rollbacks)
|
||||||
|
|
||||||
|
### Deployment Considerations
|
||||||
|
|
||||||
|
- **Network access** — The container needs to reach your LLM backend. If using Ollama or another service on the host, use `host.docker.internal` or an explicit host-gateway mapping.
|
||||||
|
- **Dedupe persistence** — Mount a volume for the SQLite database to persist dedupe state across deploys.
|
||||||
|
- **Port** — The container exposes port `7999`. Map it to any host port you prefer.
|
||||||
|
- **Health check** — The service does not currently expose a dedicated `/health` endpoint. Use `GET /docs` as a liveness probe.
|
||||||
|
|
||||||
|
## Production Checklist
|
||||||
|
|
||||||
|
- [ ] Provide either a YAML config file or the required `LLM_*` environment variables
|
||||||
|
- [ ] Use HTTPS for remote `LLM_BASE_URL` values in production
|
||||||
|
- [ ] Mount a persistent volume for `EMAIL_CLASSIFIER_DB_PATH`
|
||||||
|
- [ ] Set appropriate resource limits (CPU/memory) on the container
|
||||||
|
- [ ] Configure `LLM_MAX_RETRIES` and `LLM_TIMEOUT_SECONDS` to suit your LLM backend's reliability
|
||||||
|
- [ ] Keep `LLM_TEMPERATURE` low for consistent classification results
|
||||||
61
docs/index.md
Normal file
61
docs/index.md
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
# email-classifier
|
||||||
|
|
||||||
|
FastAPI service that classifies emails using a configurable LLM backend. It accepts Outlook-shaped email JSON payloads, extracts structured classification data, and tracks duplicate classifications using a local SQLite dedupe store.
|
||||||
|
|
||||||
|
## Purpose
|
||||||
|
|
||||||
|
This service is designed to help workflow systems (e.g., Todoist ticket creation) automatically process incoming emails by:
|
||||||
|
|
||||||
|
- Determining whether an email requires action
|
||||||
|
- Extracting priority, category, suggested task title/notes, people, organizations, and deadlines
|
||||||
|
- Deduplicating repeated emails based on Outlook message ID, conversation ID, or content fingerprinting
|
||||||
|
|
||||||
|
## Key Features
|
||||||
|
|
||||||
|
- **Configurable LLM providers** — OpenAI-compatible (Ollama, LM Studio, OpenAI) or Anthropic-compatible (MiniMax, Anthropic API)
|
||||||
|
- **Outlook-shaped input** — Accepts native Microsoft Graph API email payloads with no transformation required
|
||||||
|
- **Simplified input** — Also accepts a minimal `email_data` shape with just `subject` and `body`
|
||||||
|
- **Deduplication** — Local SQLite store tracks seen emails by message ID, conversation ID, or content fingerprint
|
||||||
|
- **Structured extraction** — Returns classification, priority, suggested task title/notes, people, organizations, deadlines, and more
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
email-classifier/
|
||||||
|
├── app/
|
||||||
|
│ ├── main.py # FastAPI app entry point
|
||||||
|
│ ├── config.py # Pydantic settings from environment variables
|
||||||
|
│ ├── classifier.py # Core classification orchestration
|
||||||
|
│ ├── llm_adapters.py # OpenAI- and Anthropic-compatible adapter layer
|
||||||
|
│ ├── models.py # Pydantic request/response models
|
||||||
|
│ ├── prompts.py # System prompt sent to the LLM
|
||||||
|
│ ├── sync.py # Deduplication logic and content fingerprinting
|
||||||
|
│ ├── dedupe_store.py # SQLite persistence for dedupe tracking
|
||||||
|
│ ├── routers/
|
||||||
|
│ │ └── classify_email.py # /classify POST endpoint
|
||||||
|
│ └── helpers/
|
||||||
|
│ ├── clean_email_html.py
|
||||||
|
│ ├── extract_latest_message.py
|
||||||
|
│ └── remove_disclaimer.py
|
||||||
|
├── docs/ # MkDocs documentation (this site)
|
||||||
|
├── Dockerfile
|
||||||
|
├── pyproject.toml
|
||||||
|
└── uv.lock
|
||||||
|
```
|
||||||
|
|
||||||
|
## Output Classification Schema
|
||||||
|
|
||||||
|
Emails are classified into one of these categories:
|
||||||
|
|
||||||
|
| Category | Description |
|
||||||
|
|---|---|
|
||||||
|
| `action_required` | Direct request requiring user action |
|
||||||
|
| `question` | Question needing a response |
|
||||||
|
| `fyi` | Informational, no reply needed |
|
||||||
|
| `newsletter` | Newsletter or publication |
|
||||||
|
| `promotional` | Marketing or sales outreach |
|
||||||
|
| `automated` | Automated system notification |
|
||||||
|
| `alert` | I.T. or security alert |
|
||||||
|
| `uncategorized` | Fallback when classification fails |
|
||||||
|
|
||||||
|
Priority is one of: `high`, `medium`, `low`.
|
||||||
52
docs/quirks.md
Normal file
52
docs/quirks.md
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
# Known Quirks
|
||||||
|
|
||||||
|
## MiniMax Base URL
|
||||||
|
|
||||||
|
MiniMax uses an **Anthropic-compatible** API endpoint that is **different** from the standard OpenAI-compatible path. Using the wrong URL will result in silent failures or 404 errors.
|
||||||
|
|
||||||
|
**Correct MiniMax configuration:**
|
||||||
|
```bash
|
||||||
|
export LLM_PROVIDER=anthropic
|
||||||
|
export LLM_BASE_URL=https://api.minimax.io/anthropic
|
||||||
|
export LLM_MODEL=MiniMax-M2.7
|
||||||
|
```
|
||||||
|
|
||||||
|
**Incorrect (common mistake):**
|
||||||
|
```bash
|
||||||
|
# Wrong — this is the OpenAI-compatible path, not the Anthropic path
|
||||||
|
export LLM_BASE_URL=https://api.minimax.io/v1
|
||||||
|
```
|
||||||
|
|
||||||
|
MiniMax's Anthropic-compatible endpoint is at `/anthropic`, not `/v1`. Always verify the correct endpoint in your provider's documentation.
|
||||||
|
|
||||||
|
## Per-Request `api_key` Exclusion
|
||||||
|
|
||||||
|
The `api_key` field in a request body is excluded from all logging and dedupe storage (`exclude=True` in the Pydantic model). However, it is still transmitted to the LLM adapter in plaintext during the request. Do not send request bodies containing an `api_key` over untrusted networks.
|
||||||
|
|
||||||
|
## SQLite Dedupe Database Path
|
||||||
|
|
||||||
|
The dedupe database path is relative to the **working directory** where the process starts, not relative to the application code. If you run the service from different directories, you may end up with multiple databases.
|
||||||
|
|
||||||
|
Always set `EMAIL_CLASSIFIER_DB_PATH` to an absolute path when running in production:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export EMAIL_CLASSIFIER_DB_PATH=/data/email_classifier.db
|
||||||
|
```
|
||||||
|
|
||||||
|
## Classification Retries
|
||||||
|
|
||||||
|
The classifier **retries** when `needs_action=true` but `task_description` is missing (an invalid state). This means a flaky LLM that sometimes omits `task_description` will be called multiple times. If this causes issues (e.g., rate limiting), set `LLM_MAX_RETRIES=1`.
|
||||||
|
|
||||||
|
## HTML Body Processing
|
||||||
|
|
||||||
|
The service strips disclaimers and cleans HTML from email bodies before sending to the LLM. This cleaning is aggressive and may remove legitimate content from HTML produced by some email clients. There is currently no way to disable this cleaning step.
|
||||||
|
|
||||||
|
## No Authentication
|
||||||
|
|
||||||
|
The service has **no built-in authentication**. It is designed to run behind a reverse proxy (nginx, Caddy, etc.) that handles auth. Do not expose port `7999` directly to the internet.
|
||||||
|
|
||||||
|
## Dedupe Fingerprinting Limitations
|
||||||
|
|
||||||
|
The fingerprint-based dedupe fallback is **heuristic**, not exact. It uses a normalized subject + body preview + first 2000 characters of the cleaned body. Minor edits to an email (rewording, adding a signature line) can produce a different fingerprint and cause the email to be treated as `new` rather than `duplicate`. Conversely, very similar emails from different senders may collide.
|
||||||
|
|
||||||
|
For strict deduplication, rely on `message_id` (exact Outlook message ID match) or `conversation_id` (thread grouping) rather than fingerprint.
|
||||||
67
docs/setup.md
Normal file
67
docs/setup.md
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
# Setup & Installation
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- Python 3.12+
|
||||||
|
- [uv](https://astral.sh/uv/) package manager
|
||||||
|
- An LLM backend (Ollama, LM Studio, MiniMax, OpenAI, or any OpenAI/Anthropic-compatible API)
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Clone the repository
|
||||||
|
git clone https://git.danhenry.dev/daniel/email-classifier.git
|
||||||
|
cd email-classifier
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
uv sync
|
||||||
|
|
||||||
|
# Start the server
|
||||||
|
uv run uvicorn app.main:app --host 0.0.0.0 --port 7999
|
||||||
|
```
|
||||||
|
|
||||||
|
The API will be available at `http://localhost:7999`. Auto-generated API docs are at `http://localhost:7999/docs` (Swagger UI) and `http://localhost:7999/redoc`.
|
||||||
|
|
||||||
|
## Environment Variables
|
||||||
|
|
||||||
|
The service is configured entirely through environment variables. See [Configuration](configuration.md) for the full reference.
|
||||||
|
|
||||||
|
A minimal `.env` file for local development with Ollama:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
LLM_PROVIDER=openai
|
||||||
|
LLM_BASE_URL=http://localhost:11434/v1
|
||||||
|
LLM_API_KEY=none
|
||||||
|
LLM_MODEL=qwen2.5-7b-instruct.q4_k_m
|
||||||
|
LLM_TEMPERATURE=0.1
|
||||||
|
```
|
||||||
|
|
||||||
|
## Using Docker
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Build the image
|
||||||
|
docker build -t email-classifier .
|
||||||
|
|
||||||
|
# Run the container
|
||||||
|
docker run -p 7999:7999 \
|
||||||
|
-e LLM_PROVIDER=openai \
|
||||||
|
-e LLM_BASE_URL=http://host.docker.internal:11434/v1 \
|
||||||
|
-e LLM_API_KEY=none \
|
||||||
|
-e LLM_MODEL=qwen2.5-7b-instruct.q4_k_m \
|
||||||
|
email-classifier
|
||||||
|
```
|
||||||
|
|
||||||
|
## Dependency Management
|
||||||
|
|
||||||
|
This project uses [uv](https://astral.sh/uv/) for dependency management. Do not use `pip` directly.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Add a new dependency
|
||||||
|
uv add <package>
|
||||||
|
|
||||||
|
# Sync dependencies (after pulling changes)
|
||||||
|
uv sync
|
||||||
|
|
||||||
|
# Run with uv (recommended)
|
||||||
|
uv run uvicorn app.main:app --reload
|
||||||
|
```
|
||||||
114
docs/testing.md
Normal file
114
docs/testing.md
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
# Testing Locally
|
||||||
|
|
||||||
|
## Running the Server
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd email-classifier
|
||||||
|
uv sync
|
||||||
|
uv run uvicorn app.main:app --host 0.0.0.0 --port 7999 --reload
|
||||||
|
```
|
||||||
|
|
||||||
|
The server starts on port **7999** by default. Access the API docs at:
|
||||||
|
- Swagger UI: `http://localhost:7999/docs`
|
||||||
|
- ReDoc: `http://localhost:7999/redoc`
|
||||||
|
|
||||||
|
## Sending Test Requests
|
||||||
|
|
||||||
|
### With `curl`
|
||||||
|
|
||||||
|
**Simplified request:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:7999/classify \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"email_data": {
|
||||||
|
"subject": "Printer issue in MB building",
|
||||||
|
"body": "Hi, the printer on floor 2 is not working. Can someone take a look?"
|
||||||
|
},
|
||||||
|
"id": "test-001",
|
||||||
|
"conversationId": "test-conv-001"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Full Outlook-shaped request:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:7999/classify \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"id": "AAMkAD...",
|
||||||
|
"conversationId": "AAQkAD...",
|
||||||
|
"subject": "VPN is down",
|
||||||
|
"body": {
|
||||||
|
"contentType": "html",
|
||||||
|
"content": "<html><body>Users are reporting VPN connectivity issues.</body></html>"
|
||||||
|
},
|
||||||
|
"sender": {
|
||||||
|
"emailAddress": {
|
||||||
|
"name": "Jane Smith",
|
||||||
|
"address": "jane.smith@grandportage.com"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"from": {
|
||||||
|
"emailAddress": {
|
||||||
|
"name": "Jane Smith",
|
||||||
|
"address": "jane.smith@grandportage.com"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"toRecipients": [
|
||||||
|
{
|
||||||
|
"emailAddress": {
|
||||||
|
"name": "IT Helpdesk",
|
||||||
|
"address": "helpdesk@grandportage.com"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"ccRecipients": [],
|
||||||
|
"receivedDateTime": "2026-04-09T10:00:00Z",
|
||||||
|
"sentDateTime": "2026-04-09T09:55:00Z",
|
||||||
|
"importance": "high"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### With the Swagger UI
|
||||||
|
|
||||||
|
Open `http://localhost:7999/docs`, click **POST /classify**, click **Try it out**, paste your JSON payload, and click **Execute**.
|
||||||
|
|
||||||
|
## Running Tests
|
||||||
|
|
||||||
|
This project does not currently include a test suite. To add tests, use `pytest`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv add --dev pytest pytest-asyncio httpx
|
||||||
|
uv run pytest
|
||||||
|
```
|
||||||
|
|
||||||
|
## Verifying Deduplication
|
||||||
|
|
||||||
|
The dedupe store is a SQLite database at `.data/email_classifier.db`. You can inspect it directly:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sqlite3 .data/email_classifier.db ".schema classification_dedupe"
|
||||||
|
sqlite3 .data/email_classifier.db "SELECT * FROM classification_dedupe LIMIT 10;"
|
||||||
|
```
|
||||||
|
|
||||||
|
To reset deduplication state between tests:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
rm .data/email_classifier.db
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing with Different LLM Providers
|
||||||
|
|
||||||
|
Start the server with a specific provider:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
LLM_PROVIDER=anthropic \
|
||||||
|
LLM_BASE_URL=https://api.minimax.io/anthropic \
|
||||||
|
LLM_API_KEY=your_key \
|
||||||
|
LLM_MODEL=MiniMax-M2.7 \
|
||||||
|
uv run uvicorn app.main:app --reload
|
||||||
|
```
|
||||||
|
|
||||||
|
Or override per-request by including `provider`, `base_url`, `model`, and `api_key` in the request body.
|
||||||
49
mkdocs.yml
Normal file
49
mkdocs.yml
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
site_name: email-classifier
|
||||||
|
site_description: FastAPI service that classifies email using a configurable LLM backend
|
||||||
|
site_url: https://git.danhenry.dev/daniel/email-classifier
|
||||||
|
|
||||||
|
docs_dir: docs
|
||||||
|
exclude_docs: |
|
||||||
|
venv/
|
||||||
|
.venv/
|
||||||
|
__pycache__/
|
||||||
|
|
||||||
|
repo_name: daniel/email-classifier
|
||||||
|
repo_url: https://git.danhenry.dev/daniel/email-classifier
|
||||||
|
|
||||||
|
nav:
|
||||||
|
- Home: index.md
|
||||||
|
- Setup: setup.md
|
||||||
|
- API Reference: api.md
|
||||||
|
- Configuration: configuration.md
|
||||||
|
- Testing Locally: testing.md
|
||||||
|
- Deployment: deployment.md
|
||||||
|
- Known Quirks: quirks.md
|
||||||
|
|
||||||
|
theme:
|
||||||
|
name: material
|
||||||
|
palette:
|
||||||
|
- scheme: default
|
||||||
|
primary: indigo
|
||||||
|
accent: indigo
|
||||||
|
toggle:
|
||||||
|
icon: material/brightness-7
|
||||||
|
name: Switch to dark mode
|
||||||
|
- scheme: slate
|
||||||
|
primary: indigo
|
||||||
|
accent: indigo
|
||||||
|
toggle:
|
||||||
|
icon: material/brightness-4
|
||||||
|
name: Switch to light mode
|
||||||
|
features:
|
||||||
|
- navigation.instant
|
||||||
|
- navigation.tracking
|
||||||
|
- content.code.copy
|
||||||
|
|
||||||
|
markdown_extensions:
|
||||||
|
- pymdownx.highlight:
|
||||||
|
anchor_linenums: true
|
||||||
|
- pymdownx.superfences
|
||||||
|
- admonition
|
||||||
|
- toc:
|
||||||
|
permalink: true
|
||||||
@@ -9,5 +9,6 @@ dependencies = [
|
|||||||
"beautifulsoup4>=4.14.3",
|
"beautifulsoup4>=4.14.3",
|
||||||
"fastapi>=0.128.0",
|
"fastapi>=0.128.0",
|
||||||
"openai>=2.16.0",
|
"openai>=2.16.0",
|
||||||
|
"PyYAML>=6.0.2",
|
||||||
"uvicorn>=0.40.0",
|
"uvicorn>=0.40.0",
|
||||||
]
|
]
|
||||||
|
|||||||
Reference in New Issue
Block a user