Initial Commit

Signed-off-by: Daniel Henry <iamdanhenry@gmail.com>
2026-01-28 11:42:27 -06:00
commit da6f623d38
13 changed files with 604 additions and 0 deletions
--- a/app/helpers/clean_email_html.py
+++ b/app/helpers/clean_email_html.py
@@ -0,0 +1,23 @@
+from bs4 import BeautifulSoup
+
+def clean_email_html(html_content: str) -> str:
+    """Convert an HTML email body into a single line of plain text.
+
+    Non-content elements (script, style, head, title, meta) are removed
+    before text extraction; otherwise embedded CSS/JS would be read as
+    message text by the AI that consumes the result.
+
+    Args:
+        html_content: Raw HTML of the email body. May be empty or None.
+
+    Returns:
+        The remaining text with every run of whitespace collapsed to a
+        single space, or "" when html_content is falsy.
+    """
+    if not html_content:
+        return ""
+
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    # Drop non-content elements in place before extracting text.
+    for element in soup(["script", "style", "head", "title", "meta"]):
+        element.decompose()
+
+    # The separator keeps text from adjacent tags from being glued together.
+    text = soup.get_text(separator=" ")
+
+    # str.split() with no argument splits on any whitespace run, so tabs and
+    # non-breaking spaces (frequent in HTML mail) are collapsed as well, not
+    # only newlines and double spaces.
+    return " ".join(text.split())
--- a/app/helpers/extract_latest_message.py
+++ b/app/helpers/extract_latest_message.py
@@ -0,0 +1,46 @@
+import re
+
+def extract_latest_message(email):
+    """Extract only the most recent message from an email thread.
+
+    The text is cut at the earliest reply/forward separator found (Outlook
+    header block, long horizontal rule, "Original Message", "On ... wrote:",
+    "Forwarded message"). Whitespace runs are then reduced and the result is
+    capped at 2000 characters plus a trailing "...".
+
+    Args:
+        email: Plain-text email content. May be empty or None.
+
+    Returns:
+        The text preceding the first separator. If nothing precedes the
+        separator (for example a forward with no added comment), the full
+        text is kept instead of returning an empty string. Returns "" for
+        falsy input.
+    """
+    if not email:
+        return ""
+    text = email
+
+    # Reply separators (ordered by specificity)
+    separators = [
+        # Outlook style - most common
+        r'From:.*?Sent:.*?(?:To:|Subject:)',
+        # Horizontal rules
+        r'_{40,}',
+        r'-{40,}',
+        # "Original Message" variants
+        r'[-]+\s*Original Message\s*[-]+',
+        # Gmail/generic style
+        r'On\s+.{0,100}?\s+wrote:',
+        # Forwarded message
+        r'[-]+\s*Forwarded [Mm]essage\s*[-]+',
+    ]
+
+    # Start offset of every separator that occurs; the smallest one wins.
+    found = (re.search(pattern, text, re.IGNORECASE | re.DOTALL) for pattern in separators)
+    cut = min((match.start() for match in found if match), default=None)
+
+    latest_text = text if cut is None else text[:cut].strip()
+
+    # A separator at the very top would otherwise discard the whole email.
+    if not latest_text:
+        latest_text = text
+
+    # Remove excessive whitespace
+    latest_text = re.sub(r'\n{3,}', '\n\n', latest_text)
+    latest_text = re.sub(r' {2,}', ' ', latest_text)
+
+    # Limit to reasonable length (adjust as needed)
+    if len(latest_text) > 2000:
+        latest_text = latest_text[:2000] + "..."
+
+    return latest_text
--- a/app/helpers/remove_disclaimer.py
+++ b/app/helpers/remove_disclaimer.py
@@ -0,0 +1,13 @@
+import re
+
+def remove_disclaimer(text):
+    """Strip the "external sender" caution banner from an email body.
+
+    The banner is matched case-insensitively from "Caution: This email comes
+    from an external sender" through the nearest following "contact your IT
+    Department" (optional trailing period), with any whitespace, including
+    newlines, allowed between words. Text without the banner is only trimmed.
+
+    Returns:
+        The text with every banner occurrence removed and surrounding
+        whitespace stripped.
+    """
+    banner = re.compile(
+        r"Caution:\s+This\s+email\s+comes\s+from\s+an\s+external\s+sender.*?\s+contact\s+your\s+IT\s+Department\.?",
+        re.IGNORECASE | re.DOTALL,
+    )
+    without_banner = banner.sub("", text)
+    return without_banner.strip()
--- a/app/helpers/send_classify_request.py
+++ b/app/helpers/send_classify_request.py
@@ -0,0 +1,83 @@
+from openai import AsyncOpenAI
+from models import EmailData
+
+# Async OpenAI-SDK client pointed at a custom base_url instead of the SDK
+# default. NOTE(review): the hostname suggests an internal Ollama server and
+# api_key="none" looks like a placeholder for an unauthenticated endpoint --
+# confirm before reusing elsewhere.
+openai_client = AsyncOpenAI(
+    base_url="http://ollama.internal.henryhosted.com:9292/v1",
+    api_key="none"
+)
+
+# Model identifier passed as `model=` by send_classify_request.
+model = "qwen2.5-7b-instruct.q4_k_m"
+
+
+# System prompt for the classifier: rules, the required JSON schema, and
+# few-shot examples. Every example must use only categories listed in
+# OUTPUT FORMAT and must give a task_description when needs_action is true.
+system_prompt = """You are an email classification assistant. Your job is to analyze emails and determine if they need the user's attention and action.  The user works in the I.T. department of the Grand Portage tribal government.
+
+CLASSIFICATION RULES:
+
+1. NEEDS ATTENTION (create todo) if the email:
+   - Asks a direct question that requires a response
+   - Contains scheduling questions like "Does [day/time] work?", "Are you available?", "When can we meet?"
+   - Requests the user to do something (review, approve, provide info, attend meeting)
+   - Contains a deadline or time-sensitive request
+   - Is from a colleague/client discussing active work
+   - Reports an issue or problem that needs addressing
+   - Proposes specific dates/times and needs confirmation
+   - Is an automated alert from a system relevant to I.T.
+
+2. DOES NOT NEED ATTENTION (skip) if the email:
+   - Is a newsletter, marketing email, or webinar invitation
+   - Is from a person and is an FYI/informational with no action required
+   - Is promotional content or sales outreach
+   - Contains unsubscribe links or bulk sender indicators
+   - Is a simple acknowledgment ("got it", "thanks", "sounds good") with no questions
+
+3. SPECIAL CASES:
+   - Even if an email says "working on that" or similar, if it ALSO contains a question or proposal that needs response, mark as needs_action=true
+   - "Does [X] work?" or "When can you...?" ALWAYS needs a response, regardless of other content
+   - RE: threads can still need action if they contain unanswered questions
+
+OUTPUT FORMAT:
+You must respond with valid JSON only, no other text:
+{
+  "needs_action": true or false,
+  "category": "action_required" | "question" | "fyi" | "newsletter" | "promotional" | "automated" | "alert",
+  "priority": "high" | "medium" | "low",
+  "task_description": "Brief description of what to do (only if needs_action is true)",
+  "reasoning": "One sentence explaining your decision",
+  "confidence": "A number from 0 to 1 indicating how confident you are"
+}
+
+EXAMPLES:
+
+Email: "Subject: Q4 Budget Review\nHi Daniel, can you review the attached budget proposal and let me know your thoughts by Friday?"
+Output: {"needs_action": true, "category": "question", "priority": "high", "task_description": "Review Q4 budget proposal and respond by Friday", "reasoning": "Direct request with deadline", "confidence": 0.91}
+
+Email: "Subject: RE: Meeting\nWorking on that. Does Tuesday or Wednesday work for you?"
+Output: {"needs_action": true, "category": "question", "priority": "medium", "task_description": "Respond with availability for Tuesday or Wednesday", "reasoning": "Scheduling question requires response", "confidence": 0.85}
+
+Email: "Subject: RE: Issue\nThanks, I'll look into it and get back to you."
+Output: {"needs_action": false, "category": "fyi", "priority": "low", "task_description": null, "reasoning": "Status update with no questions or action needed", "confidence": 0.77}
+
+Email: "Subject: Join us for our exclusive webinar on cloud security\nRegister now for our upcoming webinar series..."
+Output: {"needs_action": false, "category": "promotional", "priority": "low", "task_description": null, "reasoning": "Marketing webinar invitation", "confidence": 0.81}
+
+Email: "Subject: Your order has shipped\nYour order #12345 has been dispatched and will arrive in 3-5 days."
+Output: {"needs_action": false, "category": "automated", "priority": "low", "task_description": null, "reasoning": "Automated shipping notification", "confidence": 0.72}
+
+Email: "Subject: Disk at 95 percent on hvs-internal-01\nThe hard disk on server hvs-internal-01 is at a critical level."
+Output: {"needs_action": true, "category": "alert", "priority": "medium", "task_description": "Investigate and free up disk space on hvs-internal-01", "reasoning": "Internal I.T. system alert", "confidence": 0.91}
+
+Now classify the following email:"""
+
+async def send_classify_request(email: EmailData):
+    """Ask the configured chat model to classify one email.
+
+    Sends the module-level system prompt followed by a single user message
+    built from the email's subject and body, and requests a JSON-object
+    response.
+
+    Args:
+        email: Object exposing `subject` and `body` attributes.
+
+    Returns:
+        The raw content of the first completion choice. It is returned
+        as-is; no JSON parsing or validation happens here.
+    """
+    user_message = f"Subject: {email.subject}\nBody: {email.body}"
+    conversation = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_message},
+    ]
+
+    completion = await openai_client.chat.completions.create(
+        model=model,
+        messages=conversation,
+        temperature=0.1,  # Keep it low so it follows rules strictly
+        response_format={"type": "json_object"},  # Important for newer local servers
+    )
+
+    first_choice = completion.choices[0]
+    return first_choice.message.content