Files
email-classifier/app/helpers/extract_latest_message.py
Daniel Henry da6f623d38 Initial Commit
Signed-off-by: Daniel Henry <iamdanhenry@gmail.com>
2026-01-28 11:42:27 -06:00

46 lines
1.4 KiB
Python

import re
# Reply/forward separators, compiled once at import time. Every pattern is
# searched and the earliest hit in the text wins, so the order of this list
# does not influence the result.
_REPLY_SEPARATORS = tuple(
    re.compile(pattern, re.IGNORECASE | re.DOTALL)
    for pattern in (
        # Outlook-style quoted header block
        r'From:.*?Sent:.*?(?:To:|Subject:)',
        # Horizontal rules
        r'_{40,}',
        r'-{40,}',
        # "Original Message" variants
        r'[-]+\s*Original Message\s*[-]+',
        # Gmail/generic attribution line. The leading \b keeps an "on" that
        # merely ends a word (e.g. "Jason wrote:") from starting a match.
        r'\bOn\s+.{0,100}?\s+wrote:',
        # Forwarded message
        r'[-]+\s*Forwarded [Mm]essage\s*[-]+',
    )
)


def extract_latest_message(email, max_length=2000):
    """Extract only the most recent message from an email thread.

    The text is cut at the earliest reply/forward separator (Outlook header
    block, long horizontal rule, "Original Message", "On ... wrote:",
    "Forwarded message"). Runs of three or more newlines are collapsed to a
    blank line and runs of spaces to a single space.

    Args:
        email: Plain-text email body (str). ``None`` or an empty string
            yields ``""``.
        max_length: Maximum number of characters to keep before ``"..."`` is
            appended. Pass ``None`` to disable truncation. Defaults to 2000.

    Returns:
        The cleaned text of the latest message, stripped of surrounding
        whitespace. If a separator sits at the very start of the text (for
        example a pure forward), nothing precedes it, so the whole text is
        used instead of returning an empty string.
    """
    if not email:
        return ""

    text = email

    # Position of the earliest separator, or end-of-text when none matches.
    hits = (pattern.search(text) for pattern in _REPLY_SEPARATORS)
    cut = min((hit.start() for hit in hits if hit), default=len(text))

    latest_text = text[:cut].strip()
    if not latest_text:
        # Separator at the top of the message: there is no newer text above
        # it, so keep the full content rather than discarding everything.
        latest_text = text.strip()

    # Remove excessive whitespace
    latest_text = re.sub(r'\n{3,}', '\n\n', latest_text)
    latest_text = re.sub(r' {2,}', ' ', latest_text)

    # Limit to a reasonable length
    if max_length is not None and len(latest_text) > max_length:
        latest_text = latest_text[:max_length] + "..."

    return latest_text