# Python source, 46 lines, 1.4 KiB
import re
|
|
|
|
def extract_latest_message(email, *, max_length=2000):
    """Extract only the most recent message from an email thread.

    The thread is cut at the earliest recognised reply/forward separator
    (Outlook header block, long horizontal rule, "Original Message" or
    "Forwarded message" banner, or a Gmail-style "On ... wrote:" line).
    Everything before that point is kept, stripped, whitespace-normalised
    and truncated.

    Args:
        email: Full text of the email thread. ``None`` or an empty string
            yields an empty result instead of raising.
        max_length: Maximum number of characters to keep before ``"..."``
            is appended (so a truncated result is ``max_length + 3`` long).
            Expected to be non-negative. Pass ``None`` to disable
            truncation.

    Returns:
        The text of the latest message, or ``""`` when there is no input
        or the thread starts with a separator.
    """
    if not email:
        return ""

    # Reply separators. The position of the earliest match wins, so the
    # order here does not affect the result.
    separators = (
        # Outlook style header block
        r'From:.*?Sent:.*?(?:To:|Subject:)',
        # Horizontal rules
        r'_{40,}',
        r'-{40,}',
        # "Original Message" variants
        r'[-]+\s*Original Message\s*[-]+',
        # Gmail/generic style. The leading word boundary keeps the
        # case-insensitive "On" from matching the tail of words such as
        # "person" or "question".
        r'\bOn\s+.{0,100}?\s+wrote:',
        # Forwarded message
        r'[-]+\s*Forwarded [Mm]essage\s*[-]+',
    )

    # DOTALL lets multi-line header blocks and wrapped "On ... wrote:"
    # lines match across newlines.
    flags = re.IGNORECASE | re.DOTALL
    found = (re.search(pattern, email, flags) for pattern in separators)
    cut = min((m.start() for m in found if m), default=len(email))

    # Strip in every case so the result is consistent whether or not a
    # separator was found.
    latest_text = email[:cut].strip()

    # Remove excessive whitespace
    latest_text = re.sub(r'\n{3,}', '\n\n', latest_text)
    latest_text = re.sub(r' {2,}', ' ', latest_text)

    # Limit to a reasonable length
    if max_length is not None and len(latest_text) > max_length:
        latest_text = latest_text[:max_length] + "..."

    return latest_text