46
app/helpers/extract_latest_message.py
Normal file
46
app/helpers/extract_latest_message.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import re
|
||||
|
||||
# Patterns that mark where quoted or forwarded history begins in an email
# body. They are compiled once at import time. All are case-insensitive, and
# DOTALL lets the header-style separators span line breaks.
_SEPARATOR_FLAGS = re.IGNORECASE | re.DOTALL
_REPLY_SEPARATORS = tuple(
    re.compile(pattern, _SEPARATOR_FLAGS)
    for pattern in (
        # Outlook style header block (From: ... Sent: ... To:/Subject:)
        r'From:.*?Sent:.*?(?:To:|Subject:)',
        # Horizontal rules
        r'_{40,}',
        r'-{40,}',
        # "Original Message" variants
        r'[-]+\s*Original Message\s*[-]+',
        # Gmail/generic style. The leading \b stops the case-insensitive
        # "On" from matching the tail of ordinary words such as "reason"
        # or "decision", which would cut the newest message mid-word.
        r'\bOn\s+.{0,100}?\s+wrote:',
        # Forwarded message
        r'[-]+\s*Forwarded [Mm]essage\s*[-]+',
    )
)


def extract_latest_message(email, max_length=2000):
    """Extract only the most recent message from an email thread.

    The text is cut at the earliest position where any known reply or
    forward separator starts. The remaining text is stripped, runs of
    blank lines and spaces are collapsed, and the result is truncated.

    Args:
        email: Full text of the email thread. ``None`` or an empty string
            yields an empty result.
        max_length: Maximum number of characters kept before ``"..."`` is
            appended. Pass ``None`` to disable truncation.

    Returns:
        The cleaned text that precedes the first separator, or the whole
        cleaned text when no separator is found.
    """
    if not email:
        return ""

    text = email

    # Cut at whichever separator starts first; with no match keep everything.
    cut = min(
        (
            found.start()
            for found in (sep.search(text) for sep in _REPLY_SEPARATORS)
            if found
        ),
        default=len(text),
    )

    # Strip in both cases so the result does not depend on whether a
    # separator happened to be present.
    latest_text = text[:cut].strip()

    # Remove excessive whitespace
    latest_text = re.sub(r'\n{3,}', '\n\n', latest_text)
    latest_text = re.sub(r' {2,}', ' ', latest_text)

    # Limit to a reasonable length
    if max_length is not None and len(latest_text) > max_length:
        latest_text = latest_text[:max_length] + "..."

    return latest_text
|
||||
Reference in New Issue
Block a user