import re

# Markers that introduce quoted or forwarded history in an email thread.
# Each pattern is compiled once at import time; all are matched
# case-insensitively, and DOTALL lets `.` cross line breaks so headers that
# wrap over several lines are still recognised.
_REPLY_SEPARATORS = tuple(
    re.compile(pattern, re.IGNORECASE | re.DOTALL)
    for pattern in (
        # Outlook-style header block ("From: ... Sent: ... To:/Subject:").
        # NOTE(review): the lazy gaps are unbounded, so a "From:" in the new
        # message followed much later by "Sent:" and "To:"/"Subject:" also
        # matches. Left unchanged to preserve existing matching; consider
        # bounding the gaps if false positives show up.
        r'From:.*?Sent:.*?(?:To:|Subject:)',
        # Horizontal rules made of 40+ underscores or dashes.
        r'_{40,}',
        r'-{40,}',
        # "-----Original Message-----" variants.
        r'[-]+\s*Original Message\s*[-]+',
        # Gmail/generic attribution line ("On <date> <sender> wrote:").
        r'On\s+.{0,100}?\s+wrote:',
        # "---------- Forwarded message ----------" variants.
        r'[-]+\s*Forwarded [Mm]essage\s*[-]+',
    )
)


def extract_latest_message(email, *, max_length=2000):
    """Extract only the most recent message from an email thread.

    The text is cut at the earliest position where any known reply or
    forward separator starts. The remaining text is stripped, runs of three
    or more newlines are collapsed to two, runs of spaces are collapsed to
    one, and the result is truncated to ``max_length`` characters.

    Args:
        email: Full text of the email thread. ``None`` and the empty string
            are treated as an empty message.
        max_length: Maximum number of characters to keep before ``"..."`` is
            appended. Pass ``None` ` to disable truncation. Defaults to 2000.

    Returns:
        The cleaned text of the latest message. This is an empty string when
        the input is empty or when a separator starts at the very beginning
        of the text.

    Raises:
        ValueError: If ``max_length`` is negative.
    """
    if max_length is not None and max_length < 0:
        raise ValueError("max_length must be non-negative or None")

    if not email:
        return ""

    # Find the earliest start among all separators. The full text is searched
    # for every pattern because a separator may start before the current cut
    # point yet extend past it.
    cut = len(email)
    for separator in _REPLY_SEPARATORS:
        match = separator.search(email)
        if match and match.start() < cut:
            cut = match.start()

    # Strip regardless of whether a separator was found, so the result has
    # the same shape for threads with and without quoted history.
    latest_text = email[:cut].strip()

    # Collapse excessive blank lines and runs of spaces.
    latest_text = re.sub(r'\n{3,}', '\n\n', latest_text)
    latest_text = re.sub(r' {2,}', ' ', latest_text)

    # Cap the length; the ellipsis signals that text was dropped.
    if max_length is not None and len(latest_text) > max_length:
        latest_text = latest_text[:max_length] + "..."

    return latest_text