from bs4 import BeautifulSoup


def clean_email_html(html_content: str) -> str:
    """Convert an HTML email body into a single line of plain text.

    The markup is parsed with Beautiful Soup's ``html.parser`` backend,
    non-content elements are removed, and the remaining text is returned
    with every run of whitespace collapsed to a single space.

    Args:
        html_content: Raw HTML of the email. Any falsy value (``""``,
            ``None``) is treated as "no content".

    Returns:
        The visible text with leading/trailing whitespace removed and all
        internal whitespace runs (spaces, tabs, newlines, non-breaking
        spaces, ...) replaced by one space. ``""`` when there is no input.
    """
    if not html_content:
        return ""

    soup = BeautifulSoup(html_content, "html.parser")

    # Drop elements whose contents are code or metadata rather than message
    # text; otherwise downstream consumers (e.g. an AI model) would read raw
    # CSS/JS as if it were part of the email.
    for tag in soup(["script", "style", "head", "title", "meta"]):
        tag.decompose()

    # The separator keeps text from adjacent elements from being glued
    # together ("<b>a</b><i>b</i>" -> "a b", not "ab").
    text = soup.get_text(separator=" ")

    # str.split() with no argument splits on any run of Unicode whitespace
    # and discards empty pieces, so this also normalises tabs and
    # non-breaking spaces (&nbsp;), which splitting on "\n" and " " alone
    # would leave embedded in the output.
    return " ".join(text.split())