Files
email-classifier/app/helpers/clean_email_html.py
Daniel Henry da6f623d38 Initial Commit
Signed-off-by: Daniel Henry <iamdanhenry@gmail.com>
2026-01-28 11:42:27 -06:00

23 lines
810 B
Python

from bs4 import BeautifulSoup
def clean_email_html(html_content: str) -> str:
    """Convert an HTML email body into plain, whitespace-normalized text.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend,
    the ``script``, ``style``, ``head``, ``title`` and ``meta`` elements
    are removed, and the remaining text is returned on a single line with
    words separated by exactly one space.

    Args:
        html_content: Raw HTML of the email. Any falsy value (``""`` or
            ``None``) is accepted and short-circuits to an empty result.

    Returns:
        The extracted text with every run of whitespace collapsed to one
        space and no leading or trailing whitespace; ``""`` when
        *html_content* is falsy.
    """
    if not html_content:
        return ""

    soup = BeautifulSoup(html_content, "html.parser")

    # Drop CSS/JS and document metadata; otherwise their contents would be
    # returned as if they were part of the message text.
    for element in soup(["script", "style", "head", "title", "meta"]):
        element.decompose()

    # Join text nodes with a space so words from adjacent tags are not
    # glued together.
    text = soup.get_text(separator=" ")

    # str.split() with no argument splits on any run of whitespace (spaces,
    # newlines, tabs, non-breaking spaces, ...) and discards empty pieces,
    # so this collapses whitespace that splitting on " " alone left behind.
    return " ".join(text.split())