23 lines
810 B
Python
23 lines
810 B
Python
from bs4 import BeautifulSoup
|
|
|
|
def clean_email_html(html_content: str) -> str:
    """Convert an HTML email body into whitespace-normalised plain text.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend,
    non-content elements are removed, and the remaining text is returned
    on a single line with one space between words.

    Args:
        html_content: Raw HTML of the email. Any falsy value (empty
            string, ``None``) short-circuits to an empty result.

    Returns:
        The visible text with every run of whitespace collapsed to a
        single space and no leading or trailing whitespace, or ``""``
        when there is no input.
    """
    if not html_content:
        return ""

    soup = BeautifulSoup(html_content, "html.parser")

    # Drop elements whose contents are code or metadata rather than
    # message text; otherwise the downstream AI reads CSS/JS as prose.
    for tag in soup(["script", "style", "head", "title", "meta"]):
        tag.decompose()

    # The space separator keeps text from adjacent tags from fusing
    # together (e.g. "<b>a</b><i>b</i>" must not become "ab").
    text = soup.get_text(separator=" ")

    # str.split() with no argument splits on any run of whitespace
    # (spaces, newlines, tabs, non-breaking spaces, ...) and discards
    # empty pieces, so joining with a single space normalises everything.
    return " ".join(text.split())