rag-demo/clean_eml.py

48 lines
1.4 KiB
Python

# clean_eml.py
#
# Convert .eml files to plain text files for use with build.py.
# Place .eml files in ./eml, then run this script to produce
# dated .txt files in ./data.
#
# August 2025
# E. M. Furst
from email import policy
from email.parser import BytesParser
from pathlib import Path
from dateutil import parser
from dateutil import tz
# Input and output directories, relative to the current working directory.
eml_dir = "eml"
out_dir = "data"

# Create the output directory up front so the first write cannot fail with
# FileNotFoundError on a fresh checkout.
Path(out_dir).mkdir(parents=True, exist_ok=True)

for eml_file in Path(eml_dir).glob("*.eml"):
    with open(eml_file, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # Get metadata
    subject = msg.get("subject", "No Subject")
    raw_date = msg.get("date")

    # The output filename is derived from the Date header, so a message
    # without a usable date cannot be converted. Report it and keep going
    # rather than aborting the whole run.
    if raw_date is None:
        print(f"Skipping {eml_file}: no Date header")
        continue
    try:
        sent = parser.parse(raw_date)
    except (ValueError, OverflowError) as err:
        # dateutil signals an unparseable string with ParserError, which is
        # a ValueError subclass; OverflowError covers out-of-range values.
        print(f"Skipping {eml_file}: cannot parse date {raw_date!r} ({err})")
        continue

    # Treat naive timestamps as local time, then normalize everything to the
    # local zone so filenames sort consistently.
    if sent.tzinfo is None:
        sent = sent.replace(tzinfo=tz.tzlocal())
    sent = sent.astimezone(tz.tzlocal())

    # Human-readable form for the progress output.
    msg_date = sent.strftime("%d/%m/%Y, %H:%M:%S")
    # Filename-safe form: YYYY_MM_DD_hhmmss
    date = sent.strftime("%Y_%m_%d_%H%M%S")

    # Prefer plain text, fallback to HTML
    body_part = msg.get_body(preferencelist=('plain', 'html'))
    if body_part:
        body_content = body_part.get_content()
    else:
        payload = msg.get_payload()
        # For a multipart message get_payload() returns a list of message
        # objects; formatting that list would write object reprs into the
        # text file, so fall back to an empty body instead.
        body_content = payload if isinstance(payload, str) else ""

    # Combine into a clean string with labels and newlines.
    # NOTE: the "Date:" line uses the filename-style stamp, exactly as the
    # original script did, so downstream consumers see unchanged output.
    text = f"Subject: {subject}\nDate: {date}\n\n{body_content}"

    # NOTE: two messages with the same timestamp (to the second) map to the
    # same filename; the later one overwrites the earlier one.
    out_path = Path(out_dir) / f"{date}.txt"
    # Use a context manager so the file is flushed and closed immediately.
    with out_path.open("w", encoding="utf-8") as out_file:
        out_file.write(text)

    print(f"{msg_date}")