# llm-workshop/03-rag/clean_eml.py

# clean_eml.py
#
# Convert .eml files to plain text files for use with build.py.
# Place .eml files in ./eml, then run this script to produce
# dated .txt files in ./data.
#
# August 2025
# E. M. Furst

import sys
from email import policy
from email.parser import BytesParser
from pathlib import Path

from dateutil import parser
from dateutil import tz

# Input and output directories, relative to the current working directory.
eml_dir = "eml"
out_dir = "data"

# Create the output directory up front so that a missing ./data does not
# make the first write fail.
Path(out_dir).mkdir(parents=True, exist_ok=True)

# Convert every .eml file in eml_dir to a text file in out_dir. Each output
# file is named after the message's Date header (converted to local time) and
# contains the subject, the date stamp, and the message body.
for eml_file in Path(eml_dir).glob("*.eml"):
    with open(eml_file, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # Get metadata
    subject = msg.get("subject", "No Subject")
    raw_date = msg.get("date")

    # The output filename is derived from the Date header, so a message whose
    # date is missing or unparseable cannot be named. Skip it with a warning
    # instead of aborting the whole run.
    if raw_date is None:
        print(f"Skipping {eml_file}: no Date header", file=sys.stderr)
        continue
    try:
        sent = parser.parse(raw_date)
    except (ValueError, OverflowError) as err:
        print(
            f"Skipping {eml_file}: unparseable Date header ({err})",
            file=sys.stderr,
        )
        continue

    # Normalize to the local time zone; a date without zone information is
    # treated as already being local time.
    if sent.tzinfo is None:
        sent = sent.replace(tzinfo=tz.tzlocal())
    sent = sent.astimezone(tz.tzlocal())

    # Human-readable form for the progress output.
    msg_date = sent.strftime("%d/%m/%Y, %H:%M:%S")
    # Filename-safe form: YYYY_MM_DD_hhmmss
    stamp = sent.strftime("%Y_%m_%d_%H%M%S")

    # Prefer plain text, fallback to HTML.
    # Compare against None explicitly: a message object's truthiness is its
    # header count, so a valid part without headers would test as False.
    body_part = msg.get_body(preferencelist=('plain', 'html'))
    if body_part is not None:
        body_content = body_part.get_content()
    else:
        # No text body was found. For a multipart message get_payload()
        # returns a list of parts rather than text, so use an empty body
        # instead of writing object reprs into the output.
        payload = msg.get_payload()
        body_content = payload if isinstance(payload, str) else ""

    # Combine into a clean string with labels and newlines.
    # The Date line carries the same stamp that is used for the filename.
    text = f"Subject: {subject}\nDate: {stamp}\n\n{body_content}"

    # The context manager closes (and flushes) the file on every iteration.
    out_path = Path(out_dir) / f"{stamp}.txt"
    with out_path.open("w", encoding="utf-8") as out_file:
        out_file.write(text)

    print(msg_date)