# clean_eml.py
#
# Convert .eml files to plain text files for use with build.py.
# Place .eml files in ./eml, then run this script to produce
# dated .txt files in ./data.
#
# Each output file is named YYYY_MM_DD_hhmmss.txt (local time) and contains
# a "Subject:" line, a "Date:" line, a blank line, and the message body.
# Messages whose Date header is missing or cannot be parsed are skipped
# with a warning on stderr instead of aborting the whole run.
#
# August 2025
# E. M. Furst

from email import policy
from email.parser import BytesParser
from pathlib import Path
import sys

from dateutil import parser
from dateutil import tz

eml_dir = "eml"
out_dir = "data"

# Make sure the output directory exists so the first write cannot fail
# with FileNotFoundError on a fresh checkout.
Path(out_dir).mkdir(parents=True, exist_ok=True)

for eml_file in Path(eml_dir).glob("*.eml"):
    with open(eml_file, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # Get metadata
    subject = msg.get("subject", "No Subject")
    raw_date = msg.get("date")

    # The date drives the output filename, so a message without a usable
    # Date header cannot be converted; report it and move on.
    if raw_date is None:
        print(f"Skipping {eml_file}: no Date header", file=sys.stderr)
        continue
    try:
        when = parser.parse(raw_date)
    except (ValueError, OverflowError) as err:
        print(f"Skipping {eml_file}: cannot parse date ({err})",
              file=sys.stderr)
        continue

    # Treat naive timestamps as local time, then normalise everything
    # to the local timezone.
    if when.tzinfo is None:
        when = when.replace(tzinfo=tz.tzlocal())
    when = when.astimezone(tz.tzlocal())

    # Human-readable form for console output
    msg_date = when.strftime("%d/%m/%Y, %H:%M:%S")
    # Safe format for filenames: YYYY_MM_DD_hhmmss
    stamp = when.strftime("%Y_%m_%d_%H%M%S")

    # Prefer plain text, fallback to HTML
    body_part = msg.get_body(preferencelist=('plain', 'html'))
    if body_part:
        body_content = body_part.get_content()
    else:
        # No plain/html part found: fall back to the raw payload.
        body_content = msg.get_payload()

    # Combine into a clean string with labels and newlines.
    # The "Date:" line carries the same stamp used for the filename.
    text = f"Subject: {subject}\nDate: {stamp}\n\n{body_content}"

    # Use a context manager so the file is flushed and closed immediately
    # rather than whenever the handle happens to be garbage-collected.
    out_path = Path(out_dir) / f"{stamp}.txt"
    with out_path.open("w", encoding="utf-8") as out_file:
        out_file.write(text)

    print(f"{msg_date}")