Five modules covering nanoGPT, Ollama, RAG, semantic search, and neural networks. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
48 lines
1.4 KiB
Python
48 lines
1.4 KiB
Python
# clean_eml.py
#
# Convert .eml files to plain text files for use with build.py.
# Place .eml files in ./eml, then run this script to produce
# dated .txt files in ./data.
#
# August 2025
# E. M. Furst
|
|
|
|
import sys
from email import policy
from email.parser import BytesParser
from pathlib import Path

from dateutil import parser
from dateutil import tz
|
|
|
|
# Input directory holding the .eml files and output directory for the
# generated .txt files (both relative to the current working directory).
eml_dir = "eml"
out_dir = "data"

# Create the output directory up front so the first write cannot fail
# just because it does not exist yet.
Path(out_dir).mkdir(parents=True, exist_ok=True)

# Convert every .eml file in eml_dir into one text file in out_dir whose
# name is the message timestamp (local time) as YYYY_MM_DD_hhmmss.txt.
for eml_file in Path(eml_dir).glob("*.eml"):
    with open(eml_file, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # Get metadata
    subject = msg.get("subject", "No Subject")
    date_header = msg.get("date")

    # The output filename is derived from the Date header, so a message
    # without a usable date cannot be written; skip it rather than abort
    # the whole batch.
    if date_header is None:
        print(f"Skipping {eml_file}: no Date header", file=sys.stderr)
        continue
    try:
        date = parser.parse(date_header)
    except (ValueError, OverflowError):
        # dateutil raises ValueError (ParserError) or OverflowError on
        # input it cannot interpret.
        print(
            f"Skipping {eml_file}: unparseable Date header {date_header!r}",
            file=sys.stderr,
        )
        continue

    # Convert date to a safe format for filenames: YYYY_MM_DD_hhmmss
    # Naive timestamps are taken to be local time; aware ones are
    # converted to local time.
    if date.tzinfo is None:
        date = date.replace(tzinfo=tz.tzlocal())
    date = date.astimezone(tz.tzlocal())
    msg_date = date.strftime("%d/%m/%Y, %H:%M:%S")  # console output only
    date = date.strftime("%Y_%m_%d_%H%M%S")

    # Prefer plain text, fallback to HTML
    body_part = msg.get_body(preferencelist=('plain', 'html'))
    # Compare against None explicitly: a message part's len() is its header
    # count, so a valid part that carries no headers would be falsy.
    if body_part is not None:
        body_content = body_part.get_content()
    else:
        body_content = msg.get_payload()

    # Combine into a clean string with labels and newlines
    # (the Date line uses the same timestamp string as the filename).
    text = f"Subject: {subject}\nDate: {date}\n\n{body_content}"

    # NOTE: the filename has one-second resolution, so two messages with
    # the same timestamp will overwrite each other.
    # write_text opens, writes and closes the file in one step.
    (Path(out_dir) / f"{date}.txt").write_text(text, encoding="utf-8")

    print(f"{msg_date}")
|