Add eml-to-text conversion script
This commit is contained in:
parent
39f1f73e2a
commit
e4754c9bdc
2 changed files with 49 additions and 0 deletions
48
clean_eml.py
Normal file
48
clean_eml.py
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
# clean_eml.py
|
||||
#
|
||||
# Convert .eml files to plain text files for use with build.py.
|
||||
# Place .eml files in ./eml, then run this script to produce
|
||||
# dated .txt files in ./data.
|
||||
#
|
||||
# August 2025
|
||||
# E. M. Furst
|
||||
|
||||
from email import policy
|
||||
from email.parser import BytesParser
|
||||
from pathlib import Path
|
||||
from dateutil import parser
|
||||
from dateutil import tz
|
||||
|
||||
eml_dir = "eml"
out_dir = "data"

# Make sure the output directory exists so the first write cannot fail
# just because ./data has not been created yet.
Path(out_dir).mkdir(parents=True, exist_ok=True)

# Convert every .eml file in eml_dir into a text file in out_dir. Each
# output file is named after the message's send time in the local timezone.
for eml_file in Path(eml_dir).glob("*.eml"):
    with open(eml_file, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # Get metadata
    subject = msg.get("subject", "No Subject")
    raw_date = msg.get("date")

    # The date names the output file, so a message without a usable Date
    # header cannot be converted. Skip it instead of aborting the whole run.
    if raw_date is None:
        print(f"Skipping {eml_file}: no Date header")
        continue
    try:
        sent_at = parser.parse(str(raw_date))
    except (ValueError, OverflowError) as err:
        print(f"Skipping {eml_file}: cannot parse Date header ({err})")
        continue

    # Convert date to a safe format for filenames: YYYY_MM_DD_hhmmss
    # Naive timestamps are taken to be local time; aware ones are converted.
    if sent_at.tzinfo is None:
        sent_at = sent_at.replace(tzinfo=tz.tzlocal())
    sent_at = sent_at.astimezone(tz.tzlocal())
    msg_date = sent_at.strftime("%d/%m/%Y, %H:%M:%S")
    stamp = sent_at.strftime("%Y_%m_%d_%H%M%S")

    # Prefer plain text, fallback to HTML
    body_part = msg.get_body(preferencelist=('plain', 'html'))
    if body_part:
        body_content = body_part.get_content()
    else:
        # No text/plain or text/html part was found; use the raw payload.
        body_content = msg.get_payload()

    # Combine into a clean string with labels and newlines
    # NOTE(review): the "Date:" line carries the filename stamp, not the
    # human-readable msg_date. Kept as-is in case build.py relies on it.
    text = f"Subject: {subject}\nDate: {stamp}\n\n{body_content}"

    # write_text opens, writes and closes the file, so no handle is leaked.
    # NOTE(review): two messages sent in the same second map to the same
    # filename and the later one overwrites the earlier one.
    out_path = Path(out_dir) / f"{stamp}.txt"
    out_path.write_text(text, encoding="utf-8")

    print(msg_date)
|
||||
|
|
@ -2,3 +2,4 @@ llama-index-core
|
|||
llama-index-readers-file
|
||||
llama-index-llms-ollama
|
||||
llama-index-embeddings-huggingface
|
||||
python-dateutil
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue