Initial build out

This commit is contained in:
emfurst 2026-03-30 07:15:14 -04:00
commit 29215e2bd2
40 changed files with 2622 additions and 0 deletions

View file

@ -0,0 +1,142 @@
from pathlib import Path
import duckdb
import polars as pl
from admin_analytics import config
from admin_analytics.config import UD_UNITID
# Source-column candidates for the F1A (GASB / public-institution) finance file.
# Keys are the canonical field names; each value lists the CSV headers to try,
# in priority order — the first header present in the file is the one used.
F1A_COLUMN_VARIANTS: dict[str, list[str]] = {
    "unitid": ["UNITID"],
    # Two alternative headers are accepted for the expense total.
    "total_expenses": ["F1C191", "F1D02"],
    "instruction_expenses": ["F1C011"],
    "research_expenses": ["F1C021"],
    "public_service_expenses": ["F1C031"],
    "academic_support_expenses": ["F1C051"],
    "student_services_expenses": ["F1C061"],
    "institutional_support_expenses": ["F1C071"],
    "auxiliary_expenses": ["F1C111"],
    "hospital_expenses": ["F1C121"],
    "other_expenses": ["F1C141"],
    "salaries_wages": ["F1C192"],
    "benefits": ["F1C193"],
}
# Source-column candidates for the F2 (FASB / private-style) finance file.
# UD reports on this form despite being a public institution, so it has to be
# read alongside F1A. Same layout as the F1A table: canonical name -> headers
# to try in priority order.
F2_COLUMN_VARIANTS: dict[str, list[str]] = {
    "unitid": ["UNITID"],
    "total_expenses": ["F2E131"],
    "instruction_expenses": ["F2E011"],
    "research_expenses": ["F2E021"],
    "public_service_expenses": ["F2E031"],
    "academic_support_expenses": ["F2E041"],
    "student_services_expenses": ["F2E051"],
    "institutional_support_expenses": ["F2E061"],
    "auxiliary_expenses": ["F2E071"],
    # NOTE(review): the IPEDS F2 data dictionary appears to define F2E081 as
    # "net grant aid to students" and F2E091 as "hospital services" — confirm
    # against the dictionary for the years being loaded before relying on this.
    "hospital_expenses": ["F2E081"],
    "other_expenses": ["F2E121"],
    "salaries_wages": ["F2E132"],
    "benefits": ["F2E133"],
}
# Output column order for rows written to raw_ipeds_finance. Order matters:
# _load_file selects these columns in this order and then inserts with a
# positional "INSERT ... SELECT *".
CANONICAL_COLUMNS: list[str] = [
    "unitid",
    "year",
    "reporting_standard",
    "total_expenses",
    "instruction_expenses",
    "research_expenses",
    "public_service_expenses",
    "academic_support_expenses",
    "student_services_expenses",
    "institutional_support_expenses",
    "auxiliary_expenses",
    "hospital_expenses",
    "other_expenses",
    "salaries_wages",
    "benefits",
]
def _find_csv(component_dir: Path) -> Path | None:
csvs = [f for f in component_dir.glob("*.csv") if "dict" not in f.stem.lower()]
return csvs[0] if csvs else None
def _resolve_columns(df: pl.DataFrame, variants: dict) -> dict[str, str]:
    """Map each canonical name to the actual column of *df* that supplies it.

    *variants* maps a canonical name to an ordered list of candidate headers.
    Frame columns are compared by their upper-cased name; candidates are
    compared as written. The first candidate that is present wins, and the
    returned value is the column name exactly as it appears in *df*.
    Canonical names with no matching candidate are left out of the result.
    """
    # Upper-cased header -> header as spelled in the frame.
    by_upper = {}
    for name in df.columns:
        by_upper[name.upper()] = name

    mapping = {}
    for canonical, candidates in variants.items():
        hit = next((cand for cand in candidates if cand in by_upper), None)
        if hit is not None:
            mapping[canonical] = by_upper[hit]
    return mapping
def _load_file(
    csv_path: Path,
    year: int,
    variants: dict,
    reporting_standard: str,
    conn: duckdb.DuckDBPyConnection,
    unitid_filter: int | None,
) -> int:
    """Load a single finance CSV into ``raw_ipeds_finance``.

    Args:
        csv_path: CSV file to read.
        year: Value written to the ``year`` column of every row.
        variants: Canonical-name -> candidate-header mapping passed to
            ``_resolve_columns``.
        reporting_standard: Value written to the ``reporting_standard`` column.
        conn: DuckDB connection that owns ``raw_ipeds_finance``.
        unitid_filter: When not ``None``, only rows whose ``unitid`` equals
            this value are inserted.

    Returns:
        Number of rows inserted. ``0`` when the file has no ``unitid`` column
        or no rows remain after filtering (nothing is inserted in either case).
    """
    # infer_schema_length=0 makes polars read every column as a string;
    # numeric conversion is done explicitly below.
    df = pl.read_csv(csv_path, infer_schema_length=0, encoding="utf8-lossy")
    col_map = _resolve_columns(df, variants)
    if "unitid" not in col_map:
        return 0

    # Rename the resolved source columns to their canonical names and attach
    # the constant year / reporting-standard columns.
    result = pl.DataFrame(
        {canonical: df[actual] for canonical, actual in col_map.items()}
    ).with_columns(
        pl.lit(year).alias("year"),
        pl.lit(reporting_standard).alias("reporting_standard"),
    )

    # Canonical columns the file did not provide become all-null; every other
    # column except reporting_standard is cast to Int64, with unparseable
    # values turned into nulls (strict=False).
    present = set(result.columns)
    exprs = []
    for col in CANONICAL_COLUMNS:
        if col not in present:
            exprs.append(pl.lit(None).alias(col))
        elif col != "reporting_standard":
            exprs.append(pl.col(col).cast(pl.Int64, strict=False))
    result = result.with_columns(exprs)

    if unitid_filter is not None:
        result = result.filter(pl.col("unitid") == unitid_filter)
    if result.height == 0:
        return 0

    # The INSERT is positional, so fix the column order first.
    result = result.select(CANONICAL_COLUMNS)
    conn.register("_tmp_finance", result.to_arrow())
    try:
        conn.execute("INSERT INTO raw_ipeds_finance SELECT * FROM _tmp_finance")
    finally:
        # Always drop the temporary view, even if the INSERT fails, so a
        # failed load does not leave it registered on the connection.
        conn.unregister("_tmp_finance")
    return result.height
def load_finance(
    conn: duckdb.DuckDBPyConnection,
    year_range: range,
    unitid_filter: int | None = UD_UNITID,
) -> int:
    """Load IPEDS finance data into ``raw_ipeds_finance``.

    For every year, both the F1A (GASB) file under ``finance/<year>`` and the
    F2 (FASB) file under ``finance_f2/<year>`` are tried, since some public
    institutions like UD report under FASB.

    Existing rows for a year are deleted before loading, including when no
    CSV is found for that year.

    Args:
        conn: DuckDB connection that owns ``raw_ipeds_finance``.
        year_range: Years to (re)load.
        unitid_filter: Restrict loaded rows to this UNITID; ``None`` loads
            every institution.

    Returns:
        Total number of rows inserted across all years and both files.
    """
    # (sub-directory of IPEDS_DATA_DIR, column variants, reporting standard)
    sources = (
        ("finance", F1A_COLUMN_VARIANTS, "GASB"),
        ("finance_f2", F2_COLUMN_VARIANTS, "FASB"),
    )

    total = 0
    for year in year_range:
        conn.execute("DELETE FROM raw_ipeds_finance WHERE year = ?", [year])

        found_any = False
        for subdir, variants, standard in sources:
            csv_path = _find_csv(config.IPEDS_DATA_DIR / subdir / str(year))
            if csv_path is None:
                continue
            found_any = True
            total += _load_file(
                csv_path, year, variants, standard, conn, unitid_filter
            )

        # Report per year. Gating this on the running total would silence the
        # message for every missing year after the first successful load.
        if not found_any:
            print(f" No finance CSV found for {year}, skipping")
    return total