Initial build out
This commit is contained in:
parent
f037c50736
commit
29215e2bd2
40 changed files with 2622 additions and 0 deletions
142
src/admin_analytics/ipeds/finance.py
Normal file
142
src/admin_analytics/ipeds/finance.py
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
from pathlib import Path
|
||||
|
||||
import duckdb
|
||||
import polars as pl
|
||||
|
||||
from admin_analytics import config
|
||||
from admin_analytics.config import UD_UNITID
|
||||
|
||||
# Source columns of the F1A (GASB/public) finance file, keyed by canonical
# column name. Each list is ordered by priority: the first candidate that is
# present in a file is the one that gets used.
F1A_COLUMN_VARIANTS = dict(
    unitid=["UNITID"],
    total_expenses=["F1C191", "F1D02"],
    instruction_expenses=["F1C011"],
    research_expenses=["F1C021"],
    public_service_expenses=["F1C031"],
    academic_support_expenses=["F1C051"],
    student_services_expenses=["F1C061"],
    institutional_support_expenses=["F1C071"],
    auxiliary_expenses=["F1C111"],
    hospital_expenses=["F1C121"],
    other_expenses=["F1C141"],
    salaries_wages=["F1C192"],
    benefits=["F1C193"],
)
|
||||
|
||||
# F2 (FASB/private-style) column mappings — UD reports here despite being
# public. Same shape as the F1A mapping: canonical name -> candidate source
# columns in priority order.
F2_COLUMN_VARIANTS = {
    "unitid": ["UNITID"],
    "total_expenses": ["F2E131"],
    "instruction_expenses": ["F2E011"],
    "research_expenses": ["F2E021"],
    "public_service_expenses": ["F2E031"],
    "academic_support_expenses": ["F2E041"],
    "student_services_expenses": ["F2E051"],
    "institutional_support_expenses": ["F2E061"],
    "auxiliary_expenses": ["F2E071"],
    # NOTE(review): in the F2 layout the 08 slot (F2E081) is "Net grant aid to
    # students"; hospital services is the 09 slot. Verify against the IPEDS
    # F2 data dictionary for the survey years being loaded.
    "hospital_expenses": ["F2E091"],
    "other_expenses": ["F2E121"],
    "salaries_wages": ["F2E132"],
    "benefits": ["F2E133"],
}
|
||||
|
||||
# Column order of the frame handed to DuckDB. The insert into
# raw_ipeds_finance is positional (SELECT *), so this order is significant.
CANONICAL_COLUMNS = [
    "unitid",
    "year",
    "reporting_standard",
    "total_expenses",
    "instruction_expenses",
    "research_expenses",
    "public_service_expenses",
    "academic_support_expenses",
    "student_services_expenses",
    "institutional_support_expenses",
    "auxiliary_expenses",
    "hospital_expenses",
    "other_expenses",
    "salaries_wages",
    "benefits",
]
|
||||
|
||||
|
||||
def _find_csv(component_dir: Path) -> Path | None:
|
||||
csvs = [f for f in component_dir.glob("*.csv") if "dict" not in f.stem.lower()]
|
||||
return csvs[0] if csvs else None
|
||||
|
||||
|
||||
def _resolve_columns(df: pl.DataFrame, variants: dict) -> dict[str, str]:
    """Map each canonical name to the first candidate column present in *df*.

    Args:
        df: Frame whose ``columns`` are searched.
        variants: Canonical name -> candidate source column names, in
            priority order.

    Returns:
        Canonical name -> the column name exactly as it appears in *df*.
        Canonical names with no matching candidate are omitted.

    Matching is case-insensitive. Headers that carry stray surrounding
    whitespace or a byte-order mark are also matched, so a badly formatted
    header does not silently drop a column.
    """
    # Fallback keys: header with BOM/whitespace removed, upper-cased.
    lookup = {c.replace("\ufeff", "").strip().upper(): c for c in df.columns}
    # Plain upper-cased headers take precedence, so every header that matched
    # before the fallback existed still resolves to the same column.
    lookup.update({c.upper(): c for c in df.columns})

    resolved: dict[str, str] = {}
    for canonical, candidates in variants.items():
        for candidate in candidates:
            actual = lookup.get(candidate.strip().upper())
            if actual is not None:
                resolved[canonical] = actual
                break  # first match wins
    return resolved
|
||||
|
||||
|
||||
def _load_file(
    csv_path: Path,
    year: int,
    variants: dict,
    reporting_standard: str,
    conn: duckdb.DuckDBPyConnection,
    unitid_filter: int | None,
) -> int:
    """Load a single finance CSV into raw_ipeds_finance.

    Args:
        csv_path: CSV file to read.
        year: Value stored in the ``year`` column of every inserted row.
        variants: Canonical name -> candidate source columns (see
            ``_resolve_columns``).
        reporting_standard: Label stored in ``reporting_standard``.
        conn: DuckDB connection that owns ``raw_ipeds_finance``.
        unitid_filter: Keep only rows with this unitid; ``None`` keeps all.

    Returns:
        Number of rows inserted. 0 when the file has no unitid column or no
        row survives the filter.
    """
    # infer_schema_length=0 makes polars read every column as text; numeric
    # conversion is done explicitly below.
    df = pl.read_csv(csv_path, infer_schema_length=0, encoding="utf8-lossy")
    col_map = _resolve_columns(df, variants)

    # Without an institution id the rows cannot be attributed to anyone.
    if "unitid" not in col_map:
        return 0

    # Rename the resolved source columns to canonical names and tag each row.
    result = pl.DataFrame(
        {canonical: df[actual] for canonical, actual in col_map.items()}
    ).with_columns(
        pl.lit(year).alias("year"),
        pl.lit(reporting_standard).alias("reporting_standard"),
    )

    # Every canonical column except the text label becomes Int64. Values that
    # do not parse turn into null (strict=False); columns missing from this
    # file are added as typed nulls so the frame always has the full schema.
    present = set(result.columns)
    result = result.with_columns(
        [
            pl.col(name).cast(pl.Int64, strict=False)
            if name in present
            else pl.lit(None, dtype=pl.Int64).alias(name)
            for name in CANONICAL_COLUMNS
            if name != "reporting_standard"
        ]
    )

    if unitid_filter is not None:
        result = result.filter(pl.col("unitid") == unitid_filter)

    if result.height == 0:
        return 0

    # The insert is positional, so fix the column order first.
    result = result.select(CANONICAL_COLUMNS)
    conn.register("_tmp_finance", result.to_arrow())
    try:
        conn.execute("INSERT INTO raw_ipeds_finance SELECT * FROM _tmp_finance")
    finally:
        # Always drop the temporary relation, even when the insert fails.
        conn.unregister("_tmp_finance")
    return result.height
|
||||
|
||||
|
||||
def load_finance(
    conn: duckdb.DuckDBPyConnection,
    year_range: range,
    unitid_filter: int | None = UD_UNITID,
) -> int:
    """Load IPEDS finance data into raw_ipeds_finance.

    Tries both F1A (GASB) and F2 (FASB) files since some public institutions
    like UD report under FASB.

    Args:
        conn: DuckDB connection that owns ``raw_ipeds_finance``.
        year_range: Years to (re)load.
        unitid_filter: Keep only rows with this unitid; ``None`` loads every
            institution.

    Returns:
        Total number of rows inserted across all years.
    """
    # (sub-directory under IPEDS_DATA_DIR, column mapping, reporting standard)
    sources = (
        ("finance", F1A_COLUMN_VARIANTS, "GASB"),
        ("finance_f2", F2_COLUMN_VARIANTS, "FASB"),
    )

    total = 0
    for year in year_range:
        # Clear the year first so re-running the load does not duplicate rows.
        # NOTE: this removes every row for the year, not only rows matching
        # unitid_filter, and it runs even when no CSV exists for the year.
        conn.execute("DELETE FROM raw_ipeds_finance WHERE year = ?", [year])

        found_csv = False
        for subdir, variants, standard in sources:
            csv_path = _find_csv(config.IPEDS_DATA_DIR / subdir / str(year))
            if csv_path is None:
                continue
            found_csv = True
            total += _load_file(
                csv_path, year, variants, standard, conn, unitid_filter
            )

        # Decided per year: a running total would hide missing years once any
        # earlier year had loaded rows.
        if not found_csv:
            print(f" No finance CSV found for {year}, skipping")

    return total
|
||||
Loading…
Add table
Add a link
Reference in a new issue