Compensation, endowment tweaks. Added About.
This commit is contained in:
parent
a41f78545b
commit
13fb4b8418
13 changed files with 914 additions and 17 deletions
|
|
@ -40,6 +40,25 @@ F2_COLUMN_VARIANTS = {
|
|||
"benefits": ["F2E133"],
|
||||
}
|
||||
|
||||
# F2 endowment / philanthropy fields.
# Maps each canonical column name to the candidate source-CSV headers that
# may carry it; _resolve_columns picks the first candidate present in a file.
F2_ENDOWMENT_VARIANTS = {
    # Institution identifier (required by load_endowment).
    "unitid": ["UNITID"],
    # F2H* codes.
    "endowment_boy": ["F2H01"],
    "endowment_eoy": ["F2H02"],
    "new_gifts": ["F2H03A"],
    "net_investment_return": ["F2H03B"],
    "other_changes": ["F2H03D"],
    # F2D* codes.
    "total_private_gifts": ["F2D08"],
    "total_investment_return": ["F2D10"],
    # F2A* codes.
    "long_term_investments": ["F2A01"],
}
|
||||
|
||||
# Columns, in order, that load_endowment selects before inserting into
# raw_ipeds_endowment. The insert uses SELECT *, so this order matters.
ENDOWMENT_COLUMNS = [
    "unitid",
    "year",
    "endowment_boy",
    "endowment_eoy",
    "new_gifts",
    "net_investment_return",
    "other_changes",
    "total_private_gifts",
    "total_investment_return",
    "long_term_investments",
]
|
||||
|
||||
CANONICAL_COLUMNS = [
|
||||
"unitid", "year", "reporting_standard", "total_expenses",
|
||||
"instruction_expenses", "research_expenses", "public_service_expenses",
|
||||
|
|
@ -56,7 +75,7 @@ def _find_csv(component_dir: Path) -> Path | None:
|
|||
|
||||
def _resolve_columns(df: pl.DataFrame, variants: dict) -> dict[str, str]:
|
||||
"""For each canonical name, find the first matching column."""
|
||||
upper_cols = {c.upper(): c for c in df.columns}
|
||||
upper_cols = {c.strip().upper(): c for c in df.columns}
|
||||
resolved = {}
|
||||
for canonical, candidates in variants.items():
|
||||
for var in candidates:
|
||||
|
|
@ -140,3 +159,49 @@ def load_finance(
|
|||
print(f" No finance CSV found for {year}, skipping")
|
||||
|
||||
return total
|
||||
|
||||
|
||||
def load_endowment(
    conn: duckdb.DuckDBPyConnection,
    year_range: range,
    unitid_filter: int | None = UD_UNITID,
) -> int:
    """Load IPEDS F2 endowment and philanthropy data into raw_ipeds_endowment.

    For each year, the CSV under ``IPEDS_DATA_DIR/finance_f2/<year>`` is read,
    the columns named in ``F2_ENDOWMENT_VARIANTS`` are renamed to their
    canonical names, and the rows replace any existing rows for that year.

    A year is skipped, leaving its existing rows untouched, when no CSV is
    found, when the CSV has no UNITID column, or when no rows survive the
    ``unitid_filter``.

    Args:
        conn: Open DuckDB connection; ``raw_ipeds_endowment`` must exist.
        year_range: Years to load.
        unitid_filter: Keep only rows with this UNITID; ``None`` keeps all.

    Returns:
        Total number of rows inserted across all years.
    """
    total = 0
    for year in year_range:
        f2_dir = config.IPEDS_DATA_DIR / "finance_f2" / str(year)
        csv_path = _find_csv(f2_dir)
        if csv_path is None:
            continue

        # infer_schema_length=0 makes polars read every column as a string;
        # numeric conversion is done explicitly below.
        df = pl.read_csv(csv_path, infer_schema_length=0, encoding="utf8-lossy")
        col_map = _resolve_columns(df, F2_ENDOWMENT_VARIANTS)
        if "unitid" not in col_map:
            continue

        result = pl.DataFrame(
            {canonical: df[actual] for canonical, actual in col_map.items()}
        ).with_columns(pl.lit(year).alias("year"))

        # Present columns are cast leniently (unparseable values become null).
        # Absent columns are added as typed Int64 nulls rather than Null-dtype
        # columns, so the Arrow schema handed to DuckDB is the same every year.
        present = set(result.columns)
        result = result.with_columns(
            [
                pl.col(col).cast(pl.Int64, strict=False)
                if col in present
                else pl.lit(None, dtype=pl.Int64).alias(col)
                for col in ENDOWMENT_COLUMNS
                if col != "year"
            ]
        )

        if unitid_filter is not None:
            result = result.filter(pl.col("unitid") == unitid_filter)

        if result.height == 0:
            continue

        result = result.select(ENDOWMENT_COLUMNS)

        conn.register("_tmp_endow", result.to_arrow())
        try:
            # Replace, not append, so re-running a year does not duplicate rows.
            conn.execute("DELETE FROM raw_ipeds_endowment WHERE year = ?", [year])
            conn.execute("INSERT INTO raw_ipeds_endowment SELECT * FROM _tmp_endow")
        finally:
            # Always drop the temporary view, even if the insert fails.
            conn.unregister("_tmp_endow")
        total += result.height

    return total
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue