Compensation, endowment tweaks. Added About.

This commit is contained in:
emfurst 2026-03-31 08:03:58 -04:00
commit 13fb4b8418
13 changed files with 914 additions and 17 deletions

View file

@ -96,6 +96,116 @@ def query_admin_faculty_ratio(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
""", [UD_UNITID]).pl()
def query_aggregate_comp(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Summarize the ten largest Schedule J pay packages for each tax year.

    Rows of ``raw_990_schedule_j`` with positive ``total_compensation`` are
    ranked within their tax year, highest total first; only ranks 1-10 feed
    the aggregates.

    Returns:
        One row per tax year, ascending, with ``headcount`` (rows kept, at
        most 10), ``total_comp``, ``avg_comp`` (rounded to a whole number)
        and the summed components ``total_base``, ``total_bonus``,
        ``total_deferred``, ``total_benefits`` and ``total_other``.
    """
    top_ten_sql = """
        WITH ranked AS (
            SELECT j.tax_year, j.total_compensation,
                   j.base_compensation, j.bonus_compensation,
                   j.deferred_compensation, j.nontaxable_benefits,
                   j.other_compensation,
                   ROW_NUMBER() OVER (PARTITION BY j.tax_year
                                      ORDER BY j.total_compensation DESC) AS rn
            FROM raw_990_schedule_j j
            WHERE j.total_compensation > 0
        )
        SELECT tax_year,
               COUNT(*) AS headcount,
               SUM(total_compensation) AS total_comp,
               ROUND(AVG(total_compensation), 0) AS avg_comp,
               SUM(base_compensation) AS total_base,
               SUM(bonus_compensation) AS total_bonus,
               SUM(deferred_compensation) AS total_deferred,
               SUM(nontaxable_benefits) AS total_benefits,
               SUM(other_compensation) AS total_other
        FROM ranked
        WHERE rn <= 10
        GROUP BY tax_year
        ORDER BY tax_year
    """
    result = conn.execute(top_ten_sql)
    return result.pl()
def query_aggregate_comp_cagr(conn: duckdb.DuckDBPyConnection) -> dict | None:
    """Compute the CAGR of top-10 aggregate Schedule J compensation.

    Takes the yearly ``total_comp`` series from ``query_aggregate_comp`` and
    compares the first and last rows of its most recent window (at most the
    last five rows of available data).

    Returns:
        A dict with ``cagr_pct`` (percent, rounded to one decimal),
        ``start_year``, ``end_year``, ``start_comp`` and ``end_comp`` (both
        truncated to ints), or ``None`` when fewer than two years are
        available, the window spans no elapsed years, or the starting total
        is not positive.
    """
    df = query_aggregate_comp(conn)
    if df.height < 2:
        return None

    # tail() clamps to the frame height, so no explicit min() is needed.
    window = df.tail(5)
    start_year = window["tax_year"][0]
    end_year = window["tax_year"][-1]
    start_comp = float(window["total_comp"][0])
    end_comp = float(window["total_comp"][-1])

    # Annualize over elapsed years, not row count, so a gap in the filings
    # does not distort the rate.
    n_years = end_year - start_year
    if n_years <= 0 or start_comp <= 0:
        return None

    cagr = ((end_comp / start_comp) ** (1.0 / n_years) - 1) * 100
    return {
        "cagr_pct": round(cagr, 1),
        "start_year": start_year,
        "end_year": end_year,
        # Previously reported end_comp here, hiding the real starting total.
        "start_comp": int(start_comp),
        "end_comp": int(end_comp),
    }
def query_comp_cagr(conn: duckdb.DuckDBPyConnection) -> dict | None:
    """Compute the compound annual growth rate of President compensation.

    Each Schedule J title is mapped through ``normalize_title``; rows whose
    normalized role is ``"PRESIDENT"`` are reduced to the highest
    ``total_compensation`` per tax year, and the first and last of those
    years are compared.

    Returns:
        A dict with ``cagr_pct`` (percent, rounded to one decimal),
        ``start_year``, ``end_year``, ``start_comp`` and ``end_comp``, or
        ``None`` when there are no rows, fewer than two President years, no
        elapsed years between them, or a non-positive starting value.
    """
    schedule_j_sql = """
        SELECT j.tax_year, j.title, j.total_compensation
        FROM raw_990_schedule_j j
        WHERE j.total_compensation > 0
        ORDER BY j.tax_year
    """
    rows = conn.execute(schedule_j_sql).pl()
    if rows.height == 0:
        return None

    role_expr = pl.col("title").map_elements(
        normalize_title, return_dtype=pl.Utf8
    )
    rows = rows.with_columns(role_expr.alias("role"))

    presidents = rows.filter(pl.col("role") == "PRESIDENT")
    yearly = presidents.group_by("tax_year").agg(
        pl.col("total_compensation").max().alias("top_comp")
    )
    yearly = yearly.sort("tax_year")
    if yearly.height < 2:
        return None

    years = yearly["tax_year"]
    comps = yearly["top_comp"]
    first_year, last_year = years[0], years[-1]
    first_comp, last_comp = comps[0], comps[-1]

    span = last_year - first_year
    if span <= 0 or first_comp <= 0:
        return None

    growth_pct = ((last_comp / first_comp) ** (1.0 / span) - 1) * 100
    return {
        "cagr_pct": round(growth_pct, 1),
        "start_year": first_year,
        "end_year": last_year,
        "start_comp": first_comp,
        "end_comp": last_comp,
    }
def query_top_earners(
conn: duckdb.DuckDBPyConnection, year: int | None = None
) -> pl.DataFrame:
@ -162,11 +272,23 @@ def query_comp_by_role(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
def query_comp_vs_cpi(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
"""Compensation growth vs CPI growth, indexed to first available year = 100."""
"""Compensation growth vs CPI growth, indexed to first available year = 100.
Includes top earner, top-10 aggregate, and CPI-U.
"""
return conn.execute("""
WITH yearly_max_comp AS (
SELECT tax_year, MAX(total_compensation) AS top_comp
WITH ranked AS (
SELECT tax_year, total_compensation,
ROW_NUMBER() OVER (PARTITION BY tax_year
ORDER BY total_compensation DESC) AS rn
FROM raw_990_schedule_j
WHERE total_compensation > 0
),
yearly_comp AS (
SELECT tax_year,
MAX(total_compensation) AS top_comp,
SUM(CASE WHEN rn <= 10 THEN total_compensation END) AS agg_comp
FROM ranked
GROUP BY tax_year
),
annual_cpi AS (
@ -174,20 +296,24 @@ def query_comp_vs_cpi(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
FROM raw_cpi_u GROUP BY year
),
base AS (
SELECT c.top_comp AS base_comp, ac.avg_cpi AS base_cpi
FROM yearly_max_comp c
SELECT c.top_comp AS base_top, c.agg_comp AS base_agg,
ac.avg_cpi AS base_cpi
FROM yearly_comp c
JOIN annual_cpi ac ON ac.year = c.tax_year
ORDER BY c.tax_year LIMIT 1
)
SELECT
c.tax_year AS year,
c.top_comp,
c.agg_comp,
ac.avg_cpi,
ROUND(c.top_comp * 100.0 / NULLIF((SELECT base_comp FROM base), 0), 1)
ROUND(c.top_comp * 100.0 / NULLIF((SELECT base_top FROM base), 0), 1)
AS comp_index,
ROUND(c.agg_comp * 100.0 / NULLIF((SELECT base_agg FROM base), 0), 1)
AS agg_index,
ROUND(ac.avg_cpi * 100.0 / NULLIF((SELECT base_cpi FROM base), 0), 1)
AS cpi_index
FROM yearly_max_comp c
FROM yearly_comp c
JOIN annual_cpi ac ON ac.year = c.tax_year
ORDER BY year
""").pl()
@ -249,6 +375,166 @@ def query_growth_index(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
""", [UD_UNITID, UD_UNITID]).pl()
def query_endowment(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return the yearly endowment figures for the ``UD_UNITID`` institution.

    Returns:
        One row per year, ascending, with beginning- and end-of-year
        endowment values, new gifts, net investment return, other changes
        and long-term investments from ``raw_ipeds_endowment``.
    """
    endowment_sql = """
        SELECT year, endowment_boy, endowment_eoy, new_gifts,
               net_investment_return, other_changes, long_term_investments
        FROM raw_ipeds_endowment
        WHERE unitid = ?
        ORDER BY year
    """
    params = [UD_UNITID]
    return conn.execute(endowment_sql, params).pl()
def query_endowment_per_student(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return end-of-year endowment divided by total enrollment, per year.

    Endowment and enrollment rows are matched on ``unitid`` and ``year``, so
    only years present in both tables appear. A zero enrollment yields a
    null ``endowment_per_student`` (via ``NULLIF``) instead of a division
    error.

    Returns:
        One row per year, ascending, with ``endowment_eoy``,
        ``total_enrollment`` and ``endowment_per_student`` rounded to a
        whole number.
    """
    per_student_sql = """
        SELECT e.year, e.endowment_eoy, en.total_enrollment,
               ROUND(e.endowment_eoy * 1.0 / NULLIF(en.total_enrollment, 0), 0)
                   AS endowment_per_student
        FROM raw_ipeds_endowment e
        JOIN raw_ipeds_enrollment en ON en.unitid = e.unitid AND en.year = e.year
        WHERE e.unitid = ?
        ORDER BY e.year
    """
    params = [UD_UNITID]
    return conn.execute(per_student_sql, params).pl()
def query_cio_vs_endowment(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Index Chief Investment Officer pay against end-of-year endowment.

    Schedule J titles are mapped through ``normalize_title``; rows with role
    ``"CHIEF_INVESTMENT_OFFICER"`` are reduced to the highest
    ``total_compensation`` per tax year and inner-joined to the endowment
    series for ``UD_UNITID``. Both series are then scaled so the first
    joined year equals 100.

    Returns:
        An empty frame when there are no Schedule J rows or no CIO rows.
        The joined frame without index columns when fewer than two years
        match. Otherwise the joined frame with ``cio_index`` and
        ``endowment_index`` (one decimal) added.
    """
    comp_rows = conn.execute("""
        SELECT j.tax_year, j.title, j.total_compensation
        FROM raw_990_schedule_j j
        WHERE j.total_compensation > 0
    """).pl()
    if comp_rows.height == 0:
        return pl.DataFrame()

    role_expr = pl.col("title").map_elements(
        normalize_title, return_dtype=pl.Utf8
    )
    comp_rows = comp_rows.with_columns(role_expr.alias("role"))

    cio_rows = comp_rows.filter(pl.col("role") == "CHIEF_INVESTMENT_OFFICER")
    cio_by_year = cio_rows.group_by("tax_year").agg(
        pl.col("total_compensation").max().alias("cio_comp")
    )
    cio_by_year = cio_by_year.sort("tax_year")
    if cio_by_year.height == 0:
        return pl.DataFrame()

    endowment = conn.execute("""
        SELECT year, endowment_eoy
        FROM raw_ipeds_endowment
        WHERE unitid = ?
        ORDER BY year
    """, [UD_UNITID]).pl()

    joined = cio_by_year.join(
        endowment, left_on="tax_year", right_on="year", how="inner"
    )
    joined = joined.drop_nulls(subset=["cio_comp", "endowment_eoy"])
    joined = joined.sort("tax_year")
    if joined.height < 2:
        return joined

    # The earliest joined year is the baseline for both indexes.
    first_comp = float(joined["cio_comp"][0])
    first_endow = float(joined["endowment_eoy"][0])

    cio_index = (
        pl.col("cio_comp").cast(pl.Float64) * 100.0 / first_comp
    ).round(1).alias("cio_index")
    endowment_index = (
        pl.col("endowment_eoy").cast(pl.Float64) * 100.0 / first_endow
    ).round(1).alias("endowment_index")
    return joined.with_columns(cio_index, endowment_index)
def query_philanthropy(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return yearly private gifts for ``UD_UNITID``, nominal and CPI-adjusted.

    The query is prefixed with the shared ``_CPI_CTE`` fragment, which is
    assumed to define the ``annual_cpi`` and ``latest_cpi`` relations used
    below (TODO confirm against its definition). CPI is left-joined, so a
    year without CPI data keeps its row with a null adjusted value.

    Returns:
        One row per year, ascending, with ``total_private_gifts``,
        ``endowment_gifts`` and ``gifts_cpi_adjusted`` (gifts scaled by
        latest CPI over that year's CPI, rounded to a whole number).
    """
    gifts_sql = f"""
        {_CPI_CTE}
        SELECT e.year, e.total_private_gifts, e.new_gifts AS endowment_gifts,
               ROUND(e.total_private_gifts * (SELECT avg_cpi FROM latest_cpi)
                     / ac.avg_cpi, 0) AS gifts_cpi_adjusted
        FROM raw_ipeds_endowment e
        LEFT JOIN annual_cpi ac ON ac.year = e.year
        WHERE e.unitid = ?
        ORDER BY e.year
    """
    params = [UD_UNITID]
    return conn.execute(gifts_sql, params).pl()
def query_comp_vs_philanthropy(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Index President and VP Advancement pay against private gifts.

    Schedule J titles are mapped through ``normalize_title``; for the roles
    ``"PRESIDENT"`` and ``"VP_ADVANCEMENT"`` the highest
    ``total_compensation`` per tax year is kept. The two series are combined
    on ``tax_year`` (a year with only one role keeps a null for the other)
    and inner-joined to ``total_private_gifts`` for ``UD_UNITID``.

    Each series is indexed so its first non-null value in the merged frame
    equals 100. A role with no non-null value in the merged years gets an
    all-null index column instead of raising.

    Returns:
        An empty frame when there are no Schedule J rows or neither role is
        present. The merged frame without index columns when fewer than two
        years match. Otherwise the merged frame with ``president_index``,
        ``vp_adv_index`` and ``gifts_index`` (one decimal) added.
    """
    raw = conn.execute("""
        SELECT j.tax_year, j.title, j.total_compensation
        FROM raw_990_schedule_j j
        WHERE j.total_compensation > 0
    """).pl()
    if raw.height == 0:
        return pl.DataFrame()

    raw = raw.with_columns(
        pl.col("title").map_elements(
            normalize_title, return_dtype=pl.Utf8
        ).alias("role")
    )

    # Max comp per role per year for President and VP Advancement.
    roles = raw.filter(pl.col("role").is_in(["PRESIDENT", "VP_ADVANCEMENT"]))
    if roles.height == 0:
        return pl.DataFrame()

    pivoted = (
        roles.group_by(["tax_year", "role"])
        .agg(pl.col("total_compensation").max().alias("comp"))
        .sort("tax_year")
    )
    pres = (
        pivoted.filter(pl.col("role") == "PRESIDENT")
        .select(pl.col("tax_year"), pl.col("comp").alias("president_comp"))
    )
    vp = (
        pivoted.filter(pl.col("role") == "VP_ADVANCEMENT")
        .select(pl.col("tax_year"), pl.col("comp").alias("vp_adv_comp"))
    )

    gifts = conn.execute("""
        SELECT year, total_private_gifts
        FROM raw_ipeds_endowment
        WHERE unitid = ?
        ORDER BY year
    """, [UD_UNITID]).pl()

    # Join all three on year.
    # NOTE(review): newer Polars releases spell this join as
    # how="full", coalesce=True; kept as-is for the version pinned here.
    merged = (
        pres.join(vp, on="tax_year", how="outer_coalesce")
        .join(gifts, left_on="tax_year", right_on="year", how="inner")
        .drop_nulls(subset=["total_private_gifts"])
        .sort("tax_year")
    )
    if merged.height < 2:
        return merged

    def _index_expr(column: str, alias: str) -> pl.Expr:
        """Scale *column* so its first non-null value in ``merged`` is 100."""
        non_null = merged[column].drop_nulls()
        if non_null.len() == 0:
            # Only one role may have data in the merged years; indexing [0]
            # on the empty series would raise IndexError.
            return pl.lit(None, dtype=pl.Float64).alias(alias)
        base = float(non_null[0])
        return (
            pl.col(column).cast(pl.Float64) * 100.0 / base
        ).round(1).alias(alias)

    return merged.with_columns(
        _index_expr("president_comp", "president_index"),
        _index_expr("vp_adv_comp", "vp_adv_index"),
        _index_expr("total_private_gifts", "gifts_index"),
    )
def query_admin_headcount(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
"""All scraped admin headcount entries."""
return conn.execute("""