554 lines
18 KiB
Python
554 lines
18 KiB
Python
"""Dashboard query layer — all DuckDB queries returning polars DataFrames."""
|
|
|
|
from typing import Any
|
|
|
|
import duckdb
|
|
import polars as pl
|
|
|
|
from admin_analytics.config import UD_UNITID
|
|
from admin_analytics.irs990.titles import normalize_title
|
|
|
|
# Shared CTE prefix for CPI adjustment. It defines two CTEs:
#   annual_cpi - one row per year holding the mean of raw_cpi_u.value (avg_cpi)
#   latest_cpi - the avg_cpi of the most recent year present in annual_cpi
# The string opens the WITH clause and stops right after latest_cpi, so it is
# meant to be interpolated immediately before a query's main SELECT.
_CPI_CTE = """
    WITH annual_cpi AS (
        SELECT year, AVG(value) AS avg_cpi
        FROM raw_cpi_u
        GROUP BY year
    ),
    latest_cpi AS (
        SELECT avg_cpi FROM annual_cpi
        WHERE year = (SELECT MAX(year) FROM annual_cpi)
    )
"""
|
|
|
|
|
|
def query_admin_cost_ratio(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return the yearly institutional-support share of total expenses.

    One row per finance year for ``UD_UNITID``: the nominal
    institutional-support and total expenses, their ratio as a percentage
    (``admin_cost_pct``), and both amounts rescaled by the latest-year
    average CPI over that year's average CPI. Because CPI is LEFT JOINed,
    years without CPI data still appear, with NULL adjusted values.
    """
    sql = f"""
        {_CPI_CTE}
        SELECT
            f.year,
            f.institutional_support_expenses,
            f.total_expenses,
            ROUND(f.institutional_support_expenses * 100.0
                  / NULLIF(f.total_expenses, 0), 2) AS admin_cost_pct,
            ROUND(f.institutional_support_expenses
                  * (SELECT avg_cpi FROM latest_cpi) / ac.avg_cpi, 0)
                AS inst_support_cpi_adjusted,
            ROUND(f.total_expenses
                  * (SELECT avg_cpi FROM latest_cpi) / ac.avg_cpi, 0)
                AS total_expenses_cpi_adjusted
        FROM raw_ipeds_finance f
        LEFT JOIN annual_cpi ac ON ac.year = f.year
        WHERE f.unitid = ?
        ORDER BY f.year
    """
    result = conn.execute(sql, [UD_UNITID])
    return result.pl()
|
|
|
|
|
|
def query_expense_breakdown(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return per-year expense amounts split by functional category.

    Reads the nine ``*_expenses`` columns of ``raw_ipeds_finance`` for
    ``UD_UNITID``, ordered by year.
    """
    sql = """
        SELECT year,
               instruction_expenses, research_expenses, public_service_expenses,
               academic_support_expenses, student_services_expenses,
               institutional_support_expenses, auxiliary_expenses,
               hospital_expenses, other_expenses
        FROM raw_ipeds_finance
        WHERE unitid = ?
        ORDER BY year
    """
    rows = conn.execute(sql, [UD_UNITID])
    return rows.pl()
|
|
|
|
|
|
def query_admin_per_student(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return institutional-support spending per enrolled student by year.

    Produces both the nominal figure (``admin_per_student``) and a
    CPI-rescaled figure (``admin_per_student_cpi``) for ``UD_UNITID``.
    Enrollment is inner-joined, so only years present in both the finance
    and enrollment tables are returned; zero enrollment yields NULL rather
    than a division error.
    """
    sql = f"""
        {_CPI_CTE}
        SELECT
            f.year,
            f.institutional_support_expenses,
            e.total_enrollment,
            ROUND(f.institutional_support_expenses * 1.0
                  / NULLIF(e.total_enrollment, 0), 0) AS admin_per_student,
            ROUND(
                (f.institutional_support_expenses
                 * (SELECT avg_cpi FROM latest_cpi) / ac.avg_cpi)
                / NULLIF(e.total_enrollment, 0), 0
            ) AS admin_per_student_cpi
        FROM raw_ipeds_finance f
        JOIN raw_ipeds_enrollment e ON e.unitid = f.unitid AND e.year = f.year
        LEFT JOIN annual_cpi ac ON ac.year = f.year
        WHERE f.unitid = ?
        ORDER BY f.year
    """
    result = conn.execute(sql, [UD_UNITID])
    return result.pl()
|
|
|
|
|
|
def query_admin_faculty_ratio(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return management headcount relative to faculty headcount by year.

    ``admin_faculty_ratio`` is ``management_total / faculty_total`` rounded
    to three decimals, NULL when the faculty total is zero.
    """
    sql = """
        SELECT year,
               management_total,
               faculty_total,
               ROUND(management_total * 1.0 / NULLIF(faculty_total, 0), 3)
                   AS admin_faculty_ratio
        FROM raw_ipeds_staff
        WHERE unitid = ?
        ORDER BY year
    """
    rows = conn.execute(sql, [UD_UNITID])
    return rows.pl()
|
|
|
|
|
|
def query_aggregate_comp(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Summarize the ten highest Schedule J compensation rows of each tax year.

    Rows with positive ``total_compensation`` are ranked within each tax
    year; the top ten are aggregated into a headcount, total and average
    compensation, and per-component totals (base, bonus, deferred,
    nontaxable benefits, other).
    """
    sql = """
        WITH ranked AS (
            SELECT j.tax_year, j.total_compensation,
                   j.base_compensation, j.bonus_compensation,
                   j.deferred_compensation, j.nontaxable_benefits,
                   j.other_compensation,
                   ROW_NUMBER() OVER (PARTITION BY j.tax_year
                                      ORDER BY j.total_compensation DESC) AS rn
            FROM raw_990_schedule_j j
            WHERE j.total_compensation > 0
        )
        SELECT tax_year,
               COUNT(*) AS headcount,
               SUM(total_compensation) AS total_comp,
               ROUND(AVG(total_compensation), 0) AS avg_comp,
               SUM(base_compensation) AS total_base,
               SUM(bonus_compensation) AS total_bonus,
               SUM(deferred_compensation) AS total_deferred,
               SUM(nontaxable_benefits) AS total_benefits,
               SUM(other_compensation) AS total_other
        FROM ranked
        WHERE rn <= 10
        GROUP BY tax_year
        ORDER BY tax_year
    """
    result = conn.execute(sql)
    return result.pl()
|
|
|
|
|
|
def query_aggregate_comp_cagr(conn: duckdb.DuckDBPyConnection) -> dict | None:
    """CAGR of aggregate Schedule J compensation over the last 5 years of data.

    Takes the last (up to) five yearly rows produced by
    ``query_aggregate_comp`` and compares the first and last of them.

    Returns:
        A dict with ``cagr_pct`` (percent, rounded to one decimal),
        ``start_year``, ``end_year``, ``start_comp`` and ``end_comp``
        (totals truncated to int), or ``None`` when fewer than two yearly
        rows exist, the year span is not positive, or the starting total is
        not positive.
    """
    df = query_aggregate_comp(conn)
    if df.height < 2:
        return None

    # Use last 5 years of available data; tail() returns every row when
    # fewer than five exist.
    df = df.tail(5)

    start_year = df["tax_year"][0]
    end_year = df["tax_year"][-1]
    start_comp = float(df["total_comp"][0])
    end_comp = float(df["total_comp"][-1])
    n_years = end_year - start_year

    # A non-positive span or base would make the growth formula undefined.
    if n_years <= 0 or start_comp <= 0:
        return None

    cagr = ((end_comp / start_comp) ** (1.0 / n_years) - 1) * 100
    return {
        "cagr_pct": round(cagr, 1),
        "start_year": start_year,
        "end_year": end_year,
        # Report the actual starting total (previously echoed end_comp).
        "start_comp": int(start_comp),
        "end_comp": int(end_comp),
    }
|
|
|
|
|
|
def query_comp_cagr(conn: duckdb.DuckDBPyConnection) -> dict | None:
    """Compute the compound annual growth rate of President compensation.

    Schedule J titles are mapped through ``normalize_title``; rows whose
    role is ``PRESIDENT`` are reduced to the maximum total compensation per
    tax year, and the first and last of those years are compared.

    Returns a dict with cagr_pct, start_year, end_year, start_comp and
    end_comp, or None when there are no rows, fewer than two President
    years, a non-positive year span, or a non-positive starting value.
    """
    rows = conn.execute("""
        SELECT j.tax_year, j.title, j.total_compensation
        FROM raw_990_schedule_j j
        WHERE j.total_compensation > 0
        ORDER BY j.tax_year
    """).pl()
    if rows.height == 0:
        return None

    role_expr = pl.col("title").map_elements(
        normalize_title, return_dtype=pl.Utf8
    ).alias("role")
    presidents = rows.with_columns(role_expr).filter(pl.col("role") == "PRESIDENT")
    yearly = presidents.group_by("tax_year").agg(
        pl.col("total_compensation").max().alias("top_comp")
    )
    yearly = yearly.sort("tax_year")
    if yearly.height < 2:
        return None

    years = yearly["tax_year"]
    comps = yearly["top_comp"]
    first_year, last_year = years[0], years[-1]
    first_comp, last_comp = comps[0], comps[-1]
    span = last_year - first_year
    if span <= 0 or first_comp <= 0:
        return None

    growth_pct = ((last_comp / first_comp) ** (1.0 / span) - 1) * 100
    return {
        "cagr_pct": round(growth_pct, 1),
        "start_year": first_year,
        "end_year": last_year,
        "start_comp": first_comp,
        "end_comp": last_comp,
    }
|
|
|
|
|
|
def query_top_earners(
    conn: duckdb.DuckDBPyConnection, year: int | None = None
) -> pl.DataFrame:
    """Return Schedule J earners with their filing organization.

    Only rows with positive total compensation are included, newest tax
    year first and highest compensation first within a year. When *year* is
    given, results are restricted to that tax year via a bound parameter.
    A ``canonical_role`` column (``normalize_title`` applied to ``title``)
    is added whenever the result is non-empty.
    """
    # Only a fixed SQL fragment is interpolated; the year value is bound.
    year_clause = "" if year is None else " AND j.tax_year = ?"
    where = "WHERE j.total_compensation > 0" + year_clause
    params: list[Any] = [] if year is None else [year]

    sql = f"""
        SELECT
            j.tax_year,
            j.person_name,
            j.title,
            j.base_compensation,
            j.bonus_compensation,
            j.other_compensation,
            j.deferred_compensation,
            j.nontaxable_benefits,
            j.total_compensation,
            f.organization_name
        FROM raw_990_schedule_j j
        JOIN raw_990_filing f ON f.object_id = j.object_id
        {where}
        ORDER BY j.tax_year DESC, j.total_compensation DESC
    """
    earners = conn.execute(sql, params).pl()
    if earners.height == 0:
        return earners

    canonical = pl.col("title").map_elements(
        normalize_title, return_dtype=pl.Utf8
    )
    return earners.with_columns(canonical.alias("canonical_role"))
|
|
|
|
|
|
def query_comp_by_role(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return the highest-paid person per canonical role for each tax year.

    Schedule J rows with positive total compensation (that also match a
    filing via ``object_id``) get a ``canonical_role`` from
    ``normalize_title``; within each (tax_year, canonical_role) group only
    the top-compensation row is kept. An empty query result is returned
    unchanged, without the ``canonical_role`` column.
    """
    sql = """
        SELECT j.tax_year, j.person_name, j.title, j.total_compensation
        FROM raw_990_schedule_j j
        JOIN raw_990_filing f ON f.object_id = j.object_id
        WHERE j.total_compensation > 0
        ORDER BY j.tax_year, j.total_compensation DESC
    """
    rows = conn.execute(sql).pl()
    if rows.height == 0:
        return rows

    canonical = pl.col("title").map_elements(
        normalize_title, return_dtype=pl.Utf8
    )
    with_roles = rows.with_columns(canonical.alias("canonical_role"))

    # Sorting by pay first makes .first() pick the top earner of each group.
    by_pay = with_roles.sort("total_compensation", descending=True)
    top_per_role = by_pay.group_by(["tax_year", "canonical_role"]).first()
    return top_per_role.sort(["tax_year", "canonical_role"])
|
|
|
|
|
|
def query_comp_vs_cpi(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return compensation and CPI series indexed to a common base of 100.

    For every tax year that also has CPI data, yields the top earner's
    compensation, the sum of the ten highest compensations, and the average
    CPI, each alongside an index relative to the earliest such year.
    """
    sql = """
        WITH ranked AS (
            SELECT tax_year, total_compensation,
                   ROW_NUMBER() OVER (PARTITION BY tax_year
                                      ORDER BY total_compensation DESC) AS rn
            FROM raw_990_schedule_j
            WHERE total_compensation > 0
        ),
        yearly_comp AS (
            SELECT tax_year,
                   MAX(total_compensation) AS top_comp,
                   SUM(CASE WHEN rn <= 10 THEN total_compensation END) AS agg_comp
            FROM ranked
            GROUP BY tax_year
        ),
        annual_cpi AS (
            SELECT year, AVG(value) AS avg_cpi
            FROM raw_cpi_u GROUP BY year
        ),
        base AS (
            SELECT c.top_comp AS base_top, c.agg_comp AS base_agg,
                   ac.avg_cpi AS base_cpi
            FROM yearly_comp c
            JOIN annual_cpi ac ON ac.year = c.tax_year
            ORDER BY c.tax_year LIMIT 1
        )
        SELECT
            c.tax_year AS year,
            c.top_comp,
            c.agg_comp,
            ac.avg_cpi,
            ROUND(c.top_comp * 100.0 / NULLIF((SELECT base_top FROM base), 0), 1)
                AS comp_index,
            ROUND(c.agg_comp * 100.0 / NULLIF((SELECT base_agg FROM base), 0), 1)
                AS agg_index,
            ROUND(ac.avg_cpi * 100.0 / NULLIF((SELECT base_cpi FROM base), 0), 1)
                AS cpi_index
        FROM yearly_comp c
        JOIN annual_cpi ac ON ac.year = c.tax_year
        ORDER BY year
    """
    result = conn.execute(sql)
    return result.pl()
|
|
|
|
|
|
def query_staff_composition(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return yearly staff totals split into faculty, management and other.

    ``other_staff`` is ``total_staff`` minus the faculty and management
    totals, with a missing faculty or management value counted as zero.
    """
    sql = """
        SELECT year, total_staff, faculty_total, management_total,
               total_staff - COALESCE(faculty_total, 0) - COALESCE(management_total, 0)
                   AS other_staff
        FROM raw_ipeds_staff
        WHERE unitid = ?
        ORDER BY year
    """
    rows = conn.execute(sql, [UD_UNITID])
    return rows.pl()
|
|
|
|
|
|
def query_student_staff_ratios(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return students per staff member and per faculty member by year.

    Covers years present in both the staff and enrollment tables for
    ``UD_UNITID``; a zero denominator yields NULL for that ratio.
    """
    sql = """
        SELECT s.year, e.total_enrollment, s.total_staff, s.faculty_total,
               ROUND(e.total_enrollment * 1.0 / NULLIF(s.total_staff, 0), 1)
                   AS students_per_staff,
               ROUND(e.total_enrollment * 1.0 / NULLIF(s.faculty_total, 0), 1)
                   AS students_per_faculty
        FROM raw_ipeds_staff s
        JOIN raw_ipeds_enrollment e ON e.unitid = s.unitid AND e.year = s.year
        WHERE s.unitid = ?
        ORDER BY s.year
    """
    rows = conn.execute(sql, [UD_UNITID])
    return rows.pl()
|
|
|
|
|
|
def query_growth_index(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return management, faculty and enrollment indexed to a base of 100.

    The base is the earliest year for ``UD_UNITID`` that appears in both the
    staff and enrollment tables. An index is NULL when its base value is
    zero.
    """
    sql = """
        WITH base AS (
            SELECT s.management_total AS base_mgmt,
                   s.faculty_total AS base_fac,
                   e.total_enrollment AS base_enrl
            FROM raw_ipeds_staff s
            JOIN raw_ipeds_enrollment e ON e.unitid = s.unitid AND e.year = s.year
            WHERE s.unitid = ?
            ORDER BY s.year LIMIT 1
        )
        SELECT s.year,
               s.management_total,
               s.faculty_total,
               e.total_enrollment,
               ROUND(s.management_total * 100.0
                     / NULLIF((SELECT base_mgmt FROM base), 0), 1) AS mgmt_index,
               ROUND(s.faculty_total * 100.0
                     / NULLIF((SELECT base_fac FROM base), 0), 1) AS faculty_index,
               ROUND(e.total_enrollment * 100.0
                     / NULLIF((SELECT base_enrl FROM base), 0), 1) AS enrollment_index
        FROM raw_ipeds_staff s
        JOIN raw_ipeds_enrollment e ON e.unitid = s.unitid AND e.year = s.year
        WHERE s.unitid = ?
        ORDER BY s.year
    """
    # The unit id is bound twice: once in the base CTE, once in the main query.
    bindings = [UD_UNITID, UD_UNITID]
    return conn.execute(sql, bindings).pl()
|
|
|
|
|
|
def query_endowment(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return the yearly endowment figures recorded for ``UD_UNITID``.

    Columns: beginning- and end-of-year endowment, new gifts, net
    investment return, other changes, and long-term investments.
    """
    sql = """
        SELECT year, endowment_boy, endowment_eoy, new_gifts,
               net_investment_return, other_changes, long_term_investments
        FROM raw_ipeds_endowment
        WHERE unitid = ?
        ORDER BY year
    """
    rows = conn.execute(sql, [UD_UNITID])
    return rows.pl()
|
|
|
|
|
|
def query_endowment_per_student(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return end-of-year endowment divided by total enrollment, per year.

    Only years present in both the endowment and enrollment tables are
    returned; zero enrollment yields a NULL ``endowment_per_student``.
    """
    sql = """
        SELECT e.year, e.endowment_eoy, en.total_enrollment,
               ROUND(e.endowment_eoy * 1.0 / NULLIF(en.total_enrollment, 0), 0)
                   AS endowment_per_student
        FROM raw_ipeds_endowment e
        JOIN raw_ipeds_enrollment en ON en.unitid = e.unitid AND en.year = e.year
        WHERE e.unitid = ?
        ORDER BY e.year
    """
    rows = conn.execute(sql, [UD_UNITID])
    return rows.pl()
|
|
|
|
|
|
def query_cio_vs_endowment(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Compare Chief Investment Officer pay with endowment value, indexed.

    Takes the maximum compensation per tax year among Schedule J rows whose
    normalized title is ``CHIEF_INVESTMENT_OFFICER`` and inner-joins it to
    the end-of-year endowment for ``UD_UNITID``.

    Returns an empty, column-less DataFrame when there is no Schedule J
    data or no CIO rows. When fewer than two joined years remain, the
    joined frame is returned without index columns. Otherwise ``cio_index``
    and ``endowment_index`` are added, with the first joined year = 100.
    """
    comp_rows = conn.execute("""
        SELECT j.tax_year, j.title, j.total_compensation
        FROM raw_990_schedule_j j
        WHERE j.total_compensation > 0
    """).pl()
    if comp_rows.height == 0:
        return pl.DataFrame()

    role_expr = pl.col("title").map_elements(
        normalize_title, return_dtype=pl.Utf8
    ).alias("role")
    cio_rows = comp_rows.with_columns(role_expr).filter(
        pl.col("role") == "CHIEF_INVESTMENT_OFFICER"
    )
    cio_by_year = cio_rows.group_by("tax_year").agg(
        pl.col("total_compensation").max().alias("cio_comp")
    )
    cio_by_year = cio_by_year.sort("tax_year")
    if cio_by_year.height == 0:
        return pl.DataFrame()

    endowment = conn.execute("""
        SELECT year, endowment_eoy
        FROM raw_ipeds_endowment
        WHERE unitid = ?
        ORDER BY year
    """, [UD_UNITID]).pl()

    joined = cio_by_year.join(
        endowment, left_on="tax_year", right_on="year", how="inner"
    )
    joined = joined.drop_nulls(subset=["cio_comp", "endowment_eoy"]).sort("tax_year")
    if joined.height < 2:
        return joined

    # Both series are indexed against their value in the first joined year.
    comp_base = float(joined["cio_comp"][0])
    endow_base = float(joined["endowment_eoy"][0])
    cio_index = (
        pl.col("cio_comp").cast(pl.Float64) * 100.0 / comp_base
    ).round(1).alias("cio_index")
    endowment_index = (
        pl.col("endowment_eoy").cast(pl.Float64) * 100.0 / endow_base
    ).round(1).alias("endowment_index")
    return joined.with_columns(cio_index, endowment_index)
|
|
|
|
|
|
def query_philanthropy(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return yearly private gifts for ``UD_UNITID``, nominal and CPI-adjusted.

    Reads ``total_private_gifts`` and ``new_gifts`` (exposed as
    ``endowment_gifts``) from ``raw_ipeds_endowment`` and adds
    ``gifts_cpi_adjusted``, the private-gift total rescaled by latest-year
    CPI over that year's CPI (NULL for years without CPI data).
    """
    sql = f"""
        {_CPI_CTE}
        SELECT e.year, e.total_private_gifts, e.new_gifts AS endowment_gifts,
               ROUND(e.total_private_gifts * (SELECT avg_cpi FROM latest_cpi)
                     / ac.avg_cpi, 0) AS gifts_cpi_adjusted
        FROM raw_ipeds_endowment e
        LEFT JOIN annual_cpi ac ON ac.year = e.year
        WHERE e.unitid = ?
        ORDER BY e.year
    """
    result = conn.execute(sql, [UD_UNITID])
    return result.pl()
|
|
|
|
|
|
def query_comp_vs_philanthropy(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """VP Advancement and President comp vs philanthropic gifts, indexed.

    For each tax year, takes the maximum Schedule J compensation among rows
    whose normalized title is ``PRESIDENT`` or ``VP_ADVANCEMENT`` and joins
    it to ``total_private_gifts`` for ``UD_UNITID``.

    Returns:
        An empty, column-less DataFrame when there is no Schedule J data or
        neither role is present. When fewer than two joined years remain,
        the joined frame without index columns. Otherwise the joined frame
        plus ``president_index``, ``vp_adv_index`` and ``gifts_index``,
        each relative to that series' first non-null value (= 100). A
        series with no non-null value in the joined years gets an all-null
        index instead of raising.
    """
    raw = conn.execute("""
        SELECT j.tax_year, j.title, j.total_compensation
        FROM raw_990_schedule_j j
        WHERE j.total_compensation > 0
    """).pl()

    if raw.height == 0:
        return pl.DataFrame()

    raw = raw.with_columns(
        pl.col("title").map_elements(
            normalize_title, return_dtype=pl.Utf8
        ).alias("role")
    )

    # Get max comp per role per year for President and VP Advancement
    roles = raw.filter(pl.col("role").is_in(["PRESIDENT", "VP_ADVANCEMENT"]))
    if roles.height == 0:
        return pl.DataFrame()

    pivoted = (
        roles.group_by(["tax_year", "role"])
        .agg(pl.col("total_compensation").max().alias("comp"))
        .sort("tax_year")
    )

    pres = (
        pivoted.filter(pl.col("role") == "PRESIDENT")
        .select(pl.col("tax_year"), pl.col("comp").alias("president_comp"))
    )
    vp = (
        pivoted.filter(pl.col("role") == "VP_ADVANCEMENT")
        .select(pl.col("tax_year"), pl.col("comp").alias("vp_adv_comp"))
    )

    gifts = conn.execute("""
        SELECT year, total_private_gifts
        FROM raw_ipeds_endowment
        WHERE unitid = ?
        ORDER BY year
    """, [UD_UNITID]).pl()

    # Join all three on year
    merged = (
        pres.join(vp, on="tax_year", how="outer_coalesce")
        .join(gifts, left_on="tax_year", right_on="year", how="inner")
        .drop_nulls(subset=["total_private_gifts"])
        .sort("tax_year")
    )

    if merged.height < 2:
        return merged

    def _indexed(column: str, alias: str) -> pl.Expr:
        """Build an expression indexing *column* to its first non-null value."""
        non_null = merged[column].drop_nulls()
        if non_null.len() == 0:
            # Only one of the two roles may exist in the joined years; the
            # missing one has no base, so its index is null throughout.
            return pl.lit(None, dtype=pl.Float64).alias(alias)
        base = float(non_null[0])
        return (
            pl.col(column).cast(pl.Float64) * 100.0 / base
        ).round(1).alias(alias)

    merged = merged.with_columns(
        _indexed("president_comp", "president_index"),
        _indexed("vp_adv_comp", "vp_adv_index"),
        _indexed("total_private_gifts", "gifts_index"),
    )

    return merged
|
|
|
|
|
|
def query_admin_headcount(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return every row of ``raw_admin_headcount``.

    Columns: unit, person_name, title, category, is_overhead, scrape_date;
    ordered by unit, then category, then person name.
    """
    sql = """
        SELECT unit, person_name, title, category, is_overhead, scrape_date
        FROM raw_admin_headcount
        ORDER BY unit, category, person_name
    """
    rows = conn.execute(sql)
    return rows.pl()
|
|
|
|
|
|
def query_headcount_summary(conn: duckdb.DuckDBPyConnection) -> pl.DataFrame:
    """Return headcount totals grouped by unit, category and overhead flag.

    Rows are ordered by unit, then by descending ``count`` within a unit.
    """
    sql = """
        SELECT unit, category, is_overhead, COUNT(*) AS count
        FROM raw_admin_headcount
        GROUP BY unit, category, is_overhead
        ORDER BY unit, count DESC
    """
    rows = conn.execute(sql)
    return rows.pl()
|