Phase 1 project prototype

This commit is contained in:
emfurst 2026-03-30 19:29:33 -04:00
commit 2c9ae1c312
29 changed files with 2967 additions and 22 deletions

9
tests/fixtures/cu_data_sample.tsv vendored Normal file
View file

@ -0,0 +1,9 @@
series_id year period value footnote_codes
CUUR0000SA0 2022 M01 281.148
CUUR0000SA0 2022 M02 283.716
CUUR0000SA0 2022 M12 296.797
CUUR0000SA0 2022 M13 292.655
CUUR0000SA0 2023 M01 299.170
CUUR0000SA0 2023 M06 305.109
CUSR0000SA0 2023 M01 298.432
CUUR0000SA0 2023 S01 302.108
Can't render this file because it has a wrong number of fields in line 2.

75
tests/test_bls.py Normal file
View file

@ -0,0 +1,75 @@
"""Tests for BLS CPI-U download and loader."""
import httpx
import respx
from admin_analytics.bls.download import download_cpi_file
from admin_analytics.bls.loader import load_cpi
from admin_analytics.config import BLS_CPI_URL
class TestDownload:
    """Exercise ``download_cpi_file`` with the BLS URL intercepted by respx."""

    @respx.mock
    def test_download_creates_file(self, tmp_path, monkeypatch):
        """A forced download yields an existing file inside the patched data dir."""
        monkeypatch.setattr("admin_analytics.bls.download.BLS_DATA_DIR", tmp_path)
        header_only = httpx.Response(200, text="series_id\tyear\tperiod\tvalue\n")
        respx.get(BLS_CPI_URL).mock(return_value=header_only)

        downloaded = download_cpi_file(force=True)

        assert downloaded.exists()
        assert downloaded.parent == tmp_path

    @respx.mock
    def test_download_skips_when_exists(self, tmp_path, monkeypatch):
        """With ``force=False`` the pre-existing file content is left as-is."""
        monkeypatch.setattr("admin_analytics.bls.download.BLS_DATA_DIR", tmp_path)
        cached_file = tmp_path / "cu.data.0.Current"
        cached_file.write_text("cached")

        # No route is registered in this test.
        returned = download_cpi_file(force=False)

        assert returned.read_text() == "cached"

    @respx.mock
    def test_download_force_overwrites(self, tmp_path, monkeypatch):
        """With ``force=True`` the stale file content is replaced by the response."""
        monkeypatch.setattr("admin_analytics.bls.download.BLS_DATA_DIR", tmp_path)
        stale_file = tmp_path / "cu.data.0.Current"
        stale_file.write_text("old")
        fresh_response = httpx.Response(200, text="new data")
        respx.get(BLS_CPI_URL).mock(return_value=fresh_response)

        returned = download_cpi_file(force=True)

        assert returned.read_text() == "new data"
class TestLoader:
    """Check ``load_cpi`` against the ``cu_data_sample.tsv`` fixture."""

    def test_load_cpi(self, db_conn, fixtures_dir):
        """The value returned by ``load_cpi`` matches the expected row count."""
        # Fixture has 5 valid CUUR0000SA0 monthly rows (M01, M02, M12, M01, M06)
        # Excludes: M13 (annual avg), CUSR0000SA0 (wrong series), S01 (semi-annual)
        loaded = load_cpi(db_conn, fixtures_dir / "cu_data_sample.tsv")
        assert loaded == 5

    def test_load_cpi_correct_values(self, db_conn, fixtures_dir):
        """First and last stored rows (by year, month) carry the fixture values."""
        load_cpi(db_conn, fixtures_dir / "cu_data_sample.tsv")
        ordered = db_conn.execute(
            "SELECT year, month, value FROM raw_cpi_u ORDER BY year, month"
        ).fetchall()
        earliest = ordered[0]
        latest = ordered[-1]
        assert earliest == (2022, 1, 281.148)
        assert latest == (2023, 6, 305.109)

    def test_load_cpi_types(self, db_conn, fixtures_dir):
        """Stored columns come back as int, int, float and the expected series id."""
        load_cpi(db_conn, fixtures_dir / "cu_data_sample.tsv")
        record = db_conn.execute(
            "SELECT year, month, value, series_id FROM raw_cpi_u LIMIT 1"
        ).fetchone()
        year, month, value, series_id = record[0], record[1], record[2], record[3]
        assert isinstance(year, int)
        assert isinstance(month, int)
        assert isinstance(value, float)
        assert series_id == "CUUR0000SA0"

    def test_load_cpi_idempotent(self, db_conn, fixtures_dir):
        """Loading the same fixture twice leaves five rows in ``raw_cpi_u``."""
        sample = fixtures_dir / "cu_data_sample.tsv"
        for _ in range(2):
            load_cpi(db_conn, sample)
        total = db_conn.execute("SELECT COUNT(*) FROM raw_cpi_u").fetchone()[0]
        assert total == 5

View file

@ -0,0 +1,168 @@
"""Tests for dashboard query functions."""
from admin_analytics.config import UD_UNITID
from admin_analytics.dashboard.queries import (
query_admin_cost_ratio,
query_expense_breakdown,
query_admin_per_student,
query_admin_faculty_ratio,
query_top_earners,
query_comp_by_role,
query_comp_vs_cpi,
query_staff_composition,
query_student_staff_ratios,
query_growth_index,
query_admin_headcount,
query_headcount_summary,
)
def _seed_ipeds(conn):
    """Insert 2020 and 2021 rows into the IPEDS finance, enrollment and staff tables.

    Every row is keyed by ``UD_UNITID``.
    """
    # (year, institutional_support_expenses, total_expenses)
    finance_rows = [(2020, 100_000, 1_000_000), (2021, 120_000, 1_100_000)]
    # (year, total_enrollment)
    enrollment_rows = [(2020, 20000), (2021, 21000)]
    # (year, total_staff, faculty_total, management_total)
    staff_rows = [(2020, 3000, 1500, 500), (2021, 3100, 1550, 520)]

    for finance in finance_rows:
        conn.execute(
            "INSERT INTO raw_ipeds_finance (unitid, year, institutional_support_expenses, total_expenses) VALUES (?, ?, ?, ?)",
            [UD_UNITID, *finance],
        )
    for enrollment in enrollment_rows:
        conn.execute(
            "INSERT INTO raw_ipeds_enrollment (unitid, year, total_enrollment) VALUES (?, ?, ?)",
            [UD_UNITID, *enrollment],
        )
    for staff in staff_rows:
        conn.execute(
            "INSERT INTO raw_ipeds_staff (unitid, year, total_staff, faculty_total, management_total) VALUES (?, ?, ?, ?, ?)",
            [UD_UNITID, *staff],
        )
def _seed_cpi(conn):
    """Insert twelve monthly ``raw_cpi_u`` rows for each of 2020 and 2021.

    All months of a given year share one value; the series id is always
    ``CUUR0000SA0``.
    """
    insert_sql = "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (?, ?, ?, ?)"
    monthly_rows = [
        [year, month, cpi_value, "CUUR0000SA0"]
        for year, cpi_value in ((2020, 258.8), (2021, 270.9))
        for month in range(1, 13)
    ]
    for params in monthly_rows:
        conn.execute(insert_sql, params)
def _seed_990(conn):
    """Insert one 2021 filing row and two Schedule J rows (PRESIDENT, PROVOST)."""
    filing_sql = (
        "INSERT INTO raw_990_filing (object_id, ein, tax_year, organization_name, total_revenue, total_expenses) "
        "VALUES ('obj1', '516000297', 2021, 'UD Foundation', 50000000, 40000000)"
    )
    # Column list shared by both Schedule J inserts.
    schedule_j_prefix = (
        "INSERT INTO raw_990_schedule_j (object_id, ein, tax_year, person_name, title, "
        "base_compensation, bonus_compensation, other_compensation, deferred_compensation, "
        "nontaxable_benefits, total_compensation) "
    )
    schedule_j_values = (
        "VALUES ('obj1', '516000297', 2021, 'JOHN DOE', 'PRESIDENT', "
        "500000, 100000, 10000, 20000, 15000, 645000)",
        "VALUES ('obj1', '516000297', 2021, 'JANE SMITH', 'PROVOST', "
        "400000, 50000, 5000, 15000, 10000, 480000)",
    )

    conn.execute(filing_sql)
    for values_clause in schedule_j_values:
        conn.execute(schedule_j_prefix + values_clause)
class TestEmptyDatabase:
    """Query functions called on an unseeded database return zero-height results."""

    def test_admin_cost_ratio_empty(self, db_conn):
        """``query_admin_cost_ratio`` yields no rows without data."""
        result = query_admin_cost_ratio(db_conn)
        assert result.height == 0

    def test_top_earners_empty(self, db_conn):
        """``query_top_earners`` yields no rows without data."""
        result = query_top_earners(db_conn)
        assert result.height == 0

    def test_headcount_empty(self, db_conn):
        """``query_admin_headcount`` yields no rows without data."""
        result = query_admin_headcount(db_conn)
        assert result.height == 0
class TestAdminCostRatio:
    """Check ``query_admin_cost_ratio`` on the seeded IPEDS and CPI rows."""

    def test_returns_correct_ratio(self, db_conn):
        """Two yearly rows come back and 2020 shows a 10% admin cost share."""
        _seed_ipeds(db_conn)
        _seed_cpi(db_conn)

        ratios = query_admin_cost_ratio(db_conn)

        assert ratios.height == 2
        # 2020: 100000 / 1000000 = 10%
        only_2020 = ratios.filter(ratios["year"] == 2020)
        assert only_2020["admin_cost_pct"][0] == 10.0
class TestAdminPerStudent:
    """Check ``query_admin_per_student`` on the seeded IPEDS and CPI rows."""

    def test_returns_per_student(self, db_conn):
        """Two yearly rows come back and 2020 shows 5 per student."""
        _seed_ipeds(db_conn)
        _seed_cpi(db_conn)

        per_student = query_admin_per_student(db_conn)

        assert per_student.height == 2
        # 2020: 100000 / 20000 = 5
        only_2020 = per_student.filter(per_student["year"] == 2020)
        assert only_2020["admin_per_student"][0] == 5.0
class TestAdminFacultyRatio:
    """Check ``query_admin_faculty_ratio`` on the seeded IPEDS staff rows."""

    def test_returns_ratio(self, db_conn):
        """Two yearly rows come back and the 2020 ratio equals 0.333."""
        _seed_ipeds(db_conn)

        ratios = query_admin_faculty_ratio(db_conn)

        assert ratios.height == 2
        # 2020: 500 / 1500 = 0.333
        only_2020 = ratios.filter(ratios["year"] == 2020)
        assert only_2020["admin_faculty_ratio"][0] == 0.333
class TestTopEarners:
    """Check ``query_top_earners`` on the seeded Schedule J rows."""

    def test_returns_all(self, db_conn):
        """Both seeded people are returned, with a ``canonical_role`` column."""
        _seed_990(db_conn)

        earners = query_top_earners(db_conn)

        assert earners.height == 2
        assert "canonical_role" in earners.columns

    def test_filter_by_year(self, db_conn):
        """The ``year`` argument keeps 2021 rows and leaves 2019 empty."""
        _seed_990(db_conn)

        matching_year = query_top_earners(db_conn, year=2021)
        assert matching_year.height == 2

        unseeded_year = query_top_earners(db_conn, year=2019)
        assert unseeded_year.height == 0
class TestCompByRole:
    """Check ``query_comp_by_role`` on the seeded Schedule J rows."""

    def test_groups_by_role(self, db_conn):
        """Both seeded titles appear among the ``canonical_role`` values."""
        _seed_990(db_conn)

        by_role = query_comp_by_role(db_conn)
        seen_roles = by_role["canonical_role"].to_list()

        assert "PRESIDENT" in seen_roles
        assert "PROVOST" in seen_roles
class TestCompVsCpi:
    """Check ``query_comp_vs_cpi`` on the seeded 990 and CPI rows."""

    def test_returns_indexed(self, db_conn):
        """The result is non-empty and exposes both index columns."""
        _seed_990(db_conn)
        _seed_cpi(db_conn)

        indexed = query_comp_vs_cpi(db_conn)

        assert indexed.height > 0
        assert "comp_index" in indexed.columns
        assert "cpi_index" in indexed.columns
class TestStaffComposition:
    """Check ``query_staff_composition`` on the seeded IPEDS staff rows."""

    def test_computes_other(self, db_conn):
        """Two yearly rows come back and 2020 reports 1000 other staff."""
        _seed_ipeds(db_conn)

        composition = query_staff_composition(db_conn)

        assert composition.height == 2
        # 2020: 3000 - 1500 - 500 = 1000
        only_2020 = composition.filter(composition["year"] == 2020)
        assert only_2020["other_staff"][0] == 1000
class TestGrowthIndex:
    """Check ``query_growth_index`` on the seeded IPEDS rows."""

    def test_base_year_100(self, db_conn):
        """Two yearly rows come back and both 2020 indices equal 100.0."""
        _seed_ipeds(db_conn)

        growth = query_growth_index(db_conn)

        assert growth.height == 2
        base_year = growth.filter(growth["year"] == 2020)
        assert base_year["mgmt_index"][0] == 100.0
        assert base_year["enrollment_index"][0] == 100.0

View file

@ -0,0 +1,76 @@
from admin_analytics.scraper.classify import classify_title, is_overhead
def test_grants_analyst():
    """Grant-administration titles are expected to classify as GRANTS_ADMIN."""
    grants_titles = (
        "Grants Analyst II",
        "Senior Grants Analyst",
        "Manager, Grant Administration, Pre-Award",
        "Closeout Coordinator, Research",
    )
    for title in grants_titles:
        assert classify_title(title) == "GRANTS_ADMIN"
def test_research_staff():
    """Research and scientist titles are expected to classify as RESEARCH."""
    research_titles = (
        "Research Associate",
        "Associate Scientist",
        "Computer Scientist NIST",
        "Postdoctoral Researcher",
    )
    for title in research_titles:
        assert classify_title(title) == "RESEARCH"
def test_academic_support():
    """Advising and program titles are expected to classify as ACADEMIC_SUPPORT."""
    support_titles = (
        "Undergraduate Academic Advisor",
        "Graduate Services Coordinator",
        "Academic Program Manager",
    )
    for title in support_titles:
        assert classify_title(title) == "ACADEMIC_SUPPORT"
def test_admin_support():
    """Administrative assistant/specialist titles classify as ADMIN_SUPPORT."""
    admin_titles = (
        "Administrative Assistant IV",
        "Administrative Specialist",
    )
    for title in admin_titles:
        assert classify_title(title) == "ADMIN_SUPPORT"
def test_it():
    """Computing and systems titles are expected to classify as IT."""
    it_titles = (
        "Computing Support Specialist II",
        "Systems Programmer IV",
        "Director of Computing Operations",
    )
    for title in it_titles:
        assert classify_title(title) == "IT"
def test_finance():
    """Financial, procurement and business-officer titles classify as FINANCE."""
    finance_titles = (
        "Financial Specialist",
        "Director, Procurement & Financial Processing",
        "Sr. Business Officer",
    )
    for title in finance_titles:
        assert classify_title(title) == "FINANCE"
def test_leadership():
    """Dean and chief-of-staff titles are expected to classify as LEADERSHIP."""
    leadership_titles = (
        "Dean",
        "Associate Dean for Academic Affairs",
        "Chief of Staff",
    )
    for title in leadership_titles:
        assert classify_title(title) == "LEADERSHIP"
def test_communications():
    """Communications titles are expected to classify as COMMUNICATIONS."""
    comms_titles = (
        "Communications Director",
        "Digital Communications Specialist",
    )
    for title in comms_titles:
        assert classify_title(title) == "COMMUNICATIONS"
def test_technical():
    """Machinist and lab titles are expected to classify as TECHNICAL."""
    technical_titles = (
        "Master Machinist",
        "Lab Manager",
        "Lab Coordinator II",
    )
    for title in technical_titles:
        assert classify_title(title) == "TECHNICAL"
def test_faculty_not_admin():
    """Professor titles are expected to classify as FACULTY."""
    faculty_titles = (
        "Adjunct Professor NIST",
        "Affiliated Associate Professor",
    )
    for title in faculty_titles:
        assert classify_title(title) == "FACULTY"
def test_overhead_classification():
    """``is_overhead`` returns exactly True, False or None per category."""
    expected_flags = {
        "LEADERSHIP": True,
        "FINANCE": True,
        "IT": True,
        "RESEARCH": False,
        "ACADEMIC_SUPPORT": False,
        "TECHNICAL": False,
        "GRANTS_ADMIN": None,  # debatable
    }
    for category, flag in expected_flags.items():
        # Identity comparison: the singleton itself must be returned.
        assert is_overhead(category) is flag
def test_unknown():
    """An unmatched title, ``None`` and the empty string all give UNKNOWN."""
    for unmatched in ("Football Coach", None, ""):
        assert classify_title(unmatched) == "UNKNOWN"

View file

@ -0,0 +1,41 @@
from datetime import date
from admin_analytics.scraper.directory import StaffEntry
from admin_analytics.scraper.loader import load_scrape
def test_load_scrape(db_conn):
    """Three entries load as three rows carrying category and overhead flag."""
    staff = [
        StaffEntry(name="John Doe", title="Financial Specialist", email="jdoe@udel.edu", unit="COE Central"),
        StaffEntry(name="Jane Smith", title="Research Associate", email="jsmith@udel.edu", unit="CBE"),
        StaffEntry(name="Bob Jones", title="Academic Advisor", email="bjones@udel.edu", unit="ME"),
    ]

    loaded = load_scrape(db_conn, staff, scrape_date=date(2026, 3, 30))
    assert loaded == 3

    stored = db_conn.execute(
        "SELECT unit, person_name, category, is_overhead FROM raw_admin_headcount ORDER BY person_name"
    ).fetchall()
    assert len(stored) == 3

    # Rows are ordered by person_name, so Bob, Jane, John.
    bob, jane, john = stored[0], stored[1], stored[2]
    # Bob Jones - Academic Advisor → ACADEMIC_SUPPORT → not overhead
    assert bob == ("ME", "Bob Jones", "ACADEMIC_SUPPORT", False)
    # Jane Smith - Research Associate → RESEARCH → not overhead
    assert jane == ("CBE", "Jane Smith", "RESEARCH", False)
    # John Doe - Financial Specialist → FINANCE → overhead
    assert john == ("COE Central", "John Doe", "FINANCE", True)
def test_load_scrape_idempotent(db_conn):
    """Loading the same entry twice for one date leaves a single row."""
    scrape_day = date(2026, 3, 30)
    staff = [
        StaffEntry(name="John Doe", title="Financial Specialist", email="jdoe@udel.edu", unit="COE Central"),
    ]

    # The second run should replace the first rather than add to it.
    for _ in range(2):
        load_scrape(db_conn, staff, scrape_date=scrape_day)

    stored_count = db_conn.execute(
        "SELECT COUNT(*) FROM raw_admin_headcount WHERE scrape_date = ?", [scrape_day]
    ).fetchone()[0]
    assert stored_count == 1

102
tests/test_validation.py Normal file
View file

@ -0,0 +1,102 @@
"""Tests for data validation module."""
from admin_analytics.validation import (
validate_row_counts,
validate_null_rates,
validate_year_coverage,
validate_cross_source_consistency,
format_report,
)
class TestRowCounts:
    """Check the per-table counts returned by ``validate_row_counts``."""

    def test_empty_tables(self, db_conn):
        """Untouched tables report a count of zero."""
        table_counts = validate_row_counts(db_conn)
        for table in ("raw_cpi_u", "raw_institution"):
            assert table_counts[table] == 0

    def test_with_data(self, db_conn):
        """Two inserted CPI rows are reflected in the ``raw_cpi_u`` count."""
        inserts = (
            "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (2023, 1, 299.17, 'CUUR0000SA0')",
            "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (2023, 2, 300.84, 'CUUR0000SA0')",
        )
        for statement in inserts:
            db_conn.execute(statement)

        table_counts = validate_row_counts(db_conn)
        assert table_counts["raw_cpi_u"] == 2
class TestNullRates:
    """Check the per-column NULL rates returned by ``validate_null_rates``."""

    def test_empty_tables_excluded(self, db_conn):
        """A table without rows has no entry in the result."""
        null_rates = validate_null_rates(db_conn)
        assert "raw_cpi_u" not in null_rates

    def test_no_nulls(self, db_conn):
        """Fully populated columns report a rate of 0.0."""
        db_conn.execute(
            "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (2023, 1, 299.17, 'CUUR0000SA0')"
        )

        cpi_rates = validate_null_rates(db_conn)["raw_cpi_u"]
        assert cpi_rates["year"] == 0.0
        assert cpi_rates["value"] == 0.0

    def test_with_nulls(self, db_conn):
        """One of two rows lacking ``total_revenue`` gives a rate of 50.0."""
        inserts = (
            "INSERT INTO raw_990_filing (object_id, ein, tax_year) VALUES ('f1', '123', 2023)",
            "INSERT INTO raw_990_filing (object_id, ein, tax_year, total_revenue) VALUES ('f2', '123', 2023, 100)",
        )
        for statement in inserts:
            db_conn.execute(statement)

        filing_rates = validate_null_rates(db_conn)["raw_990_filing"]
        assert filing_rates["total_revenue"] == 50.0
        assert filing_rates["ein"] == 0.0
class TestYearCoverage:
    """Check the years and gaps reported by ``validate_year_coverage``."""

    def test_empty_tables(self, db_conn):
        """An empty ``raw_cpi_u`` table reports an empty list of years."""
        report = validate_year_coverage(db_conn)
        assert report["raw_cpi_u"]["years"] == []

    def test_with_data(self, db_conn):
        """Years 2020, 2021 and 2023 are listed and 2022 is flagged as a gap."""
        seeded_years = [2020, 2021, 2023]
        for seeded_year in seeded_years:
            db_conn.execute(
                "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (?, 1, 100.0, 'CUUR0000SA0')",
                [seeded_year],
            )

        cpi_coverage = validate_year_coverage(db_conn)["raw_cpi_u"]
        assert cpi_coverage["years"] == [2020, 2021, 2023]
        assert 2022 in cpi_coverage["gaps"]
class TestCrossSource:
    """Check the year overlap found by ``validate_cross_source_consistency``."""

    def test_empty(self, db_conn):
        """With no data, no year is present in all sources."""
        overlap = validate_cross_source_consistency(db_conn)
        assert overlap["years_in_all_sources"] == []

    def test_overlap(self, db_conn):
        """Only 2023, present in all three seeded tables, counts as shared."""
        seed_statements = (
            # Add IPEDS finance
            "INSERT INTO raw_ipeds_finance (unitid, year) VALUES (130943, 2022)",
            "INSERT INTO raw_ipeds_finance (unitid, year) VALUES (130943, 2023)",
            # Add 990 filing
            "INSERT INTO raw_990_filing (object_id, tax_year) VALUES ('f1', 2023)",
            # Add CPI
            "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (2023, 1, 299.0, 'X')",
        )
        for statement in seed_statements:
            db_conn.execute(statement)

        shared_years = validate_cross_source_consistency(db_conn)["years_in_all_sources"]
        assert 2023 in shared_years
        assert 2022 not in shared_years
class TestFormatReport:
    """Check the text produced by ``format_report``."""

    def test_runs_on_empty_db(self, db_conn):
        """The report for an empty database contains every section heading."""
        report_text = format_report(db_conn)
        expected_headings = ("Row Counts", "NULL Rates", "Year Coverage", "Cross-Source")
        for heading in expected_headings:
            assert heading in report_text