Phase 1 project prototype
This commit is contained in:
parent
29215e2bd2
commit
2c9ae1c312
29 changed files with 2967 additions and 22 deletions
9
tests/fixtures/cu_data_sample.tsv
vendored
Normal file
9
tests/fixtures/cu_data_sample.tsv
vendored
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
series_id year period value footnote_codes
|
||||
CUUR0000SA0 2022 M01 281.148
|
||||
CUUR0000SA0 2022 M02 283.716
|
||||
CUUR0000SA0 2022 M12 296.797
|
||||
CUUR0000SA0 2022 M13 292.655
|
||||
CUUR0000SA0 2023 M01 299.170
|
||||
CUUR0000SA0 2023 M06 305.109
|
||||
CUSR0000SA0 2023 M01 298.432
|
||||
CUUR0000SA0 2023 S01 302.108
|
||||
|
Can't render this file because it has a wrong number of fields in line 2.
|
75
tests/test_bls.py
Normal file
75
tests/test_bls.py
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
"""Tests for BLS CPI-U download and loader."""
|
||||
|
||||
import httpx
|
||||
import respx
|
||||
|
||||
from admin_analytics.bls.download import download_cpi_file
|
||||
from admin_analytics.bls.loader import load_cpi
|
||||
from admin_analytics.config import BLS_CPI_URL
|
||||
|
||||
|
||||
class TestDownload:
    """Exercise ``download_cpi_file`` with the data directory patched to tmp_path."""

    @respx.mock
    def test_download_creates_file(self, tmp_path, monkeypatch):
        """A forced download yields an existing file located in the data dir."""
        monkeypatch.setattr("admin_analytics.bls.download.BLS_DATA_DIR", tmp_path)
        header_only = httpx.Response(200, text="series_id\tyear\tperiod\tvalue\n")
        respx.get(BLS_CPI_URL).mock(return_value=header_only)

        downloaded = download_cpi_file(force=True)

        assert downloaded.exists()
        assert downloaded.parent == tmp_path

    @respx.mock
    def test_download_skips_when_exists(self, tmp_path, monkeypatch):
        """With force=False the pre-existing file content is returned unchanged."""
        monkeypatch.setattr("admin_analytics.bls.download.BLS_DATA_DIR", tmp_path)
        # No route is registered on the mock router in this test.
        cached_file = tmp_path / "cu.data.0.Current"
        cached_file.write_text("cached")

        returned = download_cpi_file(force=False)

        assert returned.read_text() == "cached"

    @respx.mock
    def test_download_force_overwrites(self, tmp_path, monkeypatch):
        """With force=True the stale file content is replaced by the response body."""
        monkeypatch.setattr("admin_analytics.bls.download.BLS_DATA_DIR", tmp_path)
        stale_file = tmp_path / "cu.data.0.Current"
        stale_file.write_text("old")
        fresh_response = httpx.Response(200, text="new data")
        respx.get(BLS_CPI_URL).mock(return_value=fresh_response)

        returned = download_cpi_file(force=True)

        assert returned.read_text() == "new data"
|
||||
|
||||
|
||||
class TestLoader:
    """Exercise ``load_cpi`` against the ``cu_data_sample.tsv`` fixture."""

    def test_load_cpi(self, db_conn, fixtures_dir):
        """The loader reports five rows for the sample fixture."""
        sample_path = fixtures_dir / "cu_data_sample.tsv"
        loaded = load_cpi(db_conn, sample_path)
        # Five CUUR0000SA0 monthly rows are expected: M01, M02, M12 (2022)
        # and M01, M06 (2023).  Not counted: M13 (annual average), the
        # CUSR0000SA0 row (different series) and S01 (semi-annual).
        assert loaded == 5

    def test_load_cpi_correct_values(self, db_conn, fixtures_dir):
        """First and last rows (ordered by year, month) match the fixture."""
        sample_path = fixtures_dir / "cu_data_sample.tsv"
        load_cpi(db_conn, sample_path)
        ordered = db_conn.execute(
            "SELECT year, month, value FROM raw_cpi_u ORDER BY year, month"
        ).fetchall()
        earliest, latest = ordered[0], ordered[-1]
        assert earliest == (2022, 1, 281.148)
        assert latest == (2023, 6, 305.109)

    def test_load_cpi_types(self, db_conn, fixtures_dir):
        """Year and month come back as int, value as float, series as text."""
        sample_path = fixtures_dir / "cu_data_sample.tsv"
        load_cpi(db_conn, sample_path)
        record = db_conn.execute(
            "SELECT year, month, value, series_id FROM raw_cpi_u LIMIT 1"
        ).fetchone()
        year_val, month_val, cpi_val, series = record[0], record[1], record[2], record[3]
        assert isinstance(year_val, int)
        assert isinstance(month_val, int)
        assert isinstance(cpi_val, float)
        assert series == "CUUR0000SA0"

    def test_load_cpi_idempotent(self, db_conn, fixtures_dir):
        """Loading the same fixture twice still leaves five rows in the table."""
        sample_path = fixtures_dir / "cu_data_sample.tsv"
        for _ in range(2):
            load_cpi(db_conn, sample_path)
        total = db_conn.execute("SELECT COUNT(*) FROM raw_cpi_u").fetchone()[0]
        assert total == 5
|
||||
168
tests/test_dashboard_queries.py
Normal file
168
tests/test_dashboard_queries.py
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
"""Tests for dashboard query functions."""
|
||||
|
||||
from admin_analytics.config import UD_UNITID
|
||||
from admin_analytics.dashboard.queries import (
|
||||
query_admin_cost_ratio,
|
||||
query_expense_breakdown,
|
||||
query_admin_per_student,
|
||||
query_admin_faculty_ratio,
|
||||
query_top_earners,
|
||||
query_comp_by_role,
|
||||
query_comp_vs_cpi,
|
||||
query_staff_composition,
|
||||
query_student_staff_ratios,
|
||||
query_growth_index,
|
||||
query_admin_headcount,
|
||||
query_headcount_summary,
|
||||
)
|
||||
|
||||
|
||||
def _seed_ipeds(conn):
    """Populate the IPEDS finance, enrollment and staff tables for 2020 and 2021.

    Every row is inserted for ``UD_UNITID`` through ``conn.execute``.
    """
    # Each entry pairs an INSERT statement with the per-year value tuples
    # that follow the unitid parameter.
    seed_plan = (
        (
            "INSERT INTO raw_ipeds_finance (unitid, year, institutional_support_expenses, total_expenses) VALUES (?, ?, ?, ?)",
            [(2020, 100_000, 1_000_000), (2021, 120_000, 1_100_000)],
        ),
        (
            "INSERT INTO raw_ipeds_enrollment (unitid, year, total_enrollment) VALUES (?, ?, ?)",
            [(2020, 20000), (2021, 21000)],
        ),
        (
            "INSERT INTO raw_ipeds_staff (unitid, year, total_staff, faculty_total, management_total) VALUES (?, ?, ?, ?, ?)",
            [(2020, 3000, 1500, 500), (2021, 3100, 1550, 520)],
        ),
    )
    for statement, yearly_rows in seed_plan:
        for yearly_row in yearly_rows:
            conn.execute(statement, [UD_UNITID, *yearly_row])
|
||||
|
||||
|
||||
def _seed_cpi(conn):
|
||||
"""Insert CPI data for 2020-2021."""
|
||||
for year, value in [(2020, 258.8), (2021, 270.9)]:
|
||||
for month in range(1, 13):
|
||||
conn.execute(
|
||||
"INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (?, ?, ?, ?)",
|
||||
[year, month, value, "CUUR0000SA0"],
|
||||
)
|
||||
|
||||
|
||||
def _seed_990(conn):
|
||||
"""Insert minimal 990 filing and Schedule J data."""
|
||||
conn.execute(
|
||||
"INSERT INTO raw_990_filing (object_id, ein, tax_year, organization_name, total_revenue, total_expenses) "
|
||||
"VALUES ('obj1', '516000297', 2021, 'UD Foundation', 50000000, 40000000)"
|
||||
)
|
||||
conn.execute(
|
||||
"INSERT INTO raw_990_schedule_j (object_id, ein, tax_year, person_name, title, "
|
||||
"base_compensation, bonus_compensation, other_compensation, deferred_compensation, "
|
||||
"nontaxable_benefits, total_compensation) "
|
||||
"VALUES ('obj1', '516000297', 2021, 'JOHN DOE', 'PRESIDENT', "
|
||||
"500000, 100000, 10000, 20000, 15000, 645000)"
|
||||
)
|
||||
conn.execute(
|
||||
"INSERT INTO raw_990_schedule_j (object_id, ein, tax_year, person_name, title, "
|
||||
"base_compensation, bonus_compensation, other_compensation, deferred_compensation, "
|
||||
"nontaxable_benefits, total_compensation) "
|
||||
"VALUES ('obj1', '516000297', 2021, 'JANE SMITH', 'PROVOST', "
|
||||
"400000, 50000, 5000, 15000, 10000, 480000)"
|
||||
)
|
||||
|
||||
|
||||
class TestEmptyDatabase:
    """Query functions return zero-height frames when no data is seeded."""

    def test_admin_cost_ratio_empty(self, db_conn):
        """query_admin_cost_ratio yields no rows on an unseeded database."""
        frame = query_admin_cost_ratio(db_conn)
        assert frame.height == 0

    def test_top_earners_empty(self, db_conn):
        """query_top_earners yields no rows on an unseeded database."""
        frame = query_top_earners(db_conn)
        assert frame.height == 0

    def test_headcount_empty(self, db_conn):
        """query_admin_headcount yields no rows on an unseeded database."""
        frame = query_admin_headcount(db_conn)
        assert frame.height == 0
|
||||
|
||||
|
||||
class TestAdminCostRatio:
    """Checks for ``query_admin_cost_ratio`` on seeded IPEDS and CPI data."""

    def test_returns_correct_ratio(self, db_conn):
        """Two yearly rows come back and 2020 shows a 10% admin cost share."""
        for seed in (_seed_ipeds, _seed_cpi):
            seed(db_conn)

        result = query_admin_cost_ratio(db_conn)
        assert result.height == 2

        # Seeded 2020 figures: 100000 / 1000000 = 10%
        only_2020 = result.filter(result["year"] == 2020)
        assert only_2020["admin_cost_pct"][0] == 10.0
|
||||
|
||||
|
||||
class TestAdminPerStudent:
    """Checks for ``query_admin_per_student`` on seeded IPEDS and CPI data."""

    def test_returns_per_student(self, db_conn):
        """Two yearly rows come back and 2020 shows 5.0 per student."""
        for seed in (_seed_ipeds, _seed_cpi):
            seed(db_conn)

        result = query_admin_per_student(db_conn)
        assert result.height == 2

        # Seeded 2020 figures: 100000 / 20000 = 5
        only_2020 = result.filter(result["year"] == 2020)
        assert only_2020["admin_per_student"][0] == 5.0
|
||||
|
||||
|
||||
class TestAdminFacultyRatio:
    """Checks for ``query_admin_faculty_ratio`` on seeded IPEDS data."""

    def test_returns_ratio(self, db_conn):
        """Two yearly rows come back and the 2020 ratio equals 0.333."""
        _seed_ipeds(db_conn)

        result = query_admin_faculty_ratio(db_conn)
        assert result.height == 2

        # Seeded 2020 figures: 500 / 1500 = 0.333
        only_2020 = result.filter(result["year"] == 2020)
        assert only_2020["admin_faculty_ratio"][0] == 0.333
|
||||
|
||||
|
||||
class TestTopEarners:
    """Checks for ``query_top_earners`` on seeded Schedule J data."""

    def test_returns_all(self, db_conn):
        """Both seeded people are returned and a canonical_role column exists."""
        _seed_990(db_conn)
        earners = query_top_earners(db_conn)
        assert earners.height == 2
        assert "canonical_role" in earners.columns

    def test_filter_by_year(self, db_conn):
        """The year argument keeps 2021 rows and returns nothing for 2019."""
        _seed_990(db_conn)

        seeded_year = query_top_earners(db_conn, year=2021)
        assert seeded_year.height == 2

        unseeded_year = query_top_earners(db_conn, year=2019)
        assert unseeded_year.height == 0
|
||||
|
||||
|
||||
class TestCompByRole:
    """Checks for ``query_comp_by_role`` on seeded Schedule J data."""

    def test_groups_by_role(self, db_conn):
        """Both seeded titles appear in the canonical_role column."""
        _seed_990(db_conn)
        by_role = query_comp_by_role(db_conn)
        found_roles = by_role["canonical_role"].to_list()
        for expected_role in ("PRESIDENT", "PROVOST"):
            assert expected_role in found_roles
|
||||
|
||||
|
||||
class TestCompVsCpi:
    """Checks for ``query_comp_vs_cpi`` on seeded 990 and CPI data."""

    def test_returns_indexed(self, db_conn):
        """The result is non-empty and exposes comp_index and cpi_index columns."""
        for seed in (_seed_990, _seed_cpi):
            seed(db_conn)

        indexed = query_comp_vs_cpi(db_conn)

        assert indexed.height > 0
        for column_name in ("comp_index", "cpi_index"):
            assert column_name in indexed.columns
|
||||
|
||||
|
||||
class TestStaffComposition:
    """Checks for ``query_staff_composition`` on seeded IPEDS data."""

    def test_computes_other(self, db_conn):
        """Two yearly rows come back and 2020 reports 1000 other staff."""
        _seed_ipeds(db_conn)

        composition = query_staff_composition(db_conn)
        assert composition.height == 2

        # Seeded 2020 figures: 3000 - 1500 - 500 = 1000
        only_2020 = composition.filter(composition["year"] == 2020)
        assert only_2020["other_staff"][0] == 1000
|
||||
|
||||
|
||||
class TestGrowthIndex:
    """Checks for ``query_growth_index`` on seeded IPEDS data."""

    def test_base_year_100(self, db_conn):
        """Two yearly rows come back and both 2020 indices equal 100.0."""
        _seed_ipeds(db_conn)

        growth = query_growth_index(db_conn)
        assert growth.height == 2

        base_row = growth.filter(growth["year"] == 2020)
        assert base_row["mgmt_index"][0] == 100.0
        assert base_row["enrollment_index"][0] == 100.0
|
||||
76
tests/test_scraper_classify.py
Normal file
76
tests/test_scraper_classify.py
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
from admin_analytics.scraper.classify import classify_title, is_overhead
|
||||
|
||||
|
||||
def test_grants_analyst():
    """Each listed grant-related title classifies as GRANTS_ADMIN."""
    grant_titles = (
        "Grants Analyst II",
        "Senior Grants Analyst",
        "Manager, Grant Administration, Pre-Award",
        "Closeout Coordinator, Research",
    )
    for title in grant_titles:
        assert classify_title(title) == "GRANTS_ADMIN"
|
||||
|
||||
|
||||
def test_research_staff():
    """Each listed research title classifies as RESEARCH."""
    research_titles = (
        "Research Associate",
        "Associate Scientist",
        "Computer Scientist NIST",
        "Postdoctoral Researcher",
    )
    for title in research_titles:
        assert classify_title(title) == "RESEARCH"
|
||||
|
||||
|
||||
def test_academic_support():
    """Each listed advising/program title classifies as ACADEMIC_SUPPORT."""
    support_titles = (
        "Undergraduate Academic Advisor",
        "Graduate Services Coordinator",
        "Academic Program Manager",
    )
    for title in support_titles:
        assert classify_title(title) == "ACADEMIC_SUPPORT"
|
||||
|
||||
|
||||
def test_admin_support():
    """Each listed administrative title classifies as ADMIN_SUPPORT."""
    admin_titles = (
        "Administrative Assistant IV",
        "Administrative Specialist",
    )
    for title in admin_titles:
        assert classify_title(title) == "ADMIN_SUPPORT"
|
||||
|
||||
|
||||
def test_it():
    """Each listed computing title classifies as IT."""
    it_titles = (
        "Computing Support Specialist II",
        "Systems Programmer IV",
        "Director of Computing Operations",
    )
    for title in it_titles:
        assert classify_title(title) == "IT"
|
||||
|
||||
|
||||
def test_finance():
    """Each listed financial/business title classifies as FINANCE."""
    finance_titles = (
        "Financial Specialist",
        "Director, Procurement & Financial Processing",
        "Sr. Business Officer",
    )
    for title in finance_titles:
        assert classify_title(title) == "FINANCE"
|
||||
|
||||
|
||||
def test_leadership():
    """Each listed dean/chief title classifies as LEADERSHIP."""
    leadership_titles = (
        "Dean",
        "Associate Dean for Academic Affairs",
        "Chief of Staff",
    )
    for title in leadership_titles:
        assert classify_title(title) == "LEADERSHIP"
|
||||
|
||||
|
||||
def test_communications():
    """Each listed communications title classifies as COMMUNICATIONS."""
    comms_titles = (
        "Communications Director",
        "Digital Communications Specialist",
    )
    for title in comms_titles:
        assert classify_title(title) == "COMMUNICATIONS"
|
||||
|
||||
|
||||
def test_technical():
    """Each listed machinist/lab title classifies as TECHNICAL."""
    technical_titles = (
        "Master Machinist",
        "Lab Manager",
        "Lab Coordinator II",
    )
    for title in technical_titles:
        assert classify_title(title) == "TECHNICAL"
|
||||
|
||||
|
||||
def test_faculty_not_admin():
    """Each listed professor title classifies as FACULTY."""
    faculty_titles = (
        "Adjunct Professor NIST",
        "Affiliated Associate Professor",
    )
    for title in faculty_titles:
        assert classify_title(title) == "FACULTY"
|
||||
|
||||
|
||||
def test_overhead_classification():
    """is_overhead returns True, False or None for the listed categories."""
    expected_flags = {
        "LEADERSHIP": True,
        "FINANCE": True,
        "IT": True,
        "RESEARCH": False,
        "ACADEMIC_SUPPORT": False,
        "TECHNICAL": False,
        "GRANTS_ADMIN": None,  # debatable
    }
    for category, flag in expected_flags.items():
        # Identity comparison: the result must be the exact singleton.
        assert is_overhead(category) is flag
|
||||
|
||||
|
||||
def test_unknown():
    """An unmatched title, None and the empty string all classify as UNKNOWN."""
    unclassifiable = ("Football Coach", None, "")
    for title in unclassifiable:
        assert classify_title(title) == "UNKNOWN"
|
||||
41
tests/test_scraper_loader.py
Normal file
41
tests/test_scraper_loader.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
from datetime import date
|
||||
|
||||
from admin_analytics.scraper.directory import StaffEntry
|
||||
from admin_analytics.scraper.loader import load_scrape
|
||||
|
||||
|
||||
def test_load_scrape(db_conn):
    """load_scrape stores three entries with their category and overhead flag."""
    finance_person = StaffEntry(name="John Doe", title="Financial Specialist", email="jdoe@udel.edu", unit="COE Central")
    research_person = StaffEntry(name="Jane Smith", title="Research Associate", email="jsmith@udel.edu", unit="CBE")
    advising_person = StaffEntry(name="Bob Jones", title="Academic Advisor", email="bjones@udel.edu", unit="ME")
    entries = [finance_person, research_person, advising_person]

    loaded = load_scrape(db_conn, entries, scrape_date=date(2026, 3, 30))
    assert loaded == 3

    stored = db_conn.execute(
        "SELECT unit, person_name, category, is_overhead FROM raw_admin_headcount ORDER BY person_name"
    ).fetchall()
    assert len(stored) == 3

    # Rows are ordered by person_name, so Bob, Jane, John.
    expected_rows = (
        # Bob Jones - Academic Advisor → ACADEMIC_SUPPORT → not overhead
        ("ME", "Bob Jones", "ACADEMIC_SUPPORT", False),
        # Jane Smith - Research Associate → RESEARCH → not overhead
        ("CBE", "Jane Smith", "RESEARCH", False),
        # John Doe - Financial Specialist → FINANCE → overhead
        ("COE Central", "John Doe", "FINANCE", True),
    )
    for position, expected in enumerate(expected_rows):
        assert stored[position] == expected
|
||||
|
||||
|
||||
def test_load_scrape_idempotent(db_conn):
    """Loading the same entry twice for one scrape date leaves a single row."""
    scrape_day = date(2026, 3, 30)
    entries = [
        StaffEntry(name="John Doe", title="Financial Specialist", email="jdoe@udel.edu", unit="COE Central"),
    ]

    # The second run is expected to replace the first rather than add to it.
    for _ in range(2):
        load_scrape(db_conn, entries, scrape_date=scrape_day)

    row_total = db_conn.execute(
        "SELECT COUNT(*) FROM raw_admin_headcount WHERE scrape_date = ?", [scrape_day]
    ).fetchone()[0]
    assert row_total == 1
|
||||
102
tests/test_validation.py
Normal file
102
tests/test_validation.py
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
"""Tests for data validation module."""
|
||||
|
||||
from admin_analytics.validation import (
|
||||
validate_row_counts,
|
||||
validate_null_rates,
|
||||
validate_year_coverage,
|
||||
validate_cross_source_consistency,
|
||||
format_report,
|
||||
)
|
||||
|
||||
|
||||
class TestRowCounts:
    """Checks for ``validate_row_counts``."""

    def test_empty_tables(self, db_conn):
        """Unseeded tables are reported with a count of zero."""
        table_counts = validate_row_counts(db_conn)
        for table_name in ("raw_cpi_u", "raw_institution"):
            assert table_counts[table_name] == 0

    def test_with_data(self, db_conn):
        """Two inserted CPI rows are reflected in the raw_cpi_u count."""
        cpi_inserts = (
            "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (2023, 1, 299.17, 'CUUR0000SA0')",
            "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (2023, 2, 300.84, 'CUUR0000SA0')",
        )
        for statement in cpi_inserts:
            db_conn.execute(statement)

        table_counts = validate_row_counts(db_conn)
        assert table_counts["raw_cpi_u"] == 2
|
||||
|
||||
|
||||
class TestNullRates:
    """Checks for ``validate_null_rates``."""

    def test_empty_tables_excluded(self, db_conn):
        """A table with no rows has no entry in the result."""
        null_rates = validate_null_rates(db_conn)
        assert "raw_cpi_u" not in null_rates

    def test_no_nulls(self, db_conn):
        """Fully populated columns report a 0.0 rate."""
        db_conn.execute(
            "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (2023, 1, 299.17, 'CUUR0000SA0')"
        )
        cpi_rates = validate_null_rates(db_conn)["raw_cpi_u"]
        assert cpi_rates["year"] == 0.0
        assert cpi_rates["value"] == 0.0

    def test_with_nulls(self, db_conn):
        """One of two filings lacks total_revenue, giving a 50.0 rate."""
        filing_inserts = (
            "INSERT INTO raw_990_filing (object_id, ein, tax_year) VALUES ('f1', '123', 2023)",
            "INSERT INTO raw_990_filing (object_id, ein, tax_year, total_revenue) VALUES ('f2', '123', 2023, 100)",
        )
        for statement in filing_inserts:
            db_conn.execute(statement)

        filing_rates = validate_null_rates(db_conn)["raw_990_filing"]
        assert filing_rates["total_revenue"] == 50.0
        assert filing_rates["ein"] == 0.0
|
||||
|
||||
|
||||
class TestYearCoverage:
    """Checks for ``validate_year_coverage``."""

    def test_empty_tables(self, db_conn):
        """An unseeded CPI table reports an empty list of years."""
        year_report = validate_year_coverage(db_conn)
        assert year_report["raw_cpi_u"]["years"] == []

    def test_with_data(self, db_conn):
        """Seeding 2020, 2021 and 2023 lists those years and flags 2022 as a gap."""
        seeded_years = [2020, 2021, 2023]
        insert_sql = "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (?, 1, 100.0, 'CUUR0000SA0')"
        for seeded_year in seeded_years:
            db_conn.execute(insert_sql, [seeded_year])

        cpi_coverage = validate_year_coverage(db_conn)["raw_cpi_u"]
        assert cpi_coverage["years"] == [2020, 2021, 2023]
        assert 2022 in cpi_coverage["gaps"]
|
||||
|
||||
|
||||
class TestCrossSource:
    """Checks for ``validate_cross_source_consistency``."""

    def test_empty(self, db_conn):
        """With nothing seeded, no year is reported as present in all sources."""
        consistency = validate_cross_source_consistency(db_conn)
        assert consistency["years_in_all_sources"] == []

    def test_overlap(self, db_conn):
        """Only 2023 is seeded in finance, 990 and CPI; 2022 is finance-only."""
        seed_statements = (
            # IPEDS finance: two years
            "INSERT INTO raw_ipeds_finance (unitid, year) VALUES (130943, 2022)",
            "INSERT INTO raw_ipeds_finance (unitid, year) VALUES (130943, 2023)",
            # 990 filing: 2023 only
            "INSERT INTO raw_990_filing (object_id, tax_year) VALUES ('f1', 2023)",
            # CPI: 2023 only
            "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (2023, 1, 299.0, 'X')",
        )
        for statement in seed_statements:
            db_conn.execute(statement)

        shared_years = validate_cross_source_consistency(db_conn)["years_in_all_sources"]
        assert 2023 in shared_years
        assert 2022 not in shared_years
|
||||
|
||||
|
||||
class TestFormatReport:
    """Checks for ``format_report``."""

    def test_runs_on_empty_db(self, db_conn):
        """The report for an unseeded database contains all four section names."""
        rendered = format_report(db_conn)
        expected_sections = ("Row Counts", "NULL Rates", "Year Coverage", "Cross-Source")
        for section in expected_sections:
            assert section in rendered
|
||||
Loading…
Add table
Add a link
Reference in a new issue