Phase 1 project prototype

2026-03-30 19:29:33 -04:00 · 2026-03-30 19:29:33 -04:00 · 2c9ae1c312
commit 2c9ae1c312
parent 29215e2bd2
29 changed files with 2967 additions and 22 deletions
--- a/tests/fixtures/cu_data_sample.tsv
+++ b/tests/fixtures/cu_data_sample.tsv
@ -0,0 +1,9 @@
+series_id	year	period	value	footnote_codes
+CUUR0000SA0           	2022	M01	281.148
+CUUR0000SA0           	2022	M02	283.716
+CUUR0000SA0           	2022	M12	296.797
+CUUR0000SA0           	2022	M13	292.655
+CUUR0000SA0           	2023	M01	299.170
+CUUR0000SA0           	2023	M06	305.109
+CUSR0000SA0           	2023	M01	298.432
+CUUR0000SA0           	2023	S01	302.108
--- a/tests/test_bls.py
+++ b/tests/test_bls.py
@ -0,0 +1,75 @@
+"""Tests for BLS CPI-U download and loader."""
+
+import httpx
+import respx
+
+from admin_analytics.bls.download import download_cpi_file
+from admin_analytics.bls.loader import load_cpi
+from admin_analytics.config import BLS_CPI_URL
+
+
+class TestDownload:
+    """download_cpi_file against a mocked BLS endpoint and a temp data dir."""
+
+    @respx.mock
+    def test_download_creates_file(self, tmp_path, monkeypatch):
+        # A forced download must write its file directly under BLS_DATA_DIR.
+        body = httpx.Response(200, text="series_id\tyear\tperiod\tvalue\n")
+        monkeypatch.setattr("admin_analytics.bls.download.BLS_DATA_DIR", tmp_path)
+        respx.get(BLS_CPI_URL).mock(return_value=body)
+        downloaded = download_cpi_file(force=True)
+        assert downloaded.exists()
+        assert downloaded.parent == tmp_path
+
+    @respx.mock
+    def test_download_skips_when_exists(self, tmp_path, monkeypatch):
+        # No route is registered here: the cached copy must be returned untouched.
+        monkeypatch.setattr("admin_analytics.bls.download.BLS_DATA_DIR", tmp_path)
+        (tmp_path / "cu.data.0.Current").write_text("cached")
+        downloaded = download_cpi_file(force=False)
+        assert downloaded.read_text() == "cached"
+
+    @respx.mock
+    def test_download_force_overwrites(self, tmp_path, monkeypatch):
+        # force=True must replace an existing file with the fresh response body.
+        monkeypatch.setattr("admin_analytics.bls.download.BLS_DATA_DIR", tmp_path)
+        (tmp_path / "cu.data.0.Current").write_text("old")
+        respx.get(BLS_CPI_URL).mock(return_value=httpx.Response(200, text="new data"))
+        downloaded = download_cpi_file(force=True)
+        assert downloaded.read_text() == "new data"
+
+
+class TestLoader:
+    """load_cpi run against the cu_data_sample.tsv fixture."""
+
+    def test_load_cpi(self, db_conn, fixtures_dir):
+        # Only the five CUUR0000SA0 monthly rows (M01, M02, M12, M01, M06) count;
+        # M13 (annual avg), CUSR0000SA0 (wrong series) and S01 (semi-annual) are excluded.
+        assert load_cpi(db_conn, fixtures_dir / "cu_data_sample.tsv") == 5
+
+    def test_load_cpi_correct_values(self, db_conn, fixtures_dir):
+        # First and last rows in (year, month) order must carry the fixture values.
+        load_cpi(db_conn, fixtures_dir / "cu_data_sample.tsv")
+        query = "SELECT year, month, value FROM raw_cpi_u ORDER BY year, month"
+        loaded = db_conn.execute(query).fetchall()
+        earliest, latest = loaded[0], loaded[-1]
+        assert earliest == (2022, 1, 281.148)
+        assert latest == (2023, 6, 305.109)
+
+    def test_load_cpi_types(self, db_conn, fixtures_dir):
+        # Columns must come back as native int/float with the expected series id.
+        load_cpi(db_conn, fixtures_dir / "cu_data_sample.tsv")
+        query = "SELECT year, month, value, series_id FROM raw_cpi_u LIMIT 1"
+        year, month, value, series_id = db_conn.execute(query).fetchone()
+        assert isinstance(year, int)
+        assert isinstance(month, int)
+        assert isinstance(value, float)
+        assert series_id == "CUUR0000SA0"
+
+    def test_load_cpi_idempotent(self, db_conn, fixtures_dir):
+        # Loading the same file twice must leave exactly one copy of each row.
+        sample = fixtures_dir / "cu_data_sample.tsv"
+        for _ in range(2):
+            load_cpi(db_conn, sample)
+        (total,) = db_conn.execute("SELECT COUNT(*) FROM raw_cpi_u").fetchone()
+        assert total == 5
--- a/tests/test_dashboard_queries.py
+++ b/tests/test_dashboard_queries.py
@ -0,0 +1,168 @@
+"""Tests for dashboard query functions."""
+
+from admin_analytics.config import UD_UNITID
+from admin_analytics.dashboard.queries import (
+    query_admin_cost_ratio,
+    query_expense_breakdown,
+    query_admin_per_student,
+    query_admin_faculty_ratio,
+    query_top_earners,
+    query_comp_by_role,
+    query_comp_vs_cpi,
+    query_staff_composition,
+    query_student_staff_ratios,
+    query_growth_index,
+    query_admin_headcount,
+    query_headcount_summary,
+)
+
+
+def _seed_ipeds(conn):
+    """Populate the IPEDS finance, enrollment and staff tables for 2020 and 2021."""
+    finance_sql = "INSERT INTO raw_ipeds_finance (unitid, year, institutional_support_expenses, total_expenses) VALUES (?, ?, ?, ?)"
+    enrollment_sql = "INSERT INTO raw_ipeds_enrollment (unitid, year, total_enrollment) VALUES (?, ?, ?)"
+    staff_sql = "INSERT INTO raw_ipeds_staff (unitid, year, total_staff, faculty_total, management_total) VALUES (?, ?, ?, ?, ?)"
+    # Each entry pairs a statement with its per-year rows; UD_UNITID is prepended to every row.
+    seed_plan = [
+        # (year, institutional_support_expenses, total_expenses)
+        (finance_sql, [(2020, 100_000, 1_000_000), (2021, 120_000, 1_100_000)]),
+        # (year, total_enrollment)
+        (enrollment_sql, [(2020, 20000), (2021, 21000)]),
+        # (year, total_staff, faculty_total, management_total)
+        (staff_sql, [(2020, 3000, 1500, 500), (2021, 3100, 1550, 520)]),
+    ]
+    for statement, yearly_rows in seed_plan:
+        for values in yearly_rows:
+            conn.execute(statement, [UD_UNITID, *values])
+
+
+def _seed_cpi(conn):
+    """Write twelve monthly CPI rows per year for 2020 and 2021, one flat value per year."""
+    insert_sql = "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (?, ?, ?, ?)"
+    yearly_values = {2020: 258.8, 2021: 270.9}
+    months = range(1, 13)
+    # Same insertion order as a nested year/month loop: all of 2020, then all of 2021.
+    for year, month in ((y, m) for y in yearly_values for m in months):
+        conn.execute(insert_sql, [year, month, yearly_values[year], "CUUR0000SA0"])
+
+
+def _seed_990(conn):
+    """Create one 990 filing ('obj1', tax year 2021) plus two Schedule J compensation rows."""
+    filing_sql = (
+        "INSERT INTO raw_990_filing (object_id, ein, tax_year, organization_name, total_revenue, total_expenses) "
+        "VALUES ('obj1', '516000297', 2021, 'UD Foundation', 50000000, 40000000)"
+    )
+    # Both Schedule J statements share this column list; only the VALUES clause differs.
+    schedule_j_columns = (
+        "INSERT INTO raw_990_schedule_j (object_id, ein, tax_year, person_name, title, "
+        "base_compensation, bonus_compensation, other_compensation, deferred_compensation, "
+        "nontaxable_benefits, total_compensation) "
+    )
+    schedule_j_values = (
+        "VALUES ('obj1', '516000297', 2021, 'JOHN DOE', 'PRESIDENT', 500000, 100000, 10000, 20000, 15000, 645000)",
+        "VALUES ('obj1', '516000297', 2021, 'JANE SMITH', 'PROVOST', 400000, 50000, 5000, 15000, 10000, 480000)",
+    )
+    # The filing goes in first, then the Schedule J rows in declaration order.
+    conn.execute(filing_sql)
+    for values_clause in schedule_j_values:
+        conn.execute(schedule_j_columns + values_clause)
+
+
+class TestEmptyDatabase:
+    def test_admin_cost_ratio_empty(self, db_conn):
+        # Nothing seeded: the cost-ratio query must yield a zero-height frame.
+        assert query_admin_cost_ratio(db_conn).height == 0
+
+    def test_top_earners_empty(self, db_conn):
+        # Nothing seeded: the top-earners query must yield a zero-height frame.
+        assert query_top_earners(db_conn).height == 0
+
+    def test_headcount_empty(self, db_conn):
+        # Nothing seeded: the headcount query must yield a zero-height frame.
+        assert query_admin_headcount(db_conn).height == 0
+
+
+class TestAdminCostRatio:
+    def test_returns_correct_ratio(self, db_conn):
+        # Seeded 2020 figures: 100000 institutional support / 1000000 total = 10%.
+        for seed in (_seed_ipeds, _seed_cpi):
+            seed(db_conn)
+        ratios = query_admin_cost_ratio(db_conn)
+        assert ratios.height == 2
+        pct_2020 = ratios.filter(ratios["year"] == 2020)["admin_cost_pct"][0]
+        assert pct_2020 == 10.0
+
+
+class TestAdminPerStudent:
+    def test_returns_per_student(self, db_conn):
+        # Seeded 2020 figures: 100000 institutional support / 20000 students = 5.
+        for seed in (_seed_ipeds, _seed_cpi):
+            seed(db_conn)
+        per_student = query_admin_per_student(db_conn)
+        assert per_student.height == 2
+        spend_2020 = per_student.filter(per_student["year"] == 2020)["admin_per_student"][0]
+        assert spend_2020 == 5.0
+
+
+class TestAdminFacultyRatio:
+    def test_returns_ratio(self, db_conn):
+        # 2020 seed: 500 management / 1500 faculty = 1/3, which has no exact float form,
+        # so compare within 1e-3: both a rounded 0.333 and an unrounded 0.3333... pass.
+        _seed_ipeds(db_conn)
+        df = query_admin_faculty_ratio(db_conn)
+        assert df.height == 2
+        assert abs(df.filter(df["year"] == 2020)["admin_faculty_ratio"][0] - 500 / 1500) < 1e-3
+
+
+class TestTopEarners:
+    def test_returns_all(self, db_conn):
+        # Both seeded Schedule J people come back, with a canonical_role column.
+        _seed_990(db_conn)
+        earners = query_top_earners(db_conn)
+        assert earners.height == 2
+        assert "canonical_role" in earners.columns
+
+    def test_filter_by_year(self, db_conn):
+        # Seed rows are all tax year 2021, so filtering on 2019 must match nothing.
+        _seed_990(db_conn)
+        assert query_top_earners(db_conn, year=2021).height == 2
+        assert query_top_earners(db_conn, year=2019).height == 0
+
+
+class TestCompByRole:
+    def test_groups_by_role(self, db_conn):
+        # The two seeded titles must each surface as a canonical_role value.
+        _seed_990(db_conn)
+        seen_roles = query_comp_by_role(db_conn)["canonical_role"].to_list()
+        for expected in ("PRESIDENT", "PROVOST"):
+            assert expected in seen_roles
+
+
+class TestCompVsCpi:
+    def test_returns_indexed(self, db_conn):
+        # With 990 and CPI rows seeded, the result must be non-empty and carry both index columns.
+        for seed in (_seed_990, _seed_cpi):
+            seed(db_conn)
+        indexed = query_comp_vs_cpi(db_conn)
+        assert indexed.height > 0
+        assert all(column in indexed.columns for column in ("comp_index", "cpi_index"))
+
+
+class TestStaffComposition:
+    def test_computes_other(self, db_conn):
+        # Seeded 2020 figures: 3000 total - 1500 faculty - 500 management = 1000 other.
+        _seed_ipeds(db_conn)
+        composition = query_staff_composition(db_conn)
+        assert composition.height == 2
+        other_2020 = composition.filter(composition["year"] == 2020)["other_staff"][0]
+        assert other_2020 == 1000
+
+
+class TestGrowthIndex:
+    def test_base_year_100(self, db_conn):
+        # 2020 is the earliest seeded year; the test expects both of its indices to be 100.
+        _seed_ipeds(db_conn)
+        growth = query_growth_index(db_conn)
+        assert growth.height == 2
+        for column in ("mgmt_index", "enrollment_index"):
+            assert growth.filter(growth["year"] == 2020)[column][0] == 100.0
--- a/tests/test_scraper_classify.py
+++ b/tests/test_scraper_classify.py
@ -0,0 +1,76 @@
+from admin_analytics.scraper.classify import classify_title, is_overhead
+
+
+def test_grants_analyst():
+    # Every grant-administration title below is expected to classify as GRANTS_ADMIN.
+    pre_award = "Manager, Grant Administration, Pre-Award"
+    for title in ("Grants Analyst II", "Senior Grants Analyst", pre_award, "Closeout Coordinator, Research"):
+        assert classify_title(title) == "GRANTS_ADMIN"
+
+
+def test_research_staff():
+    # Each research-track title below is expected to classify as RESEARCH.
+    titles = ("Research Associate", "Associate Scientist", "Computer Scientist NIST", "Postdoctoral Researcher")
+    for title in titles:
+        assert classify_title(title) == "RESEARCH"
+
+
+def test_academic_support():
+    # Advising, graduate-services and program-management titles are ACADEMIC_SUPPORT.
+    for title in ("Undergraduate Academic Advisor", "Graduate Services Coordinator", "Academic Program Manager"):
+        assert classify_title(title) == "ACADEMIC_SUPPORT"
+
+
+def test_admin_support():  # administrative assistant/specialist titles -> ADMIN_SUPPORT
+    for title in ("Administrative Assistant IV", "Administrative Specialist"):
+        assert classify_title(title) == "ADMIN_SUPPORT"
+
+
+def test_it():
+    # Computing and systems titles, up to director level, classify as IT.
+    for title in ("Computing Support Specialist II", "Systems Programmer IV", "Director of Computing Operations"):
+        assert classify_title(title) == "IT"
+
+
+def test_finance():
+    # Financial, procurement and business-officer titles classify as FINANCE.
+    for title in ("Financial Specialist", "Director, Procurement & Financial Processing", "Sr. Business Officer"):
+        assert classify_title(title) == "FINANCE"
+
+
+def test_leadership():
+    # Deans (including associate deans) and a chief of staff classify as LEADERSHIP.
+    for title in ("Dean", "Associate Dean for Academic Affairs", "Chief of Staff"):
+        assert classify_title(title) == "LEADERSHIP"
+
+
+def test_communications():  # communications titles -> COMMUNICATIONS
+    for title in ("Communications Director", "Digital Communications Specialist"):
+        assert classify_title(title) == "COMMUNICATIONS"
+
+
+def test_technical():
+    # Machinist and lab manager/coordinator titles classify as TECHNICAL.
+    for title in ("Master Machinist", "Lab Manager", "Lab Coordinator II"):
+        assert classify_title(title) == "TECHNICAL"
+
+
+def test_faculty_not_admin():  # professor titles stay FACULTY rather than an admin category
+    for title in ("Adjunct Professor NIST", "Affiliated Associate Professor"):
+        assert classify_title(title) == "FACULTY"
+
+
+def test_overhead_classification():
+    expected = {  # category -> expected is_overhead result, checked by identity
+        "LEADERSHIP": True, "FINANCE": True, "IT": True,
+        "RESEARCH": False, "ACADEMIC_SUPPORT": False, "TECHNICAL": False,
+        "GRANTS_ADMIN": None,  # debatable
+    }
+    for category, verdict in expected.items():
+        assert is_overhead(category) is verdict
+
+
+def test_unknown():
+    # An unrecognised title, None and the empty string all fall back to UNKNOWN.
+    for title in ("Football Coach", None, ""):
+        assert classify_title(title) == "UNKNOWN"
--- a/tests/test_scraper_loader.py
+++ b/tests/test_scraper_loader.py
@ -0,0 +1,41 @@
+from datetime import date
+
+from admin_analytics.scraper.directory import StaffEntry
+from admin_analytics.scraper.loader import load_scrape
+
+
+def test_load_scrape(db_conn):
+    """Three scraped entries load as three rows with a category and an overhead flag."""
+    staff = [
+        StaffEntry(name="John Doe", title="Financial Specialist", email="jdoe@udel.edu", unit="COE Central"),
+        StaffEntry(name="Jane Smith", title="Research Associate", email="jsmith@udel.edu", unit="CBE"),
+        StaffEntry(name="Bob Jones", title="Academic Advisor", email="bjones@udel.edu", unit="ME"),
+    ]
+    assert load_scrape(db_conn, staff, scrape_date=date(2026, 3, 30)) == 3
+
+    query = "SELECT unit, person_name, category, is_overhead FROM raw_admin_headcount ORDER BY person_name"
+    stored = db_conn.execute(query).fetchall()
+    assert len(stored) == 3
+
+    # Rows come back alphabetically by person_name: Bob, Jane, John.
+    bob, jane, john = stored
+    # Academic Advisor → ACADEMIC_SUPPORT → not overhead
+    assert bob == ("ME", "Bob Jones", "ACADEMIC_SUPPORT", False)
+    # Research Associate → RESEARCH → not overhead
+    assert jane == ("CBE", "Jane Smith", "RESEARCH", False)
+    # Financial Specialist → FINANCE → overhead
+    assert john == ("COE Central", "John Doe", "FINANCE", True)
+
+
+def test_load_scrape_idempotent(db_conn):
+    """Re-loading the same entry for the same scrape date must not duplicate it."""
+    snapshot_day = date(2026, 3, 30)
+    john = StaffEntry(name="John Doe", title="Financial Specialist", email="jdoe@udel.edu", unit="COE Central")
+    staff = [john]
+    # The second pass is expected to replace the first one's rows rather than add to them.
+    for _ in range(2):
+        load_scrape(db_conn, staff, scrape_date=snapshot_day)
+
+    count_sql = "SELECT COUNT(*) FROM raw_admin_headcount WHERE scrape_date = ?"
+    (row_count,) = db_conn.execute(count_sql, [snapshot_day]).fetchone()
+    assert row_count == 1
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@ -0,0 +1,102 @@
+"""Tests for data validation module."""
+
+from admin_analytics.validation import (
+    validate_row_counts,
+    validate_null_rates,
+    validate_year_coverage,
+    validate_cross_source_consistency,
+    format_report,
+)
+
+
+class TestRowCounts:
+    """validate_row_counts returns a mapping of table name to row count."""
+
+    def test_empty_tables(self, db_conn):
+        # A fresh database reports zero rows for both spot-checked tables.
+        row_counts = validate_row_counts(db_conn)
+        for table in ("raw_cpi_u", "raw_institution"):
+            assert row_counts[table] == 0
+
+    def test_with_data(self, db_conn):
+        # Two CPI months go in; the reported count for raw_cpi_u must be 2.
+        prefix = "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES "
+        for values in ("(2023, 1, 299.17, 'CUUR0000SA0')", "(2023, 2, 300.84, 'CUUR0000SA0')"):
+            db_conn.execute(prefix + values)
+        assert validate_row_counts(db_conn)["raw_cpi_u"] == 2
+
+
+class TestNullRates:
+    """validate_null_rates returns {table: {column: rate}}; rates asserted here are on a 0-100 scale."""
+
+    def test_empty_tables_excluded(self, db_conn):
+        # A table with no rows must have no entry in the result.
+        assert "raw_cpi_u" not in validate_null_rates(db_conn)
+
+    def test_no_nulls(self, db_conn):
+        # One fully populated CPI row: the checked columns report a rate of 0.0.
+        db_conn.execute("INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (2023, 1, 299.17, 'CUUR0000SA0')")
+        cpi_rates = validate_null_rates(db_conn)["raw_cpi_u"]
+        assert cpi_rates["year"] == 0.0
+        assert cpi_rates["value"] == 0.0
+
+    def test_with_nulls(self, db_conn):
+        # Filing f1 omits total_revenue and f2 sets it: 1 NULL out of 2 rows is 50.0.
+        without_revenue = "INSERT INTO raw_990_filing (object_id, ein, tax_year) VALUES ('f1', '123', 2023)"
+        with_revenue = "INSERT INTO raw_990_filing (object_id, ein, tax_year, total_revenue) VALUES ('f2', '123', 2023, 100)"
+        for statement in (without_revenue, with_revenue):
+            db_conn.execute(statement)
+        filing_rates = validate_null_rates(db_conn)["raw_990_filing"]
+        assert filing_rates["total_revenue"] == 50.0
+        assert filing_rates["ein"] == 0.0
+
+
+class TestYearCoverage:
+    """validate_year_coverage reports, per table, a "years" list and a "gaps" collection."""
+
+    def test_empty_tables(self, db_conn):
+        assert validate_year_coverage(db_conn)["raw_cpi_u"]["years"] == []
+
+    def test_with_data(self, db_conn):
+        # 2022 is left out of the seeded years, so it is expected among the gaps.
+        insert_sql = "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (?, 1, 100.0, 'CUUR0000SA0')"
+        for seeded_year in [2020, 2021, 2023]:
+            db_conn.execute(insert_sql, [seeded_year])
+        cpi_coverage = validate_year_coverage(db_conn)["raw_cpi_u"]
+        assert cpi_coverage["years"] == [2020, 2021, 2023]
+        assert 2022 in cpi_coverage["gaps"]
+
+
+class TestCrossSource:
+    """validate_cross_source_consistency exposes a "years_in_all_sources" collection."""
+
+    def test_empty(self, db_conn):
+        # With no rows seeded, the list of shared years must be empty.
+        assert validate_cross_source_consistency(db_conn)["years_in_all_sources"] == []
+
+    def test_overlap(self, db_conn):
+        # 2023 is seeded in IPEDS finance, the 990 filings and CPI; 2022 only in IPEDS finance.
+        seed_statements = [
+            # IPEDS finance: both years
+            "INSERT INTO raw_ipeds_finance (unitid, year) VALUES (130943, 2022)",
+            "INSERT INTO raw_ipeds_finance (unitid, year) VALUES (130943, 2023)",
+            # 990 filing: 2023 only
+            "INSERT INTO raw_990_filing (object_id, tax_year) VALUES ('f1', 2023)",
+            # CPI: 2023 only
+            "INSERT INTO raw_cpi_u (year, month, value, series_id) VALUES (2023, 1, 299.0, 'X')",
+        ]
+        for statement in seed_statements:
+            db_conn.execute(statement)
+        shared_years = validate_cross_source_consistency(db_conn)["years_in_all_sources"]
+        # Only 2023 appears in all three seeded tables.
+        assert 2023 in shared_years
+        assert 2022 not in shared_years
+
+
+class TestFormatReport:
+    def test_runs_on_empty_db(self, db_conn):
+        # Even with no data seeded, the report text must mention all four sections.
+        report_text = format_report(db_conn)
+        sections = ("Row Counts", "NULL Rates", "Year Coverage", "Cross-Source")
+        for section in sections:
+            assert section in report_text