Initial build out

This commit is contained in:
emfurst 2026-03-30 07:15:14 -04:00
commit 29215e2bd2
40 changed files with 2622 additions and 0 deletions

0
tests/__init__.py Normal file
View file

21
tests/conftest.py Normal file
View file

@ -0,0 +1,21 @@
from pathlib import Path
import duckdb
import pytest
from admin_analytics.db.schema import ensure_schema
@pytest.fixture
def db_conn(tmp_path):
    """Yield a DuckDB connection to a per-test database with the schema applied.

    The database file lives under ``tmp_path``; the connection is closed once
    the test that requested it has finished.
    """
    database_file = tmp_path / "test.duckdb"
    connection = duckdb.connect(str(database_file))
    ensure_schema(connection)
    yield connection
    connection.close()
@pytest.fixture
def fixtures_dir():
    """Locate the ``fixtures`` directory that sits next to this module."""
    tests_root = Path(__file__).parent
    return tests_root.joinpath("fixtures")

73
tests/fixtures/990_sample.xml vendored Normal file
View file

@ -0,0 +1,73 @@
<?xml version="1.0" encoding="utf-8"?>
<Return xmlns="http://www.irs.gov/efile" returnVersion="2022v5.0">
<ReturnHeader>
<ReturnTypeCd>990</ReturnTypeCd>
<TaxYr>2022</TaxYr>
<Filer>
<EIN>516000297</EIN>
<BusinessName>
<BusinessNameLine1Txt>UNIVERSITY OF DELAWARE</BusinessNameLine1Txt>
</BusinessName>
</Filer>
</ReturnHeader>
<ReturnData>
<IRS990>
<CYTotalRevenueAmt>1800000000</CYTotalRevenueAmt>
<CYTotalExpensesAmt>1700000000</CYTotalExpensesAmt>
<TotalAssetsEOYAmt>5000000000</TotalAssetsEOYAmt>
<Form990PartVIISectionAGrp>
<PersonNm>JOHN DOE</PersonNm>
<TitleTxt>PRESIDENT</TitleTxt>
<AverageHoursPerWeekRt>40.00</AverageHoursPerWeekRt>
<ReportableCompFromOrgAmt>850000</ReportableCompFromOrgAmt>
<ReportableCompFromRltdOrgAmt>0</ReportableCompFromRltdOrgAmt>
<OtherCompensationAmt>150000</OtherCompensationAmt>
</Form990PartVIISectionAGrp>
<Form990PartVIISectionAGrp>
<PersonNm>JANE SMITH</PersonNm>
<TitleTxt>VICE PRESIDENT FOR FINANCE</TitleTxt>
<AverageHoursPerWeekRt>40.00</AverageHoursPerWeekRt>
<ReportableCompFromOrgAmt>450000</ReportableCompFromOrgAmt>
<ReportableCompFromRltdOrgAmt>0</ReportableCompFromRltdOrgAmt>
<OtherCompensationAmt>80000</OtherCompensationAmt>
</Form990PartVIISectionAGrp>
<Form990PartVIISectionAGrp>
<PersonNm>BOB JONES</PersonNm>
<TitleTxt>TRUSTEE</TitleTxt>
<AverageHoursPerWeekRt>2.00</AverageHoursPerWeekRt>
<ReportableCompFromOrgAmt>0</ReportableCompFromOrgAmt>
<ReportableCompFromRltdOrgAmt>0</ReportableCompFromRltdOrgAmt>
<OtherCompensationAmt>0</OtherCompensationAmt>
</Form990PartVIISectionAGrp>
</IRS990>
<IRS990ScheduleJ>
<RltdOrgOfficerTrstKeyEmplGrp>
<PersonNm>JOHN DOE</PersonNm>
<TitleTxt>PRESIDENT</TitleTxt>
<BaseCompensationFilingOrgAmt>700000</BaseCompensationFilingOrgAmt>
<BonusFilingOrgAmt>100000</BonusFilingOrgAmt>
<OtherCompensationFilingOrgAmt>50000</OtherCompensationFilingOrgAmt>
<DeferredCompensationFlngOrgAmt>75000</DeferredCompensationFlngOrgAmt>
<NontaxableBenefitsFilingOrgAmt>25000</NontaxableBenefitsFilingOrgAmt>
<TotalCompensationFilingOrgAmt>950000</TotalCompensationFilingOrgAmt>
<CompensationFromOtherSrcsAmt>0</CompensationFromOtherSrcsAmt>
</RltdOrgOfficerTrstKeyEmplGrp>
<RltdOrgOfficerTrstKeyEmplGrp>
<PersonNm>JANE SMITH</PersonNm>
<TitleTxt>VICE PRESIDENT FOR FINANCE</TitleTxt>
<BaseCompensationFilingOrgAmt>380000</BaseCompensationFilingOrgAmt>
<BonusFilingOrgAmt>40000</BonusFilingOrgAmt>
<OtherCompensationFilingOrgAmt>30000</OtherCompensationFilingOrgAmt>
<DeferredCompensationFlngOrgAmt>50000</DeferredCompensationFlngOrgAmt>
<NontaxableBenefitsFilingOrgAmt>20000</NontaxableBenefitsFilingOrgAmt>
<TotalCompensationFilingOrgAmt>520000</TotalCompensationFilingOrgAmt>
<CompensationFromOtherSrcsAmt>0</CompensationFromOtherSrcsAmt>
</RltdOrgOfficerTrstKeyEmplGrp>
</IRS990ScheduleJ>
</ReturnData>
</Return>

3
tests/fixtures/ef2023.csv vendored Normal file
View file

@ -0,0 +1,3 @@
UNITID,EFYTOTLT
130943,24120
110635,45307
1 UNITID EFYTOTLT
2 130943 24120
3 110635 45307

3
tests/fixtures/f1a2023.csv vendored Normal file
View file

@ -0,0 +1,3 @@
UNITID,F1C191,F1C011,F1C021,F1C031,F1C051,F1C061,F1C071,F1C111,F1C121,F1C141,F1C192,F1C193
130943,1200000000,400000000,200000000,50000000,100000000,80000000,150000000,60000000,0,30000000,500000000,200000000
110635,3500000000,1200000000,800000000,100000000,300000000,200000000,400000000,150000000,500000000,100000000,1500000000,600000000
1 UNITID F1C191 F1C011 F1C021 F1C031 F1C051 F1C061 F1C071 F1C111 F1C121 F1C141 F1C192 F1C193
2 130943 1200000000 400000000 200000000 50000000 100000000 80000000 150000000 60000000 0 30000000 500000000 200000000
3 110635 3500000000 1200000000 800000000 100000000 300000000 200000000 400000000 150000000 500000000 100000000 1500000000 600000000

3
tests/fixtures/hd2023.csv vendored Normal file
View file

@ -0,0 +1,3 @@
UNITID,INSTNM,CITY,STABBR,SECTOR,CONTROL,C18BASIC,EIN,EFYTOTLT
130943,University of Delaware,Newark,DE,1,1,15,510049975,24120
110635,University of California-Berkeley,Berkeley,CA,1,1,15,946036494,45307
1 UNITID INSTNM CITY STABBR SECTOR CONTROL C18BASIC EIN EFYTOTLT
2 130943 University of Delaware Newark DE 1 1 15 510049975 24120
3 110635 University of California-Berkeley Berkeley CA 1 1 15 946036494 45307

7
tests/fixtures/s2023.csv vendored Normal file
View file

@ -0,0 +1,7 @@
UNITID,STAFFCAT,FTPT,OCCUPCAT,HRTOTLT
130943,2100,2,100,5081
130943,2200,2,200,1271
130943,2250,2,250,124
110635,2100,2,100,15000
110635,2200,2,200,800
110635,2250,2,250,3500
1 UNITID STAFFCAT FTPT OCCUPCAT HRTOTLT
2 130943 2100 2 100 5081
3 130943 2200 2 200 1271
4 130943 2250 2 250 124
5 110635 2100 2 100 15000
6 110635 2200 2 200 800
7 110635 2250 2 250 3500

22
tests/test_db_schema.py Normal file
View file

@ -0,0 +1,22 @@
def test_tables_created(db_conn):
    """Every expected raw table must be present in the ``main`` schema."""
    required_tables = {
        "raw_institution",
        "raw_ipeds_finance",
        "raw_ipeds_staff",
        "raw_ipeds_enrollment",
        "raw_990_filing",
        "raw_990_schedule_j",
        "raw_990_part_vii",
        "raw_cpi_u",
    }

    listing = db_conn.execute(
        "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'"
    ).fetchall()
    present_tables = set()
    for record in listing:
        present_tables.add(record[0])

    # Extra tables are tolerated; only missing ones fail the test.
    assert required_tables.issubset(present_tables)
def test_schema_idempotent(db_conn):
    """A repeated ``ensure_schema`` call on an initialized database must not raise."""
    import admin_analytics.db.schema as schema_module

    # The db_conn fixture has already applied the schema once; this is the repeat.
    schema_module.ensure_schema(db_conn)

View file

@ -0,0 +1,65 @@
import io
import zipfile
import httpx
import respx
import admin_analytics.config as config
from admin_analytics.config import ipeds_filename
from admin_analytics.ipeds.download import download_component
def test_ipeds_filename_patterns():
    """Check the file stem ``ipeds_filename`` yields for each component/year pair."""
    expected_stems = [
        (("hd", 2023), "HD2023"),
        (("finance", 2023), "F2223_F1A"),
        (("finance", 2005), "F0405_F1A"),
        (("enrollment", 2023), "EF2023A"),
        (("staff", 2023), "S2023_OC"),
        (("salaries", 2023), "SAL2023_IS"),
    ]
    for (component, year), stem in expected_stems:
        assert ipeds_filename(component, year) == stem
def _make_zip_bytes(filename: str, content: str) -> bytes:
    """Build an in-memory ZIP archive with one member and return its raw bytes.

    Args:
        filename: Name of the single archive member.
        content: Text stored as that member's payload.
    """
    buffer = io.BytesIO()
    archive = zipfile.ZipFile(buffer, mode="w")
    try:
        archive.writestr(filename, content)
    finally:
        # Closing writes the central directory; the bytes are incomplete before it.
        archive.close()
    return buffer.getvalue()
@respx.mock
def test_download_component(tmp_path):
    """Downloading the HD component should leave exactly one CSV in the returned directory."""
    csv_text = "UNITID,INSTNM\n130943,University of Delaware\n"
    archive = _make_zip_bytes("HD2023.csv", csv_text)
    mocked_response = httpx.Response(200, content=archive)
    respx.get("https://nces.ed.gov/ipeds/datacenter/data/HD2023.zip").mock(
        return_value=mocked_response
    )

    # Point the package at a throwaway data directory for the duration of the call.
    saved_dir = config.IPEDS_DATA_DIR
    config.IPEDS_DATA_DIR = tmp_path / "ipeds"
    try:
        dest = download_component("hd", 2023)
    finally:
        config.IPEDS_DATA_DIR = saved_dir

    assert dest.exists()
    extracted = [*dest.glob("*.csv")]
    assert len(extracted) == 1
    assert "University of Delaware" in extracted[0].read_text()
@respx.mock
def test_download_component_skips_if_exists(tmp_path):
    """With ``force=False`` and a CSV already on disk, no HTTP request is made.

    All filesystem and route setup happens before ``config.IPEDS_DATA_DIR`` is
    overridden, so a failure during setup cannot leave the patched value in
    place for later tests.
    """
    ipeds_root = tmp_path / "ipeds"
    dest = ipeds_root / "hd" / "2023"
    dest.mkdir(parents=True)
    (dest / "HD2023.csv").write_text("UNITID\n130943\n")
    # Registered only so the test can assert it was never hit.
    route = respx.get("https://nces.ed.gov/ipeds/datacenter/data/HD2023.zip")

    original = config.IPEDS_DATA_DIR
    config.IPEDS_DATA_DIR = ipeds_root
    try:
        result = download_component("hd", 2023, force=False)
    finally:
        # Always restore the module-level setting, even if the call raises.
        config.IPEDS_DATA_DIR = original

    assert result == dest
    assert not route.called

View file

@ -0,0 +1,25 @@
import shutil
import admin_analytics.config as config
from admin_analytics.config import UD_UNITID
from admin_analytics.ipeds.enrollment import load_enrollment
def test_load_enrollment_filters_to_ud(db_conn, fixtures_dir, tmp_path):
    """Loading 2023 enrollment with the UD filter should report one row with UD's total."""
    data_root = tmp_path / "ipeds"
    year_dir = data_root / "enrollment" / "2023"
    year_dir.mkdir(parents=True)
    shutil.copy(fixtures_dir / "ef2023.csv", year_dir / "ef2023.csv")

    # Redirect the package's data directory to the temp tree while loading.
    saved_dir = config.IPEDS_DATA_DIR
    config.IPEDS_DATA_DIR = data_root
    try:
        count = load_enrollment(db_conn, range(2023, 2024), unitid_filter=UD_UNITID)
    finally:
        config.IPEDS_DATA_DIR = saved_dir

    assert count == 1
    total_enrollment = db_conn.execute(
        "SELECT total_enrollment FROM raw_ipeds_enrollment WHERE unitid = ?",
        [UD_UNITID],
    ).fetchone()[0]
    assert total_enrollment == 24120

View file

@ -0,0 +1,25 @@
import shutil
import admin_analytics.config as config
from admin_analytics.config import UD_UNITID
from admin_analytics.ipeds.finance import load_finance
def test_load_finance_filters_to_ud(db_conn, fixtures_dir, tmp_path):
    """Loading 2023 finance data with the UD filter should report one row for UD."""
    data_root = tmp_path / "ipeds"
    year_dir = data_root / "finance" / "2023"
    year_dir.mkdir(parents=True)
    shutil.copy(fixtures_dir / "f1a2023.csv", year_dir / "f1a2023.csv")

    # Redirect the package's data directory to the temp tree while loading.
    saved_dir = config.IPEDS_DATA_DIR
    config.IPEDS_DATA_DIR = data_root
    try:
        count = load_finance(db_conn, range(2023, 2024), unitid_filter=UD_UNITID)
    finally:
        config.IPEDS_DATA_DIR = saved_dir

    assert count == 1
    institutional_support = db_conn.execute(
        "SELECT institutional_support_expenses FROM raw_ipeds_finance WHERE unitid = ?",
        [UD_UNITID],
    ).fetchone()[0]
    assert institutional_support == 150000000

View file

@ -0,0 +1,41 @@
import shutil
import admin_analytics.config as config
from admin_analytics.config import UD_UNITID
from admin_analytics.ipeds.institution import load_institutions
def test_load_institutions_filters_to_ud(db_conn, fixtures_dir, tmp_path):
    """With ``unitid_filter`` set, the HD loader should store only UD's row."""
    data_root = tmp_path / "ipeds"
    year_dir = data_root / "hd" / "2023"
    year_dir.mkdir(parents=True)
    shutil.copy(fixtures_dir / "hd2023.csv", year_dir / "hd2023.csv")

    # Redirect the package's data directory to the temp tree while loading.
    saved_dir = config.IPEDS_DATA_DIR
    config.IPEDS_DATA_DIR = data_root
    try:
        count = load_institutions(db_conn, range(2023, 2024), unitid_filter=UD_UNITID)
    finally:
        config.IPEDS_DATA_DIR = saved_dir

    assert count == 1
    stored = db_conn.execute("SELECT * FROM raw_institution").fetchall()
    assert len(stored) == 1
    only_row = stored[0]
    # Positional checks: column 0 is compared to the unit id, column 3 to the name.
    assert only_row[0] == UD_UNITID
    assert only_row[3] == "University of Delaware"
def test_load_institutions_no_filter(db_conn, fixtures_dir, tmp_path):
    """With ``unitid_filter=None``, both institutions in the fixture are loaded."""
    data_root = tmp_path / "ipeds"
    year_dir = data_root / "hd" / "2023"
    year_dir.mkdir(parents=True)
    shutil.copy(fixtures_dir / "hd2023.csv", year_dir / "hd2023.csv")

    # Redirect the package's data directory to the temp tree while loading.
    saved_dir = config.IPEDS_DATA_DIR
    config.IPEDS_DATA_DIR = data_root
    try:
        count = load_institutions(db_conn, range(2023, 2024), unitid_filter=None)
    finally:
        config.IPEDS_DATA_DIR = saved_dir

    assert count == 2

25
tests/test_ipeds_staff.py Normal file
View file

@ -0,0 +1,25 @@
import shutil
import admin_analytics.config as config
from admin_analytics.config import UD_UNITID
from admin_analytics.ipeds.staff import load_staff
def test_load_staff_filters_to_ud(db_conn, fixtures_dir, tmp_path):
    """Loading 2023 staff data with the UD filter should yield one summarized UD row."""
    data_root = tmp_path / "ipeds"
    year_dir = data_root / "staff" / "2023"
    year_dir.mkdir(parents=True)
    shutil.copy(fixtures_dir / "s2023.csv", year_dir / "s2023.csv")

    # Redirect the package's data directory to the temp tree while loading.
    saved_dir = config.IPEDS_DATA_DIR
    config.IPEDS_DATA_DIR = data_root
    try:
        count = load_staff(db_conn, range(2023, 2024), unitid_filter=UD_UNITID)
    finally:
        config.IPEDS_DATA_DIR = saved_dir

    assert count == 1
    staff_counts = db_conn.execute(
        "SELECT total_staff, faculty_total, management_total FROM raw_ipeds_staff WHERE unitid = ?",
        [UD_UNITID],
    ).fetchone()
    assert staff_counts == (5081, 124, 1271)

View file

@ -0,0 +1,25 @@
import admin_analytics.config as config
from admin_analytics.irs990.download import filter_index
def test_filter_index(fixtures_dir, tmp_path):
    """Index filtering should keep the two UD-related 990 rows and drop the 990T and the unrelated EIN."""
    # Synthetic index: two rows for one UD EIN (990 and 990T), one for a second
    # UD EIN, and one for an EIN that is not in config.UD_EINS per the asserts below.
    index_csv = (
        "RETURN_ID,FILING_TYPE,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,RETURN_TYPE,DLN,OBJECT_ID\n"
        "1,EFILE,516000297,202206,2023,UNIVERSITY OF DELAWARE,990,123,OBJ001\n"
        "2,EFILE,516000297,202206,2023,UNIVERSITY OF DELAWARE,990T,456,OBJ002\n"
        "3,EFILE,516017306,202212,2023,UNIVERSITY OF DELAWARE RESEARCH FOUNDATION,990,789,OBJ003\n"
        "4,EFILE,999999999,202212,2023,SOME OTHER ORG,990,000,OBJ004\n"
    )
    index = tmp_path / "index_2023.csv"
    index.write_text(index_csv)

    matches = filter_index(index, config.UD_EINS)

    assert matches.height == 2  # the two 990 filings; the 990T row is excluded
    found_eins = matches["EIN"].to_list()
    for expected_ein in ("516000297", "516017306"):
        assert expected_ein in found_eins
    # Nothing other than plain 990 returns may survive the filter.
    return_types = matches["RETURN_TYPE"].to_list()
    assert all(rt == "990" for rt in return_types)

View file

@ -0,0 +1,46 @@
from admin_analytics.irs990.loader import load_filing, load_part_vii, load_schedule_j
def test_load_filing(db_conn, fixtures_dir):
    """Loading the sample return should store its header and expense fields."""
    sample = fixtures_dir / "990_sample.xml"
    assert load_filing(db_conn, sample, "TEST001")

    stored = db_conn.execute(
        "SELECT ein, tax_year, organization_name, total_expenses FROM raw_990_filing WHERE object_id = 'TEST001'"
    ).fetchone()
    ein, tax_year, organization_name, total_expenses = stored
    assert ein == "516000297"
    assert tax_year == 2022
    assert organization_name == "UNIVERSITY OF DELAWARE"
    assert total_expenses == 1700000000
def test_load_filing_idempotent(db_conn, fixtures_dir):
    """Loading the same filing twice must leave a single row for its object id."""
    sample = fixtures_dir / "990_sample.xml"
    # Two identical loads; the second must not add a duplicate row.
    for _ in range(2):
        load_filing(db_conn, sample, "TEST001")

    matching_rows = db_conn.execute(
        "SELECT COUNT(*) FROM raw_990_filing WHERE object_id = 'TEST001'"
    ).fetchone()
    assert matching_rows[0] == 1
def test_load_part_vii(db_conn, fixtures_dir):
    """All three Part VII people load, and the highest-paid row is John Doe."""
    sample = fixtures_dir / "990_sample.xml"
    loaded = load_part_vii(db_conn, sample, "TEST001")
    assert loaded == 3

    by_pay_desc = db_conn.execute(
        "SELECT person_name, reportable_comp_from_org FROM raw_990_part_vii WHERE object_id = 'TEST001' ORDER BY reportable_comp_from_org DESC"
    ).fetchall()
    top_earner = by_pay_desc[0]
    assert top_earner == ("JOHN DOE", 850000)
def test_load_schedule_j(db_conn, fixtures_dir):
    """Both Schedule J people load, and John Doe's base and total pay are stored."""
    sample = fixtures_dir / "990_sample.xml"
    loaded = load_schedule_j(db_conn, sample, "TEST001")
    assert loaded == 2

    john_doe = db_conn.execute(
        "SELECT person_name, base_compensation, total_compensation FROM raw_990_schedule_j WHERE object_id = 'TEST001' AND person_name = 'JOHN DOE'"
    ).fetchone()
    assert john_doe == ("JOHN DOE", 700000, 950000)

View file

@ -0,0 +1,47 @@
from admin_analytics.irs990.parser import parse_xml, parse_filing, parse_part_vii, parse_schedule_j
def test_parse_filing(fixtures_dir):
    """The parsed filing should expose the header and financial totals of the sample return."""
    filing = parse_filing(parse_xml(fixtures_dir / "990_sample.xml"))

    expected_fields = {
        "ein": "516000297",
        "tax_year": 2022,
        "organization_name": "UNIVERSITY OF DELAWARE",
        "return_type": "990",
        "total_revenue": 1800000000,
        "total_expenses": 1700000000,
        "total_assets": 5000000000,
    }
    for field, value in expected_fields.items():
        assert filing[field] == value
def test_parse_part_vii(fixtures_dir):
    """Part VII parsing should return three people, checked here at the first and last positions."""
    people = parse_part_vii(parse_xml(fixtures_dir / "990_sample.xml"))
    assert len(people) == 3

    first_person = people[0]
    expected_president = {
        "person_name": "JOHN DOE",
        "title": "PRESIDENT",
        "avg_hours_per_week": 40.0,
        "reportable_comp_from_org": 850000,
    }
    for field, value in expected_president.items():
        assert first_person[field] == value

    last_person = people[2]
    expected_trustee = {
        "person_name": "BOB JONES",
        "title": "TRUSTEE",
        "reportable_comp_from_org": 0,
    }
    for field, value in expected_trustee.items():
        assert last_person[field] == value
def test_parse_schedule_j(fixtures_dir):
    """Schedule J parsing should return two people with their compensation breakdown."""
    people = parse_schedule_j(parse_xml(fixtures_dir / "990_sample.xml"))
    assert len(people) == 2

    first_person = people[0]
    expected_president = {
        "person_name": "JOHN DOE",
        "base_compensation": 700000,
        "bonus_compensation": 100000,
        "deferred_compensation": 75000,
        "total_compensation": 950000,
    }
    for field, value in expected_president.items():
        assert first_person[field] == value

    second_person = people[1]
    assert second_person["person_name"] == "JANE SMITH"
    assert second_person["total_compensation"] == 520000

View file

@ -0,0 +1,45 @@
from admin_analytics.irs990.titles import normalize_title
def test_president():
    """Presidential titles, with or without a CEO suffix, normalize to PRESIDENT."""
    for raw_title in ("PRESIDENT", "President & CEO"):
        assert normalize_title(raw_title) == "PRESIDENT"
def test_vice_president_finance():
    """Finance- and business-flavoured VP titles normalize to VP_FINANCE."""
    finance_titles = (
        "VICE PRESIDENT FOR FINANCE",
        "VP Finance and Administration",
        "EVP & VP for Business",
    )
    for raw_title in finance_titles:
        assert normalize_title(raw_title) == "VP_FINANCE"
def test_vice_president_other():
    """A bare vice-president title normalizes to VP_OTHER."""
    normalized = normalize_title("VICE PRESIDENT")
    assert normalized == "VP_OTHER"
def test_provost():
    """Provost titles normalize to PROVOST even when combined with an EVP role."""
    for raw_title in ("PROVOST", "Executive Vice President and Provost"):
        assert normalize_title(raw_title) == "PROVOST"
def test_trustee():
    """The trustee title normalizes to TRUSTEE."""
    normalized = normalize_title("TRUSTEE")
    assert normalized == "TRUSTEE"
def test_dean():
    """A dean title that names a college normalizes to DEAN."""
    normalized = normalize_title("DEAN OF ENGINEERING")
    assert normalized == "DEAN"
def test_cfo():
    """Both the spelled-out and abbreviated CFO titles normalize to CFO."""
    for raw_title in ("CHIEF FINANCIAL OFFICER", "CFO"):
        assert normalize_title(raw_title) == "CFO"
def test_other():
    """A director title maps to DIRECTOR; a coaching title maps to OTHER."""
    expected_by_title = {
        "ATHLETIC DIRECTOR": "DIRECTOR",
        "FOOTBALL COACH": "OTHER",
    }
    for raw_title, category in expected_by_title.items():
        assert normalize_title(raw_title) == category
def test_none_and_empty():
    """Missing, empty, and whitespace-only titles all normalize to ``None``."""
    for blank_title in (None, "", " "):
        assert normalize_title(blank_title) is None