203 lines
6.6 KiB
Python
203 lines
6.6 KiB
Python
from typing import Annotated
|
|
|
|
import typer
|
|
|
|
from admin_analytics.config import DEFAULT_YEAR_RANGE
|
|
from admin_analytics.db.connection import get_connection
|
|
from admin_analytics.db.schema import ensure_schema
|
|
|
|
# Root CLI application; top-level commands are registered on it below.
app = typer.Typer(
    help="University of Delaware administrative analytics",
)

# Sub-application holding the ingestion commands, mounted as `ingest`.
ingest_app = typer.Typer(
    help="Ingest data from external sources",
)
app.add_typer(ingest_app, name="ingest")
|
|
|
|
|
|
def _parse_year_range(year_range: str) -> range:
|
|
"""Parse '2005-2024' into a range object."""
|
|
parts = year_range.split("-")
|
|
if len(parts) != 2:
|
|
raise typer.BadParameter("Year range must be in format YYYY-YYYY")
|
|
start, end = int(parts[0]), int(parts[1])
|
|
return range(start, end + 1)
|
|
|
|
|
|
@ingest_app.command()
def ipeds(
    year_range: Annotated[
        str, typer.Option(help="Year range, e.g. 2005-2024")
    ] = f"{DEFAULT_YEAR_RANGE.start}-{DEFAULT_YEAR_RANGE.stop - 1}",
    component: Annotated[
        str, typer.Option(help="Component: all, hd, finance, staff, enrollment")
    ] = "all",
    force: Annotated[
        bool, typer.Option("--force", help="Re-download even if files exist")
    ] = False,
) -> None:
    """Ingest IPEDS data for the University of Delaware."""
    # NOTE: the docstring above is what Typer displays as this command's help
    # text, so implementation notes are kept in comments.
    #
    # Flow: parse the year range, make sure the schema exists, download the
    # requested components, then run the loader for each requested component.
    years = _parse_year_range(year_range)
    conn = get_connection()
    try:
        ensure_schema(conn)

        # Function-scope imports: these modules are loaded only when this
        # command actually runs.
        from admin_analytics.ipeds.download import download_all
        from admin_analytics.ipeds.institution import load_institutions
        from admin_analytics.ipeds.finance import load_finance
        from admin_analytics.ipeds.staff import load_staff
        from admin_analytics.ipeds.enrollment import load_enrollment

        components = (
            ["hd", "finance", "staff", "enrollment"]
            if component == "all"
            else [component]
        )

        # Finance needs both F1A and F2 downloads (UD reports under FASB/F2)
        download_components = list(components)
        if "finance" in download_components:
            download_components.append("finance_f2")

        typer.echo(f"Downloading IPEDS data for {years.start}-{years.stop - 1}...")
        download_all(years, download_components, force=force)

        # (component key, progress message, loader), in execution order.
        load_steps = (
            ("hd", "Loading institutional directory (HD)...", load_institutions),
            ("finance", "Loading finance data (F1A)...", load_finance),
            ("staff", "Loading staff data (S)...", load_staff),
            ("enrollment", "Loading enrollment data (EF)...", load_enrollment),
        )
        for key, message, loader in load_steps:
            if key in components:
                typer.echo(message)
                loader(conn, years)

        typer.echo("IPEDS ingestion complete.")
    finally:
        # Close the connection even when a download or load step raises.
        conn.close()
|
|
|
|
|
|
@ingest_app.command()
def irs990(
    year_range: Annotated[
        str, typer.Option(help="Year range for index files, e.g. 2019-2025")
    ] = "2019-2025",
    force: Annotated[
        bool, typer.Option("--force", help="Re-download even if files exist")
    ] = False,
) -> None:
    """Ingest IRS 990 data for UD and UD Research Foundation."""
    # NOTE: the docstring above is what Typer displays as this command's help
    # text, so implementation notes are kept in comments.
    #
    # Flow: parse the year range, make sure the schema exists, download the
    # filings, load them, then report the row totals returned by the loader.
    years = _parse_year_range(year_range)
    conn = get_connection()
    try:
        ensure_schema(conn)

        # Function-scope imports: loaded only when this command runs.
        from admin_analytics.irs990.download import download_all_filings
        from admin_analytics.irs990.loader import load_all

        typer.echo(f"Downloading 990 filings for {years.start}-{years.stop - 1}...")
        download_all_filings(years, force=force)

        typer.echo("Loading 990 data into database...")
        # `totals` is indexed by 'filings', 'part_vii' and 'schedule_j' below.
        totals = load_all(conn, years)
        typer.echo(
            f"IRS 990 ingestion complete: {totals['filings']} filings, "
            f"{totals['part_vii']} Part VII rows, {totals['schedule_j']} Schedule J rows."
        )
    finally:
        # Close the connection even when a download or load step raises.
        conn.close()
|
|
|
|
|
|
@ingest_app.command()
def cpi(
    force: Annotated[
        bool, typer.Option("--force", help="Re-download even if file exists")
    ] = False,
) -> None:
    """Ingest BLS CPI-U data."""
    # NOTE: the docstring above is what Typer displays as this command's help
    # text, so implementation notes are kept in comments.
    #
    # Flow: make sure the schema exists, download the CPI file, load it, and
    # report the count returned by the loader.
    conn = get_connection()
    try:
        ensure_schema(conn)

        # Function-scope imports: loaded only when this command runs.
        from admin_analytics.bls.download import download_cpi_file
        from admin_analytics.bls.loader import load_cpi

        typer.echo("Downloading BLS CPI-U data...")
        file_path = download_cpi_file(force=force)

        typer.echo("Loading CPI-U data into database...")
        count = load_cpi(conn, file_path)
        typer.echo(f"CPI-U ingestion complete: {count} monthly observations loaded.")
    finally:
        # Close the connection even when the download or load step raises.
        conn.close()
|
|
|
|
|
|
@ingest_app.command()
def scrape() -> None:
    """Scrape UD staff directory pages for admin headcounts."""
    # NOTE: the docstring above is what Typer displays as this command's help
    # text, so implementation notes are kept in comments.
    #
    # Flow: make sure the schema exists, scrape the directory, load the
    # entries, then print a per-unit summary broken down by category.
    from collections import Counter, defaultdict

    conn = get_connection()
    try:
        ensure_schema(conn)

        # Function-scope imports: loaded only when this command runs.
        from admin_analytics.scraper.directory import scrape_all
        from admin_analytics.scraper.loader import load_scrape
        from admin_analytics.scraper.classify import OVERHEAD_CATEGORIES

        typer.echo("Scraping UD staff directory pages...")
        entries = scrape_all()

        typer.echo("Loading scraped data into database...")
        count = load_scrape(conn, entries)

        # Summary by unit and category
        typer.echo(f"\nLoaded {count} staff entries.\n")
        unit_counts: dict[str, Counter[str]] = defaultdict(Counter)
        for entry in entries:
            unit_counts[entry.unit][entry.category] += 1

        for unit, cats in sorted(unit_counts.items()):
            total = sum(cats.values())
            overhead = sum(
                n for cat, n in cats.items() if cat in OVERHEAD_CATEGORIES
            )
            typer.echo(f" {unit}: {total} staff ({overhead} overhead)")
            # Largest categories first; ties keep first-seen order because
            # sorted() is stable.
            for cat, n in sorted(cats.items(), key=lambda item: -item[1]):
                typer.echo(f" {cat}: {n}")
    finally:
        # Close the connection even when scraping or loading raises.
        conn.close()
|
|
|
|
|
|
@app.command()
def dashboard(
    host: Annotated[
        str,
        typer.Option(help="Host to bind to (0.0.0.0 for network access)"),
    ] = "127.0.0.1",
    port: Annotated[
        int,
        typer.Option(help="Port to serve on"),
    ] = 8050,
    debug: Annotated[
        bool,
        typer.Option(help="Enable Dash debug mode"),
    ] = True,
) -> None:
    """Launch the analytics dashboard."""
    # NOTE: the docstring above is what Typer displays as this command's help
    # text, so implementation notes are kept in comments.
    #
    # Function-scope import: the dashboard package is loaded only when this
    # command runs.
    from admin_analytics.dashboard.app import create_app

    server = create_app()
    typer.echo(f"Starting dashboard at http://{host}:{port}/")
    # Hand control to the app's own run loop with the CLI-provided settings.
    server.run(host=host, port=port, debug=debug)
|
|
|
|
|
|
@app.command()
def validate() -> None:
    """Run data validation checks and print a report."""
    # NOTE: the docstring above is what Typer displays as this command's help
    # text, so implementation notes are kept in comments.
    conn = get_connection()
    try:
        ensure_schema(conn)

        # Function-scope import: loaded only when this command runs.
        from admin_analytics.validation import format_report

        typer.echo(format_report(conn))
    finally:
        # Close the connection even when building the report raises.
        conn.close()
|
|
|
|
|
|
@ingest_app.command(name="all")
def ingest_all(
    year_range: Annotated[
        str,
        typer.Option(help="Year range, e.g. 2005-2024"),
    ] = f"{DEFAULT_YEAR_RANGE.start}-{DEFAULT_YEAR_RANGE.stop - 1}",
    force: Annotated[
        bool,
        typer.Option("--force", help="Re-download even if files exist"),
    ] = False,
) -> None:
    """Ingest all data sources."""
    # NOTE: the docstring above is what Typer displays as this command's help
    # text, so implementation notes are kept in comments.
    #
    # Runs the four ingest commands in sequence as plain function calls.
    # The same year_range is forwarded to both the IPEDS and IRS 990 steps.
    ranged_options = {"year_range": year_range, "force": force}

    ipeds(component="all", **ranged_options)
    irs990(**ranged_options)
    cpi(force=force)
    scrape()
|