import duckdb

# Maps each raw table name to the DDL script that creates it.
# A script may hold several ';'-separated statements (e.g. a CREATE SEQUENCE
# followed by the CREATE TABLE whose id column draws from that sequence).
TABLES = {
    "raw_institution": """
        CREATE TABLE IF NOT EXISTS raw_institution (
            unitid INTEGER NOT NULL,
            year INTEGER NOT NULL,
            ein VARCHAR,
            institution_name VARCHAR,
            city VARCHAR,
            state VARCHAR,
            sector INTEGER,
            control INTEGER,
            carnegie_class INTEGER,
            enrollment_total INTEGER,
            PRIMARY KEY (unitid, year)
        )
    """,
    "raw_ipeds_finance": """
        CREATE TABLE IF NOT EXISTS raw_ipeds_finance (
            unitid INTEGER NOT NULL,
            year INTEGER NOT NULL,
            reporting_standard VARCHAR,
            total_expenses BIGINT,
            instruction_expenses BIGINT,
            research_expenses BIGINT,
            public_service_expenses BIGINT,
            academic_support_expenses BIGINT,
            student_services_expenses BIGINT,
            institutional_support_expenses BIGINT,
            auxiliary_expenses BIGINT,
            hospital_expenses BIGINT,
            other_expenses BIGINT,
            salaries_wages BIGINT,
            benefits BIGINT,
            PRIMARY KEY (unitid, year)
        )
    """,
    "raw_ipeds_staff": """
        CREATE TABLE IF NOT EXISTS raw_ipeds_staff (
            unitid INTEGER NOT NULL,
            year INTEGER NOT NULL,
            total_staff INTEGER,
            faculty_total INTEGER,
            management_total INTEGER,
            PRIMARY KEY (unitid, year)
        )
    """,
    "raw_ipeds_enrollment": """
        CREATE TABLE IF NOT EXISTS raw_ipeds_enrollment (
            unitid INTEGER NOT NULL,
            year INTEGER NOT NULL,
            total_enrollment INTEGER,
            PRIMARY KEY (unitid, year)
        )
    """,
    "raw_990_filing": """
        CREATE TABLE IF NOT EXISTS raw_990_filing (
            object_id VARCHAR PRIMARY KEY,
            ein VARCHAR,
            tax_year INTEGER,
            organization_name VARCHAR,
            return_type VARCHAR,
            filing_date DATE,
            total_revenue BIGINT,
            total_expenses BIGINT,
            total_assets BIGINT
        )
    """,
    "raw_990_schedule_j": """
        CREATE SEQUENCE IF NOT EXISTS seq_990_schedule_j START 1;
        CREATE TABLE IF NOT EXISTS raw_990_schedule_j (
            id INTEGER PRIMARY KEY DEFAULT nextval('seq_990_schedule_j'),
            object_id VARCHAR,
            ein VARCHAR,
            tax_year INTEGER,
            person_name VARCHAR,
            title VARCHAR,
            base_compensation BIGINT,
            bonus_compensation BIGINT,
            other_compensation BIGINT,
            deferred_compensation BIGINT,
            nontaxable_benefits BIGINT,
            total_compensation BIGINT,
            compensation_from_related BIGINT
        )
    """,
    "raw_990_part_vii": """
        CREATE SEQUENCE IF NOT EXISTS seq_990_part_vii START 1;
        CREATE TABLE IF NOT EXISTS raw_990_part_vii (
            id INTEGER PRIMARY KEY DEFAULT nextval('seq_990_part_vii'),
            object_id VARCHAR,
            ein VARCHAR,
            tax_year INTEGER,
            person_name VARCHAR,
            title VARCHAR,
            avg_hours_per_week DOUBLE,
            reportable_comp_from_org BIGINT,
            reportable_comp_from_related BIGINT,
            other_compensation BIGINT
        )
    """,
    "raw_ipeds_endowment": """
        CREATE TABLE IF NOT EXISTS raw_ipeds_endowment (
            unitid INTEGER NOT NULL,
            year INTEGER NOT NULL,
            endowment_boy BIGINT,
            endowment_eoy BIGINT,
            new_gifts BIGINT,
            net_investment_return BIGINT,
            spending_distribution BIGINT,
            other_changes BIGINT,
            total_private_gifts BIGINT,
            total_investment_return BIGINT,
            long_term_investments BIGINT,
            PRIMARY KEY (unitid, year)
        )
    """,
    "raw_cpi_u": """
        CREATE TABLE IF NOT EXISTS raw_cpi_u (
            year INTEGER NOT NULL,
            month INTEGER NOT NULL,
            value DOUBLE,
            series_id VARCHAR,
            PRIMARY KEY (year, month)
        )
    """,
    "raw_admin_headcount": """
        CREATE SEQUENCE IF NOT EXISTS seq_admin_headcount START 1;
        CREATE TABLE IF NOT EXISTS raw_admin_headcount (
            id INTEGER PRIMARY KEY DEFAULT nextval('seq_admin_headcount'),
            scrape_date DATE NOT NULL,
            unit VARCHAR NOT NULL,
            person_name VARCHAR,
            title VARCHAR,
            email VARCHAR,
            category VARCHAR,
            is_overhead BOOLEAN
        )
    """,
}


def ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:
    """Run every DDL script in ``TABLES`` against *conn*.

    Each script is split on ``;`` and its non-empty statements are executed
    one at a time, in the order they appear. All statements are written with
    ``IF NOT EXISTS``, so objects that already exist are left in place.

    Args:
        conn: Connection on which the DDL statements are executed.
    """
    for script in TABLES.values():
        # Plain split on ';' -- the DDL above has no semicolons inside string
        # literals, so each piece is one complete statement (or blank).
        pieces = (piece.strip() for piece in script.split(";"))
        for statement in filter(None, pieces):
            conn.execute(statement)