Initial build out

This commit is contained in:
emfurst 2026-03-30 07:15:14 -04:00
commit 29215e2bd2
40 changed files with 2622 additions and 0 deletions

View file

@@ -0,0 +1,122 @@
import duckdb
# DDL for every raw table, keyed by table name.
# Each value holds one or more ";"-separated SQL statements, and every
# CREATE uses IF NOT EXISTS.
TABLES = {
# Institution attributes; one row per (unitid, year).
"raw_institution": """
CREATE TABLE IF NOT EXISTS raw_institution (
unitid INTEGER NOT NULL,
year INTEGER NOT NULL,
ein VARCHAR,
institution_name VARCHAR,
city VARCHAR,
state VARCHAR,
sector INTEGER,
control INTEGER,
carnegie_class INTEGER,
enrollment_total INTEGER,
PRIMARY KEY (unitid, year)
)
""",
# Expense columns plus salaries_wages and benefits; one row per (unitid, year).
"raw_ipeds_finance": """
CREATE TABLE IF NOT EXISTS raw_ipeds_finance (
unitid INTEGER NOT NULL,
year INTEGER NOT NULL,
reporting_standard VARCHAR,
total_expenses BIGINT,
instruction_expenses BIGINT,
research_expenses BIGINT,
public_service_expenses BIGINT,
academic_support_expenses BIGINT,
student_services_expenses BIGINT,
institutional_support_expenses BIGINT,
auxiliary_expenses BIGINT,
hospital_expenses BIGINT,
other_expenses BIGINT,
salaries_wages BIGINT,
benefits BIGINT,
PRIMARY KEY (unitid, year)
)
""",
# Staff, faculty and management totals; one row per (unitid, year).
"raw_ipeds_staff": """
CREATE TABLE IF NOT EXISTS raw_ipeds_staff (
unitid INTEGER NOT NULL,
year INTEGER NOT NULL,
total_staff INTEGER,
faculty_total INTEGER,
management_total INTEGER,
PRIMARY KEY (unitid, year)
)
""",
# Total enrollment; one row per (unitid, year).
"raw_ipeds_enrollment": """
CREATE TABLE IF NOT EXISTS raw_ipeds_enrollment (
unitid INTEGER NOT NULL,
year INTEGER NOT NULL,
total_enrollment INTEGER,
PRIMARY KEY (unitid, year)
)
""",
# Filing-level 990 record, keyed by object_id.
"raw_990_filing": """
CREATE TABLE IF NOT EXISTS raw_990_filing (
object_id VARCHAR PRIMARY KEY,
ein VARCHAR,
tax_year INTEGER,
organization_name VARCHAR,
return_type VARCHAR,
filing_date DATE,
total_revenue BIGINT,
total_expenses BIGINT,
total_assets BIGINT
)
""",
# Per-person compensation rows with a sequence-generated id.
# The sequence is created first because the id column's DEFAULT references it.
"raw_990_schedule_j": """
CREATE SEQUENCE IF NOT EXISTS seq_990_schedule_j START 1;
CREATE TABLE IF NOT EXISTS raw_990_schedule_j (
id INTEGER PRIMARY KEY DEFAULT nextval('seq_990_schedule_j'),
object_id VARCHAR,
ein VARCHAR,
tax_year INTEGER,
person_name VARCHAR,
title VARCHAR,
base_compensation BIGINT,
bonus_compensation BIGINT,
other_compensation BIGINT,
deferred_compensation BIGINT,
nontaxable_benefits BIGINT,
total_compensation BIGINT,
compensation_from_related BIGINT
)
""",
# Per-person hours and reportable compensation rows with a sequence-generated id.
# The sequence is created first because the id column's DEFAULT references it.
"raw_990_part_vii": """
CREATE SEQUENCE IF NOT EXISTS seq_990_part_vii START 1;
CREATE TABLE IF NOT EXISTS raw_990_part_vii (
id INTEGER PRIMARY KEY DEFAULT nextval('seq_990_part_vii'),
object_id VARCHAR,
ein VARCHAR,
tax_year INTEGER,
person_name VARCHAR,
title VARCHAR,
avg_hours_per_week DOUBLE,
reportable_comp_from_org BIGINT,
reportable_comp_from_related BIGINT,
other_compensation BIGINT
)
""",
# One value per (year, month), with the series_id it came from.
"raw_cpi_u": """
CREATE TABLE IF NOT EXISTS raw_cpi_u (
year INTEGER NOT NULL,
month INTEGER NOT NULL,
value DOUBLE,
series_id VARCHAR,
PRIMARY KEY (year, month)
)
""",
}
def ensure_schema(conn: duckdb.DuckDBPyConnection) -> None:
    """Execute every DDL entry in ``TABLES`` against ``conn``.

    Each entry is split on ``;`` and the resulting pieces are executed one
    at a time, in order, because an entry may hold more than one statement
    (for example a ``CREATE SEQUENCE`` followed by a ``CREATE TABLE``).

    Args:
        conn: Open DuckDB connection the statements are executed on.
    """
    # Lazily flatten all entries into individual, whitespace-trimmed pieces.
    pieces = (
        fragment.strip()
        for block in TABLES.values()
        for fragment in block.split(";")
    )
    # Empty pieces (such as the text after a trailing ";") are dropped.
    for sql in filter(None, pieces):
        conn.execute(sql)