"""Classify staff by title into functional categories. Categories distinguish admin overhead from grant-funded, student-facing, and technical roles. This is critical because IPEDS/IRE "staff" counts lump everyone who isn't tenure-track faculty — including postdocs, research scientists, and lab technicians who are soft-funded through extramural research and are NOT administrative overhead. """ import re # Order matters — first match wins. CATEGORY_PATTERNS: list[tuple[str, re.Pattern]] = [ # Leadership / structural overhead ("LEADERSHIP", re.compile( r"\b(dean|chief of staff|associate dean|assistant dean)\b", re.I)), # Faculty listed on staff page — not admin (must come before RESEARCH # to catch "Adjunct Professor NIST" as faculty, not research) ("FACULTY", re.compile( r"\b(professor|lecturer|instructor|faculty|adjunct|affiliated)", re.I)), # Grants administration — debatable; supports extramural funding ("GRANTS_ADMIN", re.compile( r"\b(grants?\s+anal|pre-?award|post-?award|closeout\s+coord" r"|sponsored\s+program|grants?\s+admin|grants?\s+manag|grants?\s+coord)", re.I)), # Research staff — soft-funded, NOT admin bloat ("RESEARCH", re.compile( r"\b(research\s+(?:associate|assistant|scientist|scholar|fellow)" r"|postdoc|post-?doctoral|(?:associate\s+)?scientist\b)", re.I)), # Academic / student-facing support ("ACADEMIC_SUPPORT", re.compile( r"\b(academic\s+(?:advisor|analyst|program)|undergrad\w*\s+(?:recruit|advisor|affairs)" r"|graduate\s+(?:services|advisor)|student\s+(?:develop|support|services)" r"|program\s+(?:coordinator|manager))", re.I)), # Advancement / development — revenue-generating (fundraising) ("ADVANCEMENT", re.compile( r"\b(development|fundrais|advancement|alumni\s+relation|donor|giving)", re.I)), # Finance / procurement ("FINANCE", re.compile( r"\b(financial|fiscal|budget|procurement|business\s+(?:officer|admin)" r"|sr\.?\s+business)", re.I)), # IT / computing ("IT", re.compile( r"\b(computing|systems?\s+(?:prog|admin)|it\s+|information\s+tech" r"|support\s+specialist|service\s+desk|digital\s+tech)", re.I)), # Communications / marketing (must come before DIRECTOR) ("COMMUNICATIONS", re.compile( r"\b(communicat\w+|marketing|media\s+(?:specialist|coord|director)" r"|web\s+(?:develop|design|content)|event\s+(?:coord|plan|manag))", re.I)), # Human resources ("HR", re.compile( r"\b(human\s+resource|hr\s+analyst|talent|workforce)", re.I)), # Facilities / space management ("FACILITIES", re.compile( r"\b(facilit|building|space\s+(?:plan|manag)|safety|engineer\w+\s+facilit)", re.I)), # Technical / lab operations — not admin bloat ("TECHNICAL", re.compile( r"\b(machinist|lab\s+(?:manager|coord|tech)|equipment|technician" r"|instrument)", re.I)), # Administrative support ("ADMIN_SUPPORT", re.compile( r"\b(admin\w*\s+(?:assistant|specialist|support|secretary|coord)" r"|secretary|receptionist|office\s+(?:manager|coord))", re.I)), # Director-level (catch remaining directors) ("DIRECTOR", re.compile( r"\b(director|associate\s+director|sr\.?\s+director)\b", re.I)), ] # Which categories count as administrative overhead OVERHEAD_CATEGORIES = { "LEADERSHIP", "FINANCE", "IT", "COMMUNICATIONS", "HR", "FACILITIES", "ADMIN_SUPPORT", "DIRECTOR", } # Debatable — could go either way depending on analysis DEBATABLE_CATEGORIES = {"GRANTS_ADMIN"} # NOT overhead — these are mission-aligned or revenue-generating NON_OVERHEAD_CATEGORIES = { "RESEARCH", "ACADEMIC_SUPPORT", "ADVANCEMENT", "TECHNICAL", "FACULTY", } def classify_title(title: str | None) -> str: """Classify a staff title into a functional category. Returns the category string, or "UNKNOWN" if no pattern matches. """ if not title or not title.strip(): return "UNKNOWN" for category, pattern in CATEGORY_PATTERNS: if pattern.search(title): return category return "UNKNOWN" def is_overhead(category: str) -> bool | None: """Return True if the category is administrative overhead, False if not, None if debatable. """ if category in OVERHEAD_CATEGORIES: return True if category in NON_OVERHEAD_CATEGORIES: return False if category in DEBATABLE_CATEGORIES: return None return None