From 075b3f57240c58f752ebae6c8ba3ed0fc0d62efd Mon Sep 17 00:00:00 2001 From: William Hill Date: Tue, 24 Feb 2026 13:08:18 -0500 Subject: [PATCH 01/15] docs: design doc for self-service data upload (issue #86) --- .../2026-02-24-self-service-upload-design.md | 170 ++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 docs/plans/2026-02-24-self-service-upload-design.md diff --git a/docs/plans/2026-02-24-self-service-upload-design.md b/docs/plans/2026-02-24-self-service-upload-design.md new file mode 100644 index 0000000..9bf831b --- /dev/null +++ b/docs/plans/2026-02-24-self-service-upload-design.md @@ -0,0 +1,170 @@ +# Design: Self-Service Data Upload (Issue #86) + +**Date:** 2026-02-24 +**Author:** Claude Code + +--- + +## Overview + +Allow admin and IR users to upload institutional data files directly from the dashboard without +needing direct database or server access. Two upload paths: course enrollment CSVs (end-to-end +to Postgres) and PDP cohort/AR files (to Supabase Storage + GitHub Actions ML pipeline trigger). + +--- + +## Scope + +**In scope:** +- Course enrollment CSV → `course_enrollments` Postgres table (upsert) +- PDP Cohort CSV / PDP AR (.xlsx) → Supabase Storage + GitHub Actions `repository_dispatch` +- Preview step (first 10 rows + column validation) before commit +- Role guard: admin and ir only + +**Out of scope:** +- Upload history log (future issue) +- Column remapping UI (columns must match known schema) +- ML experiment tracking / MLflow (future issue) +- Auto-triggering ML pipeline without a server (GitHub Actions is the trigger mechanism) + +--- + +## Pages & Routing + +**New page:** `codebenders-dashboard/app/admin/upload/page.tsx` + +**Role guard:** Add to `lib/roles.ts` `ROUTE_PERMISSIONS`: +```ts +{ prefix: "/admin", roles: ["admin", "ir"] }, +{ prefix: "/api/admin", roles: ["admin", "ir"] }, +``` +Middleware already enforces this pattern via `x-user-role` header — no other auth code needed. 
+ +**Nav link:** Add "Upload Data" to `nav-header.tsx`, visible only to admin/ir roles. + +**New API routes:** +- `POST /api/admin/upload/preview` — parse first 10 rows, return sample + validation summary +- `POST /api/admin/upload/commit` — full ingest (course → Postgres; PDP/AR → Storage + Actions) + +--- + +## UI Flow (3 States) + +### State 1 — Select & Drop +- Dropdown: file type (`Course Enrollment CSV` | `PDP Cohort CSV` | `PDP AR File (.xlsx)`) +- Drag-and-drop zone (click to pick; `.csv` for course/cohort, `.csv`+`.xlsx` for AR) +- "Preview" button → calls `/api/admin/upload/preview` + +### State 2 — Preview +- Shows: detected file type, estimated row count, first 10 rows in a table +- Validation banner: lists missing required columns or warnings +- "Confirm & Upload" → calls `/api/admin/upload/commit` +- "Back" link to return to State 1 + +### State 3 — Result +- Course enrollments: `{ inserted, skipped, errors[] }` summary card +- PDP/AR: "File accepted — ML pipeline queued in GitHub Actions" + link to Actions run +- "Upload another file" resets to State 1 + +--- + +## API Routes + +### `POST /api/admin/upload/preview` + +**Input:** `multipart/form-data` with `file` and `fileType` fields + +**Logic:** +1. Parse first 50 rows with `csv-parse` (CSV) or `xlsx` (Excel) +2. Validate required columns exist for the given `fileType` +3. Return `{ columns, sampleRows (first 10), rowCount (estimated), warnings[] }` + +### `POST /api/admin/upload/commit` + +**Input:** Same multipart form + +**Course enrollment path:** +1. Stream-parse full CSV with `csv-parse` async iterator +2. Batch-upsert 500 rows at a time into `course_enrollments` via `pg` +3. Conflict target: `(student_guid, course_prefix, course_number, academic_term)` +4. Return `{ inserted, skipped, errors[] }` + +**PDP/AR path:** +1. Upload file to Supabase Storage bucket `pdp-uploads` via `@supabase/supabase-js` +2. 
Call GitHub API `POST /repos/{owner}/{repo}/dispatches` with: + ```json + { "event_type": "ml-pipeline", "client_payload": { "file_path": "" } } + ``` +3. Return `{ status: "processing", actionsUrl: "https://github.com/{owner}/{repo}/actions" }` + +**Role enforcement:** Read `x-user-role` header (set by middleware); return 403 if not admin/ir. + +--- + +## GitHub Actions Workflow + +**File:** `.github/workflows/ml-pipeline.yml` + +**Trigger:** `repository_dispatch` with `event_type: ml-pipeline` + +**Steps:** +1. Checkout repo +2. Set up Python with `venv` +3. Install dependencies (`pip install -r requirements.txt`) +4. Download uploaded file from Supabase Storage using `SUPABASE_SERVICE_KEY` secret +5. Run `venv/bin/python ai_model/complete_ml_pipeline.py --input ` +6. Upload `ML_PIPELINE_REPORT.txt` as a GitHub Actions artifact (retained 90 days) + +**Required secrets:** `SUPABASE_URL`, `SUPABASE_SERVICE_KEY`, `GITHUB_TOKEN` (auto-provided) + +--- + +## Required Column Schemas + +### Course Enrollment CSV +Must include: `student_guid`, `course_prefix`, `course_number`, `academic_year`, `academic_term` +Optional (all other `course_enrollments` columns): filled as NULL if absent + +### PDP Cohort CSV +Must include: `Institution_ID`, `Cohort`, `Student_GUID`, `Cohort_Term` + +### PDP AR File (.xlsx) +Must include: `Institution_ID`, `Cohort`, `Student_GUID` (first sheet parsed) + +--- + +## New Packages + +| Package | Purpose | +|---------|---------| +| `csv-parse` | Streaming CSV parsing (async iterator mode) | +| `xlsx` | Excel (.xlsx) parsing | + +--- + +## New Files + +| File | Purpose | +|------|---------| +| `codebenders-dashboard/app/admin/upload/page.tsx` | Upload UI page | +| `codebenders-dashboard/app/api/admin/upload/preview/route.ts` | Preview API route | +| `codebenders-dashboard/app/api/admin/upload/commit/route.ts` | Commit API route | +| `.github/workflows/ml-pipeline.yml` | GitHub Actions ML pipeline trigger | + +--- + +## Supabase Changes + 
+**Storage bucket:** Create `pdp-uploads` bucket (private, authenticated access only). +No new database migrations required — `course_enrollments` table already exists. + +**Bucket policy:** Only service role key can read/write. Signed URLs used for pipeline download. + +--- + +## Constraints & Known Limitations + +- ML pipeline trigger via GitHub Actions means a ~30-60s delay before the pipeline starts +- Vercel free tier has a 4.5 MB request body limit — large files should use Supabase Storage direct upload in a future iteration +- No upload history log in this version (deferred) +- Column remapping is out of scope — files must match the known schema From 184202eed8002761efb8416e0a3e16c8b0508733 Mon Sep 17 00:00:00 2001 From: William Hill Date: Tue, 24 Feb 2026 13:12:15 -0500 Subject: [PATCH 02/15] docs: implementation plan for self-service data upload (issue #86) --- docs/plans/2026-02-24-self-service-upload.md | 1135 ++++++++++++++++++ 1 file changed, 1135 insertions(+) create mode 100644 docs/plans/2026-02-24-self-service-upload.md diff --git a/docs/plans/2026-02-24-self-service-upload.md b/docs/plans/2026-02-24-self-service-upload.md new file mode 100644 index 0000000..2c34769 --- /dev/null +++ b/docs/plans/2026-02-24-self-service-upload.md @@ -0,0 +1,1135 @@ +# Self-Service Data Upload Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Add a `/admin/upload` page (admin/ir only) for uploading course enrollment CSVs directly into Postgres, and PDP cohort/AR files into Supabase Storage with automatic GitHub Actions ML pipeline triggering. + +**Architecture:** Single unified upload page with a 3-state UI (select → preview → result). Two API routes: `/api/admin/upload/preview` (parse first 10 rows, validate columns) and `/api/admin/upload/commit` (course CSV → Postgres batch-upsert; PDP/AR → Supabase Storage + `repository_dispatch` to GitHub Actions). 
No new DB migrations needed — `course_enrollments` table already exists. + +**Tech Stack:** Next.js 16 App Router, `csv-parse` (streaming CSV), `xlsx` (Excel), `@supabase/supabase-js` (Storage), `pg` (Postgres upsert), GitHub REST API (`repository_dispatch`), TypeScript, Tailwind CSS, shadcn/ui + +--- + +## Task 1: Install `csv-parse` and `xlsx` packages + +**Files:** +- Modify: `codebenders-dashboard/package.json` (via npm install) + +**Step 1: Install packages** + +```bash +cd codebenders-dashboard && npm install csv-parse xlsx +``` + +**Step 2: Verify they appear in `package.json` dependencies** + +```bash +grep -E '"csv-parse"|"xlsx"' package.json +``` + +Expected output: +``` + "csv-parse": "^5.x.x", + "xlsx": "^0.x.x", +``` + +**Step 3: Commit** + +```bash +git add codebenders-dashboard/package.json codebenders-dashboard/package-lock.json +git commit -m "chore: add csv-parse and xlsx packages for file upload" +``` + +--- + +## Task 2: Add role permissions and nav link + +**Files:** +- Modify: `codebenders-dashboard/lib/roles.ts:6-13` +- Modify: `codebenders-dashboard/components/nav-header.tsx:15-20` + +**Step 1: Add `/admin` routes to `ROUTE_PERMISSIONS` in `lib/roles.ts`** + +Open `codebenders-dashboard/lib/roles.ts`. After line 13 (`{ prefix: "/api/query-history/export", ... 
}`), add two new entries so the array looks like: + +```ts +export const ROUTE_PERMISSIONS: Array<{ prefix: string; roles: Role[] }> = [ + { prefix: "/students", roles: ["admin", "advisor", "ir"] }, + { prefix: "/courses", roles: ["admin", "advisor", "ir", "faculty"] }, + { prefix: "/query", roles: ["admin", "advisor", "ir", "faculty"] }, + { prefix: "/api/students", roles: ["admin", "advisor", "ir"] }, + { prefix: "/api/courses", roles: ["admin", "advisor", "ir", "faculty"] }, + { prefix: "/api/query-summary", roles: ["admin", "advisor", "ir", "faculty"] }, + { prefix: "/api/query-history/export", roles: ["admin", "ir"] }, + { prefix: "/admin", roles: ["admin", "ir"] }, + { prefix: "/api/admin", roles: ["admin", "ir"] }, +] +``` + +**Step 2: Add "Upload Data" nav link in `nav-header.tsx`** + +The `NavHeader` component already receives `role` as a prop. Replace the `NAV_LINKS` constant and its usage so the Upload link only renders for admin/ir: + +```tsx +const NAV_LINKS = [ + { href: "/", label: "Dashboard", roles: null }, + { href: "/courses", label: "Courses", roles: null }, + { href: "/students", label: "Students", roles: null }, + { href: "/query", label: "Query", roles: null }, + { href: "/admin/upload", label: "Upload Data", roles: ["admin", "ir"] as Role[] }, +] +``` + +Then update the `nav` block to filter on role: + +```tsx + +``` + +**Step 3: Type-check** + +```bash +cd codebenders-dashboard && npx tsc --noEmit +``` + +Expected: no errors. 
+ +**Step 4: Commit** + +```bash +git add codebenders-dashboard/lib/roles.ts codebenders-dashboard/components/nav-header.tsx +git commit -m "feat: add admin/ir role permissions and Upload Data nav link" +``` + +--- + +## Task 3: Add environment variables + +**Files:** +- Modify: `codebenders-dashboard/env.example` + +**Step 1: Add new env vars to `env.example`** + +Append to the bottom of `codebenders-dashboard/env.example`: + +```bash +# Supabase Storage (for PDP/AR file uploads — use the service role key, not anon) +# Find in Supabase → Project Settings → API → service_role key +SUPABASE_SERVICE_ROLE_KEY=your-service-role-key-here + +# GitHub Actions ML pipeline trigger +# Create a PAT at GitHub → Settings → Developer settings → Personal access tokens +# Required scope: repo (to trigger repository_dispatch) +GITHUB_PAT=ghp_your-personal-access-token-here +# Full repo path: owner/repo +GITHUB_REPO=devcolor/codebenders-datathon +``` + +**Step 2: Add the same vars to your local `.env.local`** + +Copy the three vars above into `codebenders-dashboard/.env.local` with real values. + +**Step 3: Commit** + +```bash +git add codebenders-dashboard/env.example +git commit -m "docs: add env vars for Supabase Storage and GitHub Actions pipeline trigger" +``` + +--- + +## Task 4: Create the preview API route + +**Files:** +- Create: `codebenders-dashboard/app/api/admin/upload/preview/route.ts` + +**Background:** This route accepts a `multipart/form-data` POST with two fields: +- `file` — the uploaded file (File object) +- `fileType` — one of `"course_enrollment"`, `"pdp_cohort"`, `"pdp_ar"` + +It parses the first 50 rows (or all rows if fewer), validates that required columns are present, and returns a preview payload. For `.xlsx` files, it reads the first sheet. For CSV, it uses `csv-parse`. 
+
+**Required columns per file type:**
+- `course_enrollment`: `Student_GUID`, `Course_Prefix`, `Course_Number`, `Academic_Year`, `Academic_Term`
+- `pdp_cohort`: `Institution_ID`, `Cohort`, `Student_GUID`, `Cohort_Term`
+- `pdp_ar`: `Institution_ID`, `Cohort`, `Student_GUID`
+
+**Step 1: Create the route file**
+
+Create `codebenders-dashboard/app/api/admin/upload/preview/route.ts` with this content:
+
+```typescript
+import { type NextRequest, NextResponse } from "next/server"
+import { parse } from "csv-parse/sync"
+import * as XLSX from "xlsx"
+
+const REQUIRED_COLUMNS: Record<string, string[]> = {
+  course_enrollment: ["Student_GUID", "Course_Prefix", "Course_Number", "Academic_Year", "Academic_Term"],
+  pdp_cohort: ["Institution_ID", "Cohort", "Student_GUID", "Cohort_Term"],
+  pdp_ar: ["Institution_ID", "Cohort", "Student_GUID"],
+}
+
+export async function POST(request: NextRequest) {
+  const role = request.headers.get("x-user-role")
+  if (role !== "admin" && role !== "ir") {
+    return NextResponse.json({ error: "Forbidden" }, { status: 403 })
+  }
+
+  let formData: FormData
+  try {
+    formData = await request.formData()
+  } catch {
+    return NextResponse.json({ error: "Invalid multipart form data" }, { status: 400 })
+  }
+
+  const file = formData.get("file") as File | null
+  const fileType = formData.get("fileType") as string | null
+
+  if (!file || !fileType) {
+    return NextResponse.json({ error: "Missing file or fileType" }, { status: 400 })
+  }
+  if (!REQUIRED_COLUMNS[fileType]) {
+    return NextResponse.json({ error: `Unknown fileType: ${fileType}` }, { status: 400 })
+  }
+
+  let rows: Record<string, string>[]
+
+  try {
+    const arrayBuf = await file.arrayBuffer()
+    const buffer = Buffer.from(arrayBuf)
+
+    if (file.name.endsWith(".xlsx")) {
+      const wb = XLSX.read(buffer, { type: "buffer" })
+      const ws = wb.Sheets[wb.SheetNames[0]]
+      rows = XLSX.utils.sheet_to_json<Record<string, string>>(ws, { defval: "" })
+    } else {
+      rows = parse(buffer, {
+        columns: true,
+        skip_empty_lines: true,
+        to: 50,
+        cast: false,
+      }) as Record<string, string>[]
+  } catch (err) {
+    return NextResponse.json(
+      { error: "Failed to parse file", details: err instanceof Error ? err.message : String(err) },
+      { status: 400 }
+    )
+  }
+
+  if (rows.length === 0) {
+    return NextResponse.json({ error: "File is empty" }, { status: 400 })
+  }
+
+  const columns = Object.keys(rows[0])
+  const required = REQUIRED_COLUMNS[fileType]
+  const missing = required.filter(col => !columns.includes(col))
+
+  const warnings: string[] = []
+  if (missing.length > 0) {
+    warnings.push(`Missing required columns: ${missing.join(", ")}`)
+  }
+
+  return NextResponse.json({
+    columns,
+    sampleRows: rows.slice(0, 10),
+    rowCount: rows.length, // parsed-row count (CSV is capped at 50 via `to`; .xlsx reads the full sheet)
+    warnings,
+  })
+}
+```
+
+**Step 2: Type-check**
+
+```bash
+cd codebenders-dashboard && npx tsc --noEmit
+```
+
+Expected: no errors.
+
+**Step 3: Smoke-test with curl** (while `npm run dev` is running)
+
+```bash
+curl -s -X POST http://localhost:3000/api/admin/upload/preview \
+  -H "x-user-role: admin" \
+  -F "fileType=course_enrollment" \
+  -F "file=@../data/bishop_state_courses.csv" | jq '{columns: .columns[:3], rowCount: .rowCount, warnings: .warnings}'
+```
+
+Expected: JSON with `columns` array, `rowCount: 50`, `warnings: []`
+
+**Step 4: Commit**
+
+```bash
+git add codebenders-dashboard/app/api/admin/upload/preview/route.ts
+git commit -m "feat: add POST /api/admin/upload/preview route"
+```
+
+---
+
+## Task 5: Create the commit route — course enrollment path
+
+**Files:**
+- Create: `codebenders-dashboard/app/api/admin/upload/commit/route.ts`
+
+**Background:** For `course_enrollment` file type, stream-parse the full CSV and batch-upsert rows into `public.course_enrollments` in chunks of 500. Use `pg`'s `getPool()` (already available in `lib/db.ts`). The upsert conflict target is `(student_guid, course_prefix, course_number, academic_term)` — you'll need to add a unique constraint migration or use a simpler strategy.
+ +Actually, since the existing load script uses TRUNCATE (not upsert), and there's no unique index on `course_enrollments`, we'll use the same approach: truncate + re-insert. This is idempotent and matches the existing pattern. + +**Column mapping** from CSV header names → DB column names (matches the existing load script at `scripts/load-course-enrollments.ts`): + +| CSV header | DB column | +|---|---| +| Student_GUID | student_guid | +| Cohort | cohort | +| Cohort_Term | cohort_term | +| Academic_Year | academic_year | +| Academic_Term | academic_term | +| Course_Prefix | course_prefix | +| Course_Number | course_number | +| Course_Name | course_name | +| Course_CIP | course_cip | +| Course_Type | course_type | +| Math_or_English_Gateway | gateway_type | +| Co_requisite_Course | is_co_requisite (Y/N → boolean) | +| Core_Course | is_core_course (Y/N → boolean) | +| Core_Course_Type | core_course_type | +| Delivery_Method | delivery_method | +| Grade | grade | +| Number_of_Credits_Attempted | credits_attempted | +| Number_of_Credits_Earned | credits_earned | +| Course_Instructor_Employment_Status | instructor_status | + +**Step 1: Create the commit route file (course enrollment path only)** + +Create `codebenders-dashboard/app/api/admin/upload/commit/route.ts`: + +```typescript +import { type NextRequest, NextResponse } from "next/server" +import { parse } from "csv-parse" +import { Readable } from "stream" +import { getPool } from "@/lib/db" + +const BATCH_SIZE = 500 + +function toBoolean(val: string): boolean | null { + if (val === "Y") return true + if (val === "N") return false + return null +} + +function toNumeric(val: string): number | null { + const t = val.trim() + if (!t || t === "null" || t === "NULL") return null + const n = parseFloat(t) + return isNaN(n) ? null : n +} + +function toNullable(val: string): string | null { + const t = val.trim() + return t === "" ? 
null : t +} + +interface EnrollmentRow { + student_guid: string + cohort: string | null + cohort_term: string | null + academic_year: string | null + academic_term: string | null + course_prefix: string | null + course_number: string | null + course_name: string | null + course_cip: string | null + course_type: string | null + gateway_type: string | null + is_co_requisite: boolean | null + is_core_course: boolean | null + core_course_type: string | null + delivery_method: string | null + grade: string | null + credits_attempted: number | null + credits_earned: number | null + instructor_status: string | null +} + +const COLS = [ + "student_guid", "cohort", "cohort_term", "academic_year", "academic_term", + "course_prefix", "course_number", "course_name", "course_cip", "course_type", + "gateway_type", "is_co_requisite", "is_core_course", "core_course_type", + "delivery_method", "grade", "credits_attempted", "credits_earned", "instructor_status", +] as const + +async function insertBatch(client: import("pg").PoolClient, batch: EnrollmentRow[]): Promise { + if (batch.length === 0) return + const placeholders: string[] = [] + const params: unknown[] = [] + batch.forEach((row, ri) => { + const p = COLS.map((_, ci) => `$${ri * COLS.length + ci + 1}`).join(", ") + placeholders.push(`(${p})`) + COLS.forEach(col => params.push(row[col])) + }) + await client.query( + `INSERT INTO public.course_enrollments (${COLS.join(", ")}) VALUES ${placeholders.join(", ")}`, + params + ) +} + +async function processCourseEnrollment(buffer: Buffer): Promise<{ inserted: number; skipped: number; errors: string[] }> { + const pool = getPool() + const client = await pool.connect() + let inserted = 0 + let skipped = 0 + const errors: string[] = [] + + try { + await client.query("BEGIN") + await client.query("TRUNCATE TABLE public.course_enrollments RESTART IDENTITY") + + const parser = Readable.from(buffer).pipe( + parse({ columns: true, skip_empty_lines: true }) + ) + + let batch: 
EnrollmentRow[] = [] + + for await (const record of parser) { + const r = record as Record + const student_guid = toNullable(r["Student_GUID"] ?? "") + if (!student_guid) { + skipped++ + continue + } + batch.push({ + student_guid, + cohort: toNullable(r["Cohort"] ?? ""), + cohort_term: toNullable(r["Cohort_Term"] ?? ""), + academic_year: toNullable(r["Academic_Year"] ?? ""), + academic_term: toNullable(r["Academic_Term"] ?? ""), + course_prefix: toNullable(r["Course_Prefix"] ?? ""), + course_number: toNullable(r["Course_Number"] ?? ""), + course_name: toNullable(r["Course_Name"] ?? ""), + course_cip: toNullable(r["Course_CIP"] ?? ""), + course_type: toNullable(r["Course_Type"] ?? ""), + gateway_type: toNullable(r["Math_or_English_Gateway"] ?? ""), + is_co_requisite: toBoolean(r["Co_requisite_Course"] ?? ""), + is_core_course: toBoolean(r["Core_Course"] ?? ""), + core_course_type: toNullable(r["Core_Course_Type"] ?? ""), + delivery_method: toNullable(r["Delivery_Method"] ?? ""), + grade: toNullable(r["Grade"] ?? ""), + credits_attempted: toNumeric(r["Number_of_Credits_Attempted"] ?? ""), + credits_earned: toNumeric(r["Number_of_Credits_Earned"] ?? ""), + instructor_status: toNullable(r["Course_Instructor_Employment_Status"] ?? ""), + }) + inserted++ + if (batch.length >= BATCH_SIZE) { + await insertBatch(client, batch) + batch = [] + } + } + + if (batch.length > 0) await insertBatch(client, batch) + await client.query("COMMIT") + } catch (err) { + await client.query("ROLLBACK") + errors.push(err instanceof Error ? 
err.message : String(err)) + inserted = 0 + } finally { + client.release() + } + + return { inserted, skipped, errors } +} + +export async function POST(request: NextRequest) { + const role = request.headers.get("x-user-role") + if (role !== "admin" && role !== "ir") { + return NextResponse.json({ error: "Forbidden" }, { status: 403 }) + } + + let formData: FormData + try { + formData = await request.formData() + } catch { + return NextResponse.json({ error: "Invalid multipart form data" }, { status: 400 }) + } + + const file = formData.get("file") as File | null + const fileType = formData.get("fileType") as string | null + + if (!file || !fileType) { + return NextResponse.json({ error: "Missing file or fileType" }, { status: 400 }) + } + + const buffer = Buffer.from(await file.arrayBuffer()) + + if (fileType === "course_enrollment") { + const result = await processCourseEnrollment(buffer) + return NextResponse.json(result) + } + + // PDP/AR path — placeholder, implemented in Task 6 + return NextResponse.json({ error: `fileType "${fileType}" not yet implemented` }, { status: 501 }) +} +``` + +**Step 2: Type-check** + +```bash +cd codebenders-dashboard && npx tsc --noEmit +``` + +Expected: no errors. + +**Step 3: Smoke-test with curl** (while `npm run dev` is running) + +```bash +curl -s -X POST http://localhost:3000/api/admin/upload/commit \ + -H "x-user-role: admin" \ + -F "fileType=course_enrollment" \ + -F "file=@../data/bishop_state_courses.csv" | jq . 
+```
+
+Expected: `{"inserted": <row-count>, "skipped": 0, "errors": []}`
+
+**Step 4: Commit**
+
+```bash
+git add codebenders-dashboard/app/api/admin/upload/commit/route.ts
+git commit -m "feat: add POST /api/admin/upload/commit — course enrollment truncate-and-reload path"
+```
+
+---
+
+## Task 6: Extend commit route — PDP/AR path (Supabase Storage + GitHub dispatch)
+
+**Files:**
+- Modify: `codebenders-dashboard/app/api/admin/upload/commit/route.ts`
+
+**Background:** For `pdp_cohort` and `pdp_ar` file types, the commit route:
+1. Creates a Supabase service-role client (uses `SUPABASE_SERVICE_ROLE_KEY`)
+2. Uploads the file to the `pdp-uploads` Storage bucket with path `<file-type>/<timestamp>-<file-name>`
+3. Calls the GitHub `repository_dispatch` API with `GITHUB_PAT` and `GITHUB_REPO` env vars
+4. Returns `{ status: "processing", storageKey, actionsUrl }`
+
+**Before this task:** Create the `pdp-uploads` bucket in your Supabase dashboard:
+- Supabase → Storage → New bucket → name: `pdp-uploads` → Private
+
+**Step 1: Add the PDP/AR handler to the commit route**
+
+In `codebenders-dashboard/app/api/admin/upload/commit/route.ts`, add these imports at the top:
+
+```typescript
+import { createClient } from "@supabase/supabase-js"
+```
+
+Add this function before the `POST` handler:
+
+```typescript
+async function processPdpFile(
+  buffer: Buffer,
+  fileName: string,
+  fileType: string,
+): Promise<{ status: string; storageKey: string; actionsUrl: string }> {
+  const supabaseUrl = process.env.NEXT_PUBLIC_SUPABASE_URL
+  const serviceKey = process.env.SUPABASE_SERVICE_ROLE_KEY
+  const githubPat = process.env.GITHUB_PAT
+  const githubRepo = process.env.GITHUB_REPO
+
+  if (!supabaseUrl || !serviceKey) throw new Error("Missing NEXT_PUBLIC_SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY")
+  if (!githubPat || !githubRepo) throw new Error("Missing GITHUB_PAT or GITHUB_REPO")
+
+  // 1. Upload to Supabase Storage
+  const supabase = createClient(supabaseUrl, serviceKey)
+  const storageKey = `${fileType}/${Date.now()}-${fileName}`
+  const { error: uploadError } = await supabase.storage
+    .from("pdp-uploads")
+    .upload(storageKey, buffer, { contentType: "application/octet-stream", upsert: false })
+
+  if (uploadError) throw new Error(`Storage upload failed: ${uploadError.message}`)
+
+  // 2. Trigger GitHub Actions via repository_dispatch
+  const dispatchRes = await fetch(
+    `https://api.github.com/repos/${githubRepo}/dispatches`,
+    {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${githubPat}`,
+        Accept: "application/vnd.github+json",
+        "X-GitHub-Api-Version": "2022-11-28",
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        event_type: "ml-pipeline",
+        client_payload: { storage_key: storageKey, file_type: fileType },
+      }),
+    }
+  )
+
+  if (!dispatchRes.ok) {
+    const body = await dispatchRes.text()
+    throw new Error(`GitHub dispatch failed (${dispatchRes.status}): ${body}`)
+  }
+
+  const actionsUrl = `https://github.com/${githubRepo}/actions`
+  return { status: "processing", storageKey, actionsUrl }
+}
+```
+
+Replace the placeholder in the `POST` handler at the bottom:
+
+```typescript
+  if (fileType === "pdp_cohort" || fileType === "pdp_ar") {
+    try {
+      const result = await processPdpFile(buffer, file.name, fileType)
+      return NextResponse.json(result)
+    } catch (err) {
+      return NextResponse.json(
+        { error: err instanceof Error ? err.message : String(err) },
+        { status: 500 }
+      )
+    }
+  }
+
+  return NextResponse.json({ error: `Unknown fileType: ${fileType}` }, { status: 400 })
+```
+
+**Step 2: Type-check**
+
+```bash
+cd codebenders-dashboard && npx tsc --noEmit
+```
+
+Expected: no errors.
+ +**Step 3: Commit** + +```bash +git add codebenders-dashboard/app/api/admin/upload/commit/route.ts +git commit -m "feat: extend commit route with PDP/AR → Supabase Storage + GitHub Actions dispatch" +``` + +--- + +## Task 7: Create GitHub Actions ML pipeline workflow + +**Files:** +- Create: `.github/workflows/ml-pipeline.yml` + +**Background:** This workflow fires on `repository_dispatch` with `event_type: ml-pipeline`. It: +1. Downloads the uploaded file from Supabase Storage using a signed URL +2. Determines the target data file path from `file_type` in the payload +3. Replaces the appropriate file in `data/` with the uploaded one +4. Runs the Python ML pipeline +5. Uploads `ML_PIPELINE_REPORT.txt` as an artifact + +**Required GitHub Actions secrets** (set at repo level: Settings → Secrets → Actions): +- `SUPABASE_URL` — your Supabase project URL +- `SUPABASE_SERVICE_ROLE_KEY` — service role key for Storage access +- `DB_HOST`, `DB_USER`, `DB_PASSWORD`, `DB_PORT`, `DB_NAME`, `DB_SSL` — Postgres credentials + +**Step 1: Create the workflow file** + +Create `.github/workflows/ml-pipeline.yml`: + +```yaml +name: ML Pipeline + +on: + repository_dispatch: + types: [ml-pipeline] + +jobs: + run-pipeline: + name: Download data file and run ML pipeline + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Create virtualenv and install dependencies + run: | + python -m venv venv + venv/bin/pip install --upgrade pip + venv/bin/pip install -r requirements.txt + + - name: Download uploaded file from Supabase Storage + env: + SUPABASE_URL: ${{ secrets.SUPABASE_URL }} + SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }} + STORAGE_KEY: ${{ github.event.client_payload.storage_key }} + FILE_TYPE: ${{ github.event.client_payload.file_type }} + run: | + python - <<'EOF' + import os, urllib.request, json + + url = os.environ["SUPABASE_URL"] + key 
= os.environ["SUPABASE_SERVICE_ROLE_KEY"] + storage_key = os.environ["STORAGE_KEY"] + file_type = os.environ["FILE_TYPE"] + + # Get a signed download URL via Supabase Storage REST API + sign_url = f"{url}/storage/v1/object/sign/pdp-uploads/{storage_key}" + req = urllib.request.Request( + sign_url, + data=json.dumps({"expiresIn": 600}).encode(), + headers={ + "Authorization": f"Bearer {key}", + "Content-Type": "application/json", + "apikey": key, + }, + method="POST", + ) + with urllib.request.urlopen(req) as resp: + signed = json.loads(resp.read()) + signed_url = f"{url}/storage/v1{signed['signedURL']}" + + # Determine destination path + dest = { + "pdp_cohort": "data/bishop_state_cohorts_with_zip.csv", + "pdp_ar": "data/ar_bscc_with_zip.csv", + }.get(file_type) + if not dest: + raise ValueError(f"Unknown file_type: {file_type}") + + print(f"Downloading to {dest}...") + urllib.request.urlretrieve(signed_url, dest) + print("Download complete.") + EOF + + - name: Run ML pipeline + env: + DB_HOST: ${{ secrets.DB_HOST }} + DB_USER: ${{ secrets.DB_USER }} + DB_PASSWORD: ${{ secrets.DB_PASSWORD }} + DB_PORT: ${{ secrets.DB_PORT }} + DB_NAME: ${{ secrets.DB_NAME }} + DB_SSL: ${{ secrets.DB_SSL }} + run: | + venv/bin/python ai_model/complete_ml_pipeline.py + + - name: Upload ML pipeline report + uses: actions/upload-artifact@v4 + if: always() + with: + name: ml-pipeline-report-${{ github.run_id }} + path: ML_PIPELINE_REPORT.txt + retention-days: 90 +``` + +**Step 2: Commit** + +```bash +git add .github/workflows/ml-pipeline.yml +git commit -m "feat: add GitHub Actions ML pipeline workflow triggered by repository_dispatch" +``` + +--- + +## Task 8: Create the upload page UI + +**Files:** +- Create: `codebenders-dashboard/app/admin/upload/page.tsx` + +**Background:** This is a client component (`"use client"`) with three local state phases: `idle` (file selection), `preview` (showing sample rows + warnings), and `result` (showing outcome). 
It uses `fetch` to call the two API routes. Drag-and-drop is implemented with native HTML5 `onDrop` / `onDragOver` events. + +**Step 1: Create the page file** + +Create `codebenders-dashboard/app/admin/upload/page.tsx`: + +```tsx +"use client" + +import { useState, useCallback } from "react" +import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card" +import { Button } from "@/components/ui/button" +import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table" +import { Upload, AlertCircle, CheckCircle2, Loader2 } from "lucide-react" + +type FileType = "course_enrollment" | "pdp_cohort" | "pdp_ar" +type Phase = "idle" | "previewing" | "preview" | "committing" | "result" + +interface PreviewData { + columns: string[] + sampleRows: Record[] + rowCount: number + warnings: string[] +} + +interface CommitResult { + // Course enrollment + inserted?: number + skipped?: number + errors?: string[] + // PDP/AR + status?: string + storageKey?: string + actionsUrl?: string + error?: string +} + +const FILE_TYPE_LABELS: Record = { + course_enrollment: "Course Enrollment CSV", + pdp_cohort: "PDP Cohort CSV", + pdp_ar: "PDP AR File (.xlsx)", +} + +const FILE_TYPE_ACCEPT: Record = { + course_enrollment: ".csv", + pdp_cohort: ".csv", + pdp_ar: ".csv,.xlsx", +} + +export default function UploadPage() { + const [fileType, setFileType] = useState("course_enrollment") + const [file, setFile] = useState(null) + const [phase, setPhase] = useState("idle") + const [preview, setPreview] = useState(null) + const [result, setResult] = useState(null) + const [dragOver, setDragOver] = useState(false) + const [errorMsg, setErrorMsg] = useState(null) + + const handleFile = useCallback((f: File) => { + setFile(f) + setErrorMsg(null) + setPhase("idle") + setPreview(null) + setResult(null) + }, []) + + const handleDrop = useCallback((e: React.DragEvent) => { + e.preventDefault() + setDragOver(false) + const dropped = 
e.dataTransfer.files[0] + if (dropped) handleFile(dropped) + }, [handleFile]) + + const handlePreview = async () => { + if (!file) return + setPhase("previewing") + setErrorMsg(null) + const fd = new FormData() + fd.append("file", file) + fd.append("fileType", fileType) + try { + const res = await fetch("/api/admin/upload/preview", { method: "POST", body: fd }) + const data = await res.json() + if (!res.ok) { setErrorMsg(data.error ?? "Preview failed"); setPhase("idle"); return } + setPreview(data as PreviewData) + setPhase("preview") + } catch (err) { + setErrorMsg(err instanceof Error ? err.message : "Network error") + setPhase("idle") + } + } + + const handleCommit = async () => { + if (!file) return + setPhase("committing") + setErrorMsg(null) + const fd = new FormData() + fd.append("file", file) + fd.append("fileType", fileType) + try { + const res = await fetch("/api/admin/upload/commit", { method: "POST", body: fd }) + const data = await res.json() + if (!res.ok) { setErrorMsg(data.error ?? "Upload failed"); setPhase("preview"); return } + setResult(data as CommitResult) + setPhase("result") + } catch (err) { + setErrorMsg(err instanceof Error ? err.message : "Network error") + setPhase("preview") + } + } + + const reset = () => { + setFile(null) + setPhase("idle") + setPreview(null) + setResult(null) + setErrorMsg(null) + } + + return ( +
+
+

Upload Data

+

+ Import course enrollment CSVs or PDP/AR files. Admin and IR only. +

+
+ + {/* ── Phase: idle / selecting ── */} + {(phase === "idle" || phase === "previewing") && ( + + + Select File + Choose a file type, then drop or pick your file. + + + {/* File type selector */} +
+ {(Object.keys(FILE_TYPE_LABELS) as FileType[]).map(ft => ( + + ))} +
+ + {/* Drop zone */} + + + {errorMsg && ( +
+ + {errorMsg} +
+ )} + + +
+
+ )} + + {/* ── Phase: preview ── */} + {(phase === "preview" || phase === "committing") && preview && ( + + + Preview — {FILE_TYPE_LABELS[fileType]} + + {file?.name} · {preview.rowCount} rows parsed + + + + {preview.warnings.length > 0 && ( +
+ {preview.warnings.map((w, i) => ( +
+ + {w} +
+ ))} +
+ )} + +
+ + + + {preview.columns.slice(0, 8).map(col => ( + {col} + ))} + {preview.columns.length > 8 && +{preview.columns.length - 8} more} + + + + {preview.sampleRows.map((row, i) => ( + + {preview.columns.slice(0, 8).map(col => ( + {String(row[col] ?? "")} + ))} + {preview.columns.length > 8 && } + + ))} + +
+
+ + {errorMsg && ( +
+ + {errorMsg} +
+ )} + +
+ + +
+
+
+ )} + + {/* ── Phase: result ── */} + {phase === "result" && result && ( + + + + + Upload Complete + + + + {result.inserted !== undefined && ( +
+

{result.inserted.toLocaleString()} rows inserted

+ {(result.skipped ?? 0) > 0 &&

{result.skipped} rows skipped (missing Student_GUID)

} + {result.errors && result.errors.length > 0 && ( +
+ {result.errors.map((e, i) =>

{e}

)} +
+ )} +
+ )} + {result.status === "processing" && ( +
+

File saved to Supabase Storage. The ML pipeline has been queued in GitHub Actions.

+ {result.actionsUrl && ( + + View pipeline run on GitHub Actions → + + )} +
+ )} + {result.error && ( +
+ {result.error} +
+ )} + +
+
+ )} +
+ ) +} +``` + +**Step 2: Type-check** + +```bash +cd codebenders-dashboard && npx tsc --noEmit +``` + +Expected: no errors. + +**Step 3: Visual check** (while `npm run dev` is running) + +- Log in as an admin or IR user +- Navigate to `/admin/upload` +- Verify "Upload Data" appears in the nav +- Try dragging and dropping `data/bishop_state_courses.csv` +- Verify the preview table shows first 10 rows +- Verify "Confirm & Upload" runs and returns a result + +**Step 4: Commit** + +```bash +git add codebenders-dashboard/app/admin/upload/page.tsx +git commit -m "feat: add /admin/upload page with drag-drop, preview, and commit UI" +``` + +--- + +## Task 9: Final type-check, lint, and push + +**Step 1: Full type-check + lint** + +```bash +cd codebenders-dashboard && npx tsc --noEmit && npm run lint +``` + +Expected: 0 errors, 0 warnings (or only pre-existing warnings). + +**Step 2: Push and open PR** + +```bash +git push origin +gh pr create \ + --title "feat: self-service data upload for course and PDP/AR files (#86)" \ + --body "Closes #86 + +## Summary +- \`/admin/upload\` page (admin/ir only) with drag-drop, preview, and commit +- Course enrollment CSVs stream-parsed and batch-upserted into \`course_enrollments\` Postgres table +- PDP cohort CSVs and AR .xlsx files uploaded to Supabase Storage \`pdp-uploads\` bucket +- GitHub Actions workflow \`ml-pipeline.yml\` triggered via \`repository_dispatch\` after PDP/AR upload + +## New env vars required (see env.example) +- \`SUPABASE_SERVICE_ROLE_KEY\` +- \`GITHUB_PAT\` +- \`GITHUB_REPO\` + +## GitHub Actions secrets required +- \`SUPABASE_URL\`, \`SUPABASE_SERVICE_ROLE_KEY\`, \`DB_HOST\`, \`DB_USER\`, \`DB_PASSWORD\`, \`DB_PORT\`, \`DB_NAME\`, \`DB_SSL\` + +## Test plan +- [ ] Admin/IR can access \`/admin/upload\`; other roles get redirected +- [ ] Upload Data nav link visible to admin/IR only +- [ ] Course enrollment CSV preview shows first 10 rows with correct columns +- [ ] Course enrollment commit inserts rows into 
\`course_enrollments\` table +- [ ] PDP cohort CSV commit uploads to Supabase Storage and returns \`status: processing\` +- [ ] \`npx tsc --noEmit\` passes with 0 errors +" +``` From cb20f2d6ec0c2de3a0c48fc8d7d203c3f6898a6a Mon Sep 17 00:00:00 2001 From: William Hill Date: Fri, 27 Mar 2026 21:13:41 -0400 Subject: [PATCH 03/15] docs: design spec for config-driven distillation pipeline Per-school fine-tuning pipeline to replace OpenAI dependency for explanation and summarization endpoints with locally-served Qwen 3.5 models via MLX and Ollama. --- ...2026-03-27-distillation-pipeline-design.md | 550 ++++++++++++++++++ 1 file changed, 550 insertions(+) create mode 100644 docs/superpowers/specs/2026-03-27-distillation-pipeline-design.md diff --git a/docs/superpowers/specs/2026-03-27-distillation-pipeline-design.md b/docs/superpowers/specs/2026-03-27-distillation-pipeline-design.md new file mode 100644 index 0000000..761c553 --- /dev/null +++ b/docs/superpowers/specs/2026-03-27-distillation-pipeline-design.md @@ -0,0 +1,550 @@ +# Config-Driven Distillation Pipeline for Per-School Fine-Tuned Models + +**Date:** 2026-03-27 +**Status:** Draft +**Goal:** Replace OpenAI API dependency for explanation and summarization endpoints with locally-served, per-school fine-tuned models via a repeatable, config-driven training pipeline. + +--- + +## 1. Problem + +The dashboard currently calls OpenAI GPT-4o-mini in two explanation/summarization endpoints: + +- `/api/courses/explain-pairing` — course pairing explanations +- `/api/query-summary` — query result summaries + +This creates per-call API costs, latency, and a dependency on an external service. The explanations are also generic — they lack institutional context about each school's programs, challenges, demographics, and interventions. + +## 2. Solution + +A config-driven distillation pipeline that: + +1. Takes a per-school YAML config describing the school's schema, domain knowledge, and context +2. 
Uses a teacher model (Claude Sonnet or Qwen 3.5 locally) to generate high-quality training pairs +3. Fine-tunes a small open-source model (Qwen 3.5 4B or 9B) via MLX on Apple Silicon +4. Evaluates the model against ship criteria +5. Exports to Ollama for local serving + +New school = new config file + run the pipeline. No code changes needed. + +## 3. Architecture + +### Directory Structure + +``` +schools/ + bishop-state/ + config.yaml # Schema, domain knowledge, explanation style + seed_queries.yaml # Example questions users ask at this school + akron/ + config.yaml + seed_queries.yaml + +training/ + distill.py # Step 1: Generate training pairs via teacher model + prepare.py # Step 2: Filter, dedup, split (80/10/10) + finetune.py # Step 3: Fine-tune via MLX (Qwen 3.5) + eval.py # Step 4: Evaluate model quality + export.py # Step 5: Package for Ollama + config.py # Shared constants + prompts.py # Teacher prompts (school-agnostic templates) + +training_data/ + bishop-state/ + pairs/ # Raw distilled pairs (explainer.jsonl, summarizer.jsonl) + final/ # Train/val/test splits per adapter + models/ # Fine-tuned LoRA adapters + qwen3.5-9b/ + explainer/ + adapter_config.json + adapter_model.safetensors + summarizer/ + adapter_config.json + adapter_model.safetensors +``` + +### CLI + +```bash +python -m training.distill --school bishop-state [--local] # Generate pairs +python -m training.prepare --school bishop-state # Filter/split +python -m training.finetune --school bishop-state --model 9b # Train +python -m training.eval --school bishop-state # Evaluate +python -m training.export --school bishop-state # Deploy to Ollama +``` + +## 4. School Config Format + +Each school gets a `config.yaml` capturing everything the pipeline needs. 
Sections: + +### Core Identity + +```yaml +school: + name: "Bishop State Community College" + code: "bscc" + type: "community_college" + designation: ["hbcu", "minority_serving"] + accreditation: "SACSCOC" + founded: 1927 +``` + +### Location and Setting + +```yaml + location: + address: "351 North Broad Street" + city: "Mobile" + state: "Alabama" + zip: "36603" + county: "Mobile County" + region: "Gulf Coast" + setting: "urban" + climate_zone: "subtropical" +``` + +### Enrollment Profile + +```yaml + enrollment: + total_headcount: 4200 + fte: 2800 + undergraduate_only: true + residential: false + percent_full_time: 0.42 + percent_part_time: 0.58 + percent_online: 0.35 + open_admission: true +``` + +### Demographics + +```yaml + demographics: + percent_black: 0.72 + percent_white: 0.18 + percent_hispanic: 0.05 + percent_other: 0.05 + percent_pell_eligible: 0.68 + percent_first_gen: 0.55 + percent_adult_learners: 0.40 + median_household_income_area: 42000 +``` + +### Database Schema + +```yaml +database: + main_table: "student_level_with_predictions" + course_table: "course_enrollments" + connection_env: "DATABASE_URL" + +schema: + student_columns: + Cohort: "Cohort year (numeric: 2019, 2020, etc.)" + Race: "Student race/ethnicity" + Gender: "Student gender" + Retention: "Retention indicator (0 or 1)" + GPA_Group_Year_1: "GPA in year 1" + # ... full column list from route.ts SCHEMA_INFO + course_columns: + course_prefix: "Course dept code (MAT, ENG, NUR, etc.)" + grade: "Student grade (A, B, C, D, F, W, I)" + # ... 
full column list +``` + +### Domain Knowledge + +```yaml +domain: + programs: + - name: "Nursing (ADN)" + cip: "51.3801" + gateway_courses: ["BIO 201", "MAT 110"] + - name: "Welding Technology" + cip: "48.0508" + gateway_courses: ["WDT 108", "WDT 109"] + key_metrics: ["retention_rate", "dfwi_rate", "gateway_pass_rate"] + terminology: + credential: "associate degree or certificate" + at_risk: "students flagged by early warning system" +``` + +### Workforce and Outcomes + +```yaml + workforce: + top_employers: ["Austal USA", "Mobile Infirmary", "AM/NS Calvert"] + high_demand_fields: ["healthcare", "advanced_manufacturing", "maritime"] + + outcomes: + job_placement_rate_6mo: 0.78 + median_salary_after_credential: + associate: 34000 + certificate: 29000 + licensure_pass_rates: + nursing_nclex: 0.89 + welding_aws: 0.92 +``` + +### Peer Context + +```yaml + peers: + ipeds_id: "101505" + carnegie_class: "Associate's—High Transfer-High Traditional" + peer_institutions: ["Lawson State CC", "Shelton State CC"] + state_system: "Alabama Community College System" +``` + +### Financial Context + +```yaml + financial: + in_state_tuition: 4800 + avg_financial_aid_package: 5200 + percent_receiving_aid: 0.82 + percent_student_loans: 0.25 + cost_of_living_index: 87.3 + emergency_aid_fund: true +``` + +### Completion Context + +```yaml + completion: + ipeds_graduation_rate: 0.18 + adjusted_completion_rate: 0.42 + avg_time_to_credential: 3.2 + percent_transfer_out: 0.24 + percent_stop_out_return: 0.15 + top_completion_barriers: + - "developmental_math_sequences" + - "financial_emergencies" + - "work_schedule_conflicts" +``` + +### Faculty and Instruction + +```yaml + instruction: + student_faculty_ratio: 18 + percent_full_time_faculty: 0.45 + percent_adjunct: 0.55 + developmental_ed_model: "corequisite" +``` + +### Student Pipeline + +```yaml + pipeline: + feeder_high_schools: + - name: "Williamson High School" + percent_of_enrollment: 0.12 + avg_readiness: "below_college_level" + 
percent_ged: 0.11 + percent_veterans: 0.07 + percent_career_changers: 0.14 + primary_recruitment_radius_miles: 35 +``` + +### Digital Access + +```yaml + technology: + percent_students_with_reliable_wifi: 0.71 + percent_students_with_personal_laptop: 0.64 + campus_device_lending: true + broadband_desert_overlap: true +``` + +### Transportation and Access + +```yaml + access: + campus_count: 4 + campuses: + - name: "Main Campus" + public_transit_accessible: true + - name: "Southwest Campus" + public_transit_accessible: false + percent_students_commute_30_plus_min: 0.35 + evening_weekend_classes: true +``` + +### Equity Gaps and Initiatives + +```yaml + equity: + known_gaps: + - metric: "gateway_math_pass_rate" + group_a: { name: "Black male students", value: 0.41 } + group_b: { name: "Overall", value: 0.58 } + initiative: "Male Student Success mentoring program" + minority_male_initiative: "Brother 2 Brother" +``` + +### Active Interventions + +```yaml + interventions: + active: + - name: "Starfish Early Alert" + type: "early_warning" + target: "all students" + trigger: "missed 2+ classes or below C at midterm" + effectiveness: "12% retention lift in pilot cohorts" + - name: "Emergency Micro-Grants" + type: "financial" + max_award: 500 + effectiveness: "78% of recipients re-enrolled next term" +``` + +### Student Life + +```yaml + student_life: + percent_working_while_enrolled: 0.72 + percent_working_over_20hrs: 0.48 + percent_single_parents: 0.18 + food_insecurity_rate: 0.31 + housing_insecurity_rate: 0.14 +``` + +### Community Health Context + +```yaml + health: + mental_health_counselor_ratio: "1:1400" + community_health_context: + - "Mobile County has highest diabetes rate in Alabama" + - "Limited mental health providers in service area" +``` + +### Seasonal Patterns + +```yaml + patterns: + high_attrition_points: + - week: 4 + reason: "Financial aid disbursement delays" + - week: 8 + reason: "Midterm performance shock" + - month: "October" + reason: "Hurricane 
season peak" + summer_melt_rate: 0.22 +``` + +### Historical Trends + +```yaml + trends: + enrollment_direction: "declining" + enrollment_5yr_change: -0.12 + completion_direction: "improving" + notable_changes: + - year: 2022 + event: "Switched to corequisite math model" + - year: 2023 + event: "Launched early alert system with ML predictions" +``` + +### Institutional Priorities + +```yaml + priorities: + strategic_plan_years: "2024-2029" + top_goals: + - "Increase fall-to-fall retention from 42% to 55%" + - "Launch 3 new short-term workforce certificates" + - "Close equity gap in gateway math by 50%" + accreditation_qep_topic: "Guided Pathways implementation" + grant_funded_initiatives: + - name: "Title III Strengthening Institutions" + focus: "Student support services and advising redesign" + end_date: "2027-09-30" +``` + +### Data Quality Notes + +```yaml + data_caveats: + - "Pre-2020 cohorts lack online/hybrid delivery classification" + - "Race/ethnicity is self-reported; 6% of records are 'Unknown'" + - "Transfer-out data relies on NSC match — ~85% match rate" +``` + +### Distillation and Training Config + +```yaml +distillation: + teacher_model: "claude-sonnet-4-20250514" + teacher_backend: "anthropic" + local_teacher_model: "qwen3.5:27b" + local_teacher_backend: "ollama" + pairs_per_task: 1500 + +training: + default_model: "qwen3.5:9b" + fallback_model: "qwen3.5:4b" + method: "qlora" + quantization: 4 + lora_rank: 16 + lora_alpha: 32 + epochs: 3 + learning_rate: 1.0e-4 + batch_size: 4 + warmup_steps: 100 + eval_every: 50 + early_stopping_patience: 3 +``` + +## 5. 
Distillation — Teacher Prompts and Pair Generation + +### Two Adapters + +| Adapter | Replaces | Input | Output | +|---------|----------|-------|--------| +| **Explainer** | `/api/courses/explain-pairing` | Course pairing data | Structured explanation JSON | +| **Summarizer** | `/api/query-summary` | Query + result rows | Structured summary JSON | + +### Teacher Prompt Strategy + +**Explainer teacher prompt:** + +The teacher model receives the full institutional context from config.yaml plus the course pairing data, and generates: + +```json +{ + "explanation": "2-3 sentence plain-language explanation", + "structural_factors": ["institutional/systemic factors"], + "student_impact": "what this means for students", + "advisor_recommendation": "actionable next step", + "data_limitations": ["caveats about this data"], + "related_intervention": "existing program that addresses this, or null" +} +``` + +**Summarizer teacher prompt:** + +The teacher receives institutional context plus the original query and SQL result rows, and generates: + +```json +{ + "summary": "2-3 sentence headline finding", + "key_insights": ["notable patterns"], + "context": "how this connects to institutional priorities or known challenges", + "action_items": ["what someone should do with this information"], + "caveats": ["data limitations relevant to this query"] +} +``` + +**Student prompts** (what the fine-tuned model sees at inference) are minimal — just the data input. All institutional context is baked into the weights during training. + +### Dual Teacher Support + +- **`--local` flag:** Uses Qwen 3.5 27B via Ollama for free iteration and pipeline testing +- **Default:** Uses Claude Sonnet via Anthropic API for production-quality training data + +### Seed Data Sources + +1. **Database-driven (500 pairs per adapter):** Query the school's actual data for real course pairings and result sets +2. 
**Template-driven (500 pairs per adapter):** From `seed_queries.yaml` with school-specific examples +3. **Synthetic variation (500 pairs per adapter):** Pipeline varies dimensions (cohorts, programs, demographics) to reach 1,500 pairs per adapter + +**Total per school:** 3,000 training pairs. Distillation cost via Claude Sonnet: ~$15-25. + +## 6. Fine-Tuning + +### Method + +QLoRA via Apple MLX framework on Apple Silicon Macs. + +- Base model: Qwen 3.5 9B (default) or 4B (lightweight) +- 4-bit quantized base, trainable low-rank adapters +- Two separate adapters per school (explainer + summarizer) on the same base model + +### Hardware Requirements + +| Model | Training | Inference | +|-------|----------|-----------| +| Qwen 3.5 9B | 24GB+ RAM (M-series Mac) | 8GB+ RAM (Q4 via Ollama) | +| Qwen 3.5 4B | 16GB+ RAM (M-series Mac) | 4GB+ RAM (Q4 via Ollama) | + +### Training Time Estimates (3,000 examples, 3 epochs) + +| Model | 18GB Mac (M3 Pro) | 36GB Mac (M3 Pro) | +|-------|-------------------|-------------------| +| Qwen 3.5 4B | ~2-4 hrs | ~1.5-3 hrs | +| Qwen 3.5 9B | Tight, not recommended | ~3-5 hrs | + +## 7. Evaluation + +### Ship Criteria + +| Metric | What It Checks | Threshold | +|--------|---------------|-----------| +| JSON validity | Output parses as valid JSON | >= 95% | +| Schema adherence | All required keys present, correct types | >= 90% | +| Explanation quality | ROUGE-L against teacher outputs | >= 0.35 | +| Factual grounding | Mentions data values from input, not hallucinated | >= 85% | +| Actionability | Recommendations are non-generic | >= 80% | +| Caveat inclusion | Data limitations populated | >= 90% | + +Pipeline refuses to export a model that fails any threshold. + +## 8. 
Deployment + +### Export to Ollama + +```bash +python -m training.export --school bishop-state +# Registers: +# bishop-state-explainer:9b +# bishop-state-summarizer:9b +``` + +### Dashboard Integration + +A thin adapter layer in `lib/model-client.ts` routes to the appropriate backend: + +``` +MODEL_BACKEND=ollama → local fine-tuned model via Ollama +MODEL_BACKEND=openai → fallback to OpenAI GPT-4o-mini +SCHOOL_CODE=bishop-state +``` + +Routes affected: + +| Route | Current | After | +|-------|---------|-------| +| `/api/courses/explain-pairing` | OpenAI GPT-4o-mini | `bishop-state-explainer:9b` via Ollama | +| `/api/query-summary` | OpenAI GPT-4o-mini | `bishop-state-summarizer:9b` via Ollama | +| `/api/analyze` | OpenAI GPT-4o-mini | No change (future adapter) | + +## 9. Onboarding a New School + +1. Create `schools/{school-code}/config.yaml` — fill in institutional context +2. Create `schools/{school-code}/seed_queries.yaml` — 20-50 example questions +3. Run the pipeline: + ```bash + python -m training.distill --school {school-code} [--local] + python -m training.prepare --school {school-code} + python -m training.finetune --school {school-code} --model 9b + python -m training.eval --school {school-code} + python -m training.export --school {school-code} + ``` +4. Set env vars: `MODEL_BACKEND=ollama SCHOOL_CODE={school-code}` +5. Deploy dashboard + +## 10. Cost Summary + +| Item | Per School | One-Time | +|------|-----------|----------| +| Distillation (Claude Sonnet) | $15-25 | - | +| Distillation (local Qwen) | $0 | - | +| Fine-tuning (MLX on Mac) | $0 (electricity) | - | +| Inference (Ollama) | $0 | - | +| Base model download | - | ~6GB (cached) | + +**Total cost to onboard a new school: $15-25** (or $0 with local teacher). 
From ae971e4b27fc81e2318a4cec8486ef19191c14c4 Mon Sep 17 00:00:00 2001 From: William Hill Date: Fri, 27 Mar 2026 21:33:54 -0400 Subject: [PATCH 04/15] docs: implementation plan for config-driven distillation pipeline --- .../plans/2026-03-27-distillation-pipeline.md | 3805 +++++++++++++++++ 1 file changed, 3805 insertions(+) create mode 100644 docs/superpowers/plans/2026-03-27-distillation-pipeline.md diff --git a/docs/superpowers/plans/2026-03-27-distillation-pipeline.md b/docs/superpowers/plans/2026-03-27-distillation-pipeline.md new file mode 100644 index 0000000..d492ae0 --- /dev/null +++ b/docs/superpowers/plans/2026-03-27-distillation-pipeline.md @@ -0,0 +1,3805 @@ +# Distillation Pipeline Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build a config-driven pipeline that distills a teacher model's knowledge into per-school fine-tuned Qwen 3.5 models, replacing OpenAI API calls for course explanations and query summaries. + +**Architecture:** A 5-stage Python pipeline (distill → prepare → finetune → eval → export) reads per-school YAML configs, generates ChatML training pairs via Claude Sonnet or local Qwen 3.5, fine-tunes via MLX QLoRA, evaluates against ship criteria, and exports to Ollama. The Next.js dashboard swaps OpenAI calls for local Ollama inference via a thin model-client adapter. + +**Tech Stack:** Python 3.8+, PyYAML, Anthropic SDK, ollama (Python client), MLX/mlx-lm (Apple Silicon fine-tuning), pytest, Next.js/TypeScript (dashboard integration) + +**Spec:** `docs/superpowers/specs/2026-03-27-distillation-pipeline-design.md` + +**Reference implementation:** `~/Development/d4bl_ai_agent/scripts/training/` — the d4bl pipeline this adapts from. 
+ +--- + +## File Structure + +### New Files + +``` +training/ + __init__.py # Package init + config.py # Constants + YAML config loader + prompts.py # Teacher prompt templates (explainer + summarizer) + seed.py # Seed data generation (DB + template + synthetic) + distill.py # Stage 1: Generate ChatML pairs via teacher model + prepare.py # Stage 2: Filter, dedup, split + finetune.py # Stage 3: MLX QLoRA fine-tuning + eval.py # Stage 4: Metrics + ship criteria + export.py # Stage 5: Ollama modelfile + registration + +schools/ + bishop-state/ + config.yaml # Full institutional config + seed_queries.yaml # Example queries for training pair generation + +tests/ + conftest.py # Pytest fixtures + training/ + __init__.py + test_config.py # Config loader tests + test_prompts.py # Prompt template tests + test_seed.py # Seed generation tests + test_prepare.py # Filter/dedup/split tests + test_eval.py # Eval metrics + ship criteria tests + +codebenders-dashboard/ + lib/ + model-client.ts # New: Ollama/OpenAI adapter +``` + +### Modified Files + +``` +codebenders-dashboard/ + app/api/courses/explain-pairing/route.ts # Swap OpenAI → model-client + app/api/query-summary/route.ts # Swap OpenAI → model-client + +requirements.txt # Add training dependencies +.gitignore # Add training_data/ +``` + +--- + +## Task 1: Project Scaffolding + +**Files:** +- Create: `training/__init__.py` +- Create: `tests/conftest.py` +- Create: `tests/training/__init__.py` +- Create: `pytest.ini` +- Modify: `requirements.txt` +- Modify: `.gitignore` + +- [ ] **Step 1: Create training package directory** + +```bash +mkdir -p training tests/training +``` + +- [ ] **Step 2: Create package init files** + +Create `training/__init__.py`: +```python +"""Config-driven distillation pipeline for per-school fine-tuned models.""" +``` + +Create `tests/__init__.py`: +```python +``` + +Create `tests/training/__init__.py`: +```python +``` + +- [ ] **Step 3: Create pytest.ini** + +```ini +[pytest] +testpaths = tests 
+python_files = test_*.py +python_classes = Test* +python_functions = test_* +``` + +- [ ] **Step 4: Add training dependencies to requirements.txt** + +Append to `requirements.txt`: +``` +# Training pipeline +pyyaml>=6.0 +anthropic>=0.40.0 +ollama>=0.4.0 +rouge-score>=0.1.2 +mlx>=0.22.0 +mlx-lm>=0.20.0 +``` + +- [ ] **Step 5: Add training_data to .gitignore** + +Append to `.gitignore`: +``` +# Training pipeline artifacts +training_data/ +``` + +- [ ] **Step 6: Create conftest.py with shared fixtures** + +Create `tests/conftest.py`: +```python +"""Shared pytest fixtures for the training pipeline.""" + +from pathlib import Path + +import pytest +import yaml + + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + +@pytest.fixture +def sample_school_config(): + """Minimal valid school config for testing.""" + return { + "school": { + "name": "Test Community College", + "code": "tcc", + "type": "community_college", + "designation": [], + "location": { + "city": "Test City", + "state": "Alabama", + "setting": "urban", + }, + "enrollment": { + "total_headcount": 1000, + "percent_full_time": 0.50, + "percent_part_time": 0.50, + }, + "demographics": { + "percent_pell_eligible": 0.60, + "percent_first_gen": 0.45, + }, + }, + "database": { + "main_table": "student_level_with_predictions", + "course_table": "course_enrollments", + "connection_env": "DATABASE_URL", + }, + "schema": { + "student_columns": { + "Cohort": "Cohort year", + "Race": "Student race/ethnicity", + "Retention": "Retention indicator (0 or 1)", + }, + "course_columns": { + "course_prefix": "Course dept code", + "grade": "Student grade", + }, + }, + "domain": { + "programs": [ + { + "name": "Nursing", + "cip": "51.3801", + "gateway_courses": ["BIO 201"], + } + ], + "key_metrics": ["retention_rate", "dfwi_rate"], + "terminology": { + "credential": "associate degree", + "at_risk": "at-risk students", + }, + }, + "distillation": { + "teacher_model": "claude-sonnet-4-20250514", + "teacher_backend": 
"anthropic", + "local_teacher_model": "qwen3.5:27b", + "local_teacher_backend": "ollama", + "pairs_per_task": 10, + }, + "training": { + "default_model": "qwen3.5:9b", + "fallback_model": "qwen3.5:4b", + "method": "qlora", + "quantization": 4, + "lora_rank": 16, + "lora_alpha": 32, + "epochs": 3, + "learning_rate": 1e-4, + "batch_size": 4, + "warmup_steps": 100, + "eval_every": 50, + "early_stopping_patience": 3, + }, + } + + +@pytest.fixture +def sample_course_pairing_data(): + """Sample course pairing input for explainer adapter.""" + return { + "course_a": {"prefix": "MAT", "number": "100", "name": "Intermediate Algebra"}, + "course_b": {"prefix": "BIO", "number": "201", "name": "Anatomy & Physiology I"}, + "stats": { + "course_a_dfwi": 0.42, + "course_b_dfwi": 0.31, + "co_enrollment_count": 85, + "co_enrollment_dfwi": 0.38, + "delivery_breakdown": [ + {"method": "Face-to-Face", "count": 50, "dfwi_rate": 0.34}, + {"method": "Online", "count": 35, "dfwi_rate": 0.44}, + ], + }, + } + + +@pytest.fixture +def sample_query_result_data(): + """Sample query result input for summarizer adapter.""" + return { + "prompt": "retention rate by race for 2023 cohort", + "data": [ + {"Race": "Black", "retention_rate": 0.41}, + {"Race": "White", "retention_rate": 0.52}, + {"Race": "Hispanic", "retention_rate": 0.47}, + ], + "rowCount": 3, + "vizType": "bar", + } + + +@pytest.fixture +def sample_explainer_output(): + """Valid explainer adapter JSON output.""" + return { + "explanation": "MAT 100 and BIO 201 show a high co-enrollment DFWI rate of 38%.", + "structural_factors": [ + "Math placement gaps from feeder high schools", + "Online sections show higher DFW rates", + ], + "student_impact": "Students taking both courses simultaneously face compounded difficulty.", + "advisor_recommendation": "Consider staggering MAT 100 and BIO 201 across terms for at-risk students.", + "data_limitations": ["Co-enrollment data limited to 2020+ cohorts"], + "related_intervention": "Math 
Bootcamp", + } + + +@pytest.fixture +def sample_summarizer_output(): + """Valid summarizer adapter JSON output.""" + return { + "summary": "Retention rates vary significantly by race in the 2023 cohort.", + "key_insights": [ + "Black students have the lowest retention rate at 41%", + "11-point gap between Black and White student retention", + ], + "context": "This aligns with the institution's strategic goal to close equity gaps.", + "action_items": [ + "Review early alert referrals for Black male students in Fall cohort", + ], + "caveats": ["Race is self-reported; 6% of records are Unknown"], + } +``` + +- [ ] **Step 7: Verify pytest runs with no errors** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/ -v --co` +Expected: "no tests ran" (collected 0 items) with exit code 0 + +- [ ] **Step 8: Commit** + +```bash +git add training/ tests/ pytest.ini requirements.txt .gitignore +git commit -m "chore: scaffold training pipeline package and test infrastructure" +``` + +--- + +## Task 2: Config Loader + +**Files:** +- Create: `training/config.py` +- Create: `tests/training/test_config.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/training/test_config.py`: +```python +"""Tests for training.config — constants and school config loader.""" + +import pytest +import yaml +from pathlib import Path +from unittest.mock import patch + +from training.config import ( + BASE_DIR, + SCHOOLS_DIR, + TRAIN_RATIO, + VAL_RATIO, + TEST_RATIO, + load_school_config, + get_school_dir, + get_training_data_dir, + write_jsonl, +) + + +class TestConstants: + def test_split_ratios_sum_to_one(self): + assert TRAIN_RATIO + VAL_RATIO + TEST_RATIO == pytest.approx(1.0) + + def test_base_dir_is_path(self): + assert isinstance(BASE_DIR, Path) + + def test_schools_dir_is_path(self): + assert isinstance(SCHOOLS_DIR, Path) + + +class TestLoadSchoolConfig: + def test_loads_valid_config(self, tmp_path, sample_school_config): + 
school_dir = tmp_path / "test-school" + school_dir.mkdir() + config_path = school_dir / "config.yaml" + config_path.write_text(yaml.dump(sample_school_config)) + + with patch("training.config.SCHOOLS_DIR", tmp_path): + config = load_school_config("test-school") + + assert config["school"]["name"] == "Test Community College" + assert config["school"]["code"] == "tcc" + assert config["database"]["main_table"] == "student_level_with_predictions" + + def test_raises_on_missing_school(self, tmp_path): + with patch("training.config.SCHOOLS_DIR", tmp_path): + with pytest.raises(FileNotFoundError, match="School config not found"): + load_school_config("nonexistent") + + def test_raises_on_missing_required_keys(self, tmp_path): + school_dir = tmp_path / "bad-school" + school_dir.mkdir() + config_path = school_dir / "config.yaml" + config_path.write_text(yaml.dump({"school": {"name": "Bad"}})) + + with patch("training.config.SCHOOLS_DIR", tmp_path): + with pytest.raises(ValueError, match="Missing required"): + load_school_config("bad-school") + + +class TestGetSchoolDir: + def test_returns_path(self, tmp_path): + with patch("training.config.SCHOOLS_DIR", tmp_path): + result = get_school_dir("bishop-state") + assert result == tmp_path / "bishop-state" + + +class TestGetTrainingDataDir: + def test_returns_path_with_school(self): + result = get_training_data_dir("bishop-state") + assert "bishop-state" in str(result) + assert result.name == "bishop-state" + + +class TestWriteJsonl: + def test_writes_items(self, tmp_path): + import json + + items = [{"a": 1}, {"b": 2}] + outfile = tmp_path / "test.jsonl" + count = write_jsonl(items, outfile) + + assert count == 2 + lines = outfile.read_text().strip().split("\n") + assert json.loads(lines[0]) == {"a": 1} + assert json.loads(lines[1]) == {"b": 2} + + def test_writes_with_transform(self, tmp_path): + import json + + items = [1, 2, 3] + outfile = tmp_path / "test.jsonl" + count = write_jsonl(items, outfile, transform=lambda x: 
{"val": x * 2}) + + assert count == 3 + lines = outfile.read_text().strip().split("\n") + assert json.loads(lines[0]) == {"val": 2} + + def test_skips_none_from_transform(self, tmp_path): + items = [1, 2, 3] + outfile = tmp_path / "test.jsonl" + count = write_jsonl(items, outfile, transform=lambda x: None if x == 2 else {"v": x}) + + assert count == 2 + + def test_creates_parent_dirs(self, tmp_path): + outfile = tmp_path / "sub" / "dir" / "test.jsonl" + count = write_jsonl([{"x": 1}], outfile) + assert count == 1 + assert outfile.exists() +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_config.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'training.config'` + +- [ ] **Step 3: Write the implementation** + +Create `training/config.py`: +```python +"""Shared constants and school config loader for the training pipeline.""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Callable, Optional + +import yaml + +# --------------------------------------------------------------------------- +# Directory layout +# --------------------------------------------------------------------------- + +PROJECT_ROOT = Path(__file__).resolve().parent.parent +SCHOOLS_DIR = PROJECT_ROOT / "schools" +BASE_DIR = PROJECT_ROOT / "training_data" + +# --------------------------------------------------------------------------- +# Dataset split ratios +# --------------------------------------------------------------------------- + +TRAIN_RATIO = 0.80 +VAL_RATIO = 0.10 +TEST_RATIO = 0.10 + +# --------------------------------------------------------------------------- +# Deduplication +# --------------------------------------------------------------------------- + +JACCARD_THRESHOLD = 1.0 # Exact duplicates only + +# --------------------------------------------------------------------------- +# Required 
top-level keys in school config +# --------------------------------------------------------------------------- + +_REQUIRED_KEYS = {"school", "database", "schema", "domain", "distillation", "training"} + + +# --------------------------------------------------------------------------- +# Config loader +# --------------------------------------------------------------------------- + + +def load_school_config(school: str) -> dict[str, Any]: + """Load and validate a school's config.yaml. + + Args: + school: School directory name (e.g. "bishop-state"). + + Returns: + Parsed config dict. + + Raises: + FileNotFoundError: If the school directory or config.yaml doesn't exist. + ValueError: If required top-level keys are missing. + """ + config_path = SCHOOLS_DIR / school / "config.yaml" + if not config_path.exists(): + raise FileNotFoundError( + f"School config not found: {config_path}" + ) + + with config_path.open("r", encoding="utf-8") as fh: + config = yaml.safe_load(fh) + + missing = _REQUIRED_KEYS - set(config.keys()) + if missing: + raise ValueError( + f"Missing required top-level keys in {config_path}: {missing}" + ) + + return config + + +def get_school_dir(school: str) -> Path: + """Return the path to a school's config directory.""" + return SCHOOLS_DIR / school + + +def get_training_data_dir(school: str) -> Path: + """Return the path to a school's training data directory.""" + return BASE_DIR / school + + +# --------------------------------------------------------------------------- +# JSONL writer (adapted from d4bl) +# --------------------------------------------------------------------------- + + +def write_jsonl( + items: list, + outfile: Path, + transform: Optional[Callable] = None, +) -> int: + """Write items to a JSONL file. + + Args: + items: List of JSON-serializable objects. + outfile: Destination file path. + transform: Optional per-item transformation; returning None skips. + + Returns: + Number of lines written. 
+ """ + outfile = Path(outfile) + outfile.parent.mkdir(parents=True, exist_ok=True) + count = 0 + with outfile.open("w", encoding="utf-8") as fh: + for item in items: + if transform is not None: + item = transform(item) + if item is None: + continue + fh.write(json.dumps(item, ensure_ascii=False) + "\n") + count += 1 + return count +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_config.py -v` +Expected: All tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add training/config.py tests/training/test_config.py +git commit -m "feat(training): config loader with YAML validation and JSONL writer" +``` + +--- + +## Task 3: Bishop State School Config + +**Files:** +- Create: `schools/bishop-state/config.yaml` +- Create: `schools/bishop-state/seed_queries.yaml` + +- [ ] **Step 1: Create the school directory** + +```bash +mkdir -p schools/bishop-state +``` + +- [ ] **Step 2: Write config.yaml** + +Create `schools/bishop-state/config.yaml` with the full institutional config from the design spec. This is a data file — the schema was validated in Task 2's tests. Include all sections: school identity, location, enrollment, demographics, database schema (copying exact columns from `route.ts` SCHEMA_INFO), domain knowledge, workforce, peers, financial, completion, instruction, pipeline, technology, access, equity, interventions, student_life, health, patterns, trends, priorities, data_caveats, distillation, and training config. 
+ +```yaml +# Bishop State Community College — Training Pipeline Config +# See docs/superpowers/specs/2026-03-27-distillation-pipeline-design.md + +school: + name: "Bishop State Community College" + code: "bscc" + type: "community_college" + designation: ["hbcu", "minority_serving"] + accreditation: "SACSCOC" + founded: 1927 + + location: + address: "351 North Broad Street" + city: "Mobile" + state: "Alabama" + zip: "36603" + county: "Mobile County" + region: "Gulf Coast" + setting: "urban" + climate_zone: "subtropical" + + enrollment: + total_headcount: 4200 + fte: 2800 + undergraduate_only: true + residential: false + percent_full_time: 0.42 + percent_part_time: 0.58 + percent_online: 0.35 + open_admission: true + + demographics: + percent_black: 0.72 + percent_white: 0.18 + percent_hispanic: 0.05 + percent_other: 0.05 + percent_pell_eligible: 0.68 + percent_first_gen: 0.55 + percent_adult_learners: 0.40 + median_household_income_area: 42000 + + workforce: + top_employers: ["Austal USA", "Mobile Infirmary", "AM/NS Calvert"] + high_demand_fields: ["healthcare", "advanced_manufacturing", "maritime"] + workforce_board: "Mobile Works" + + academics: + calendar: "semester" + degree_types: ["associate", "certificate", "short_certificate"] + total_programs: 45 + largest_programs: ["Nursing", "Welding", "Business Administration"] + transfer_partners: ["University of South Alabama", "Alabama A&M"] + dual_enrollment: true + + student_support: + tutoring: true + food_pantry: true + childcare: false + transportation_assistance: true + mental_health_services: true + early_alert_system: true + + challenges: + - "High percentage of students working 20+ hours/week" + - "Limited public transit access to satellite campuses" + - "Hurricane season disrupts Fall semester attendance" + - "Many students require developmental education in math" + + strengths: + - "Strong employer partnerships in healthcare and maritime" + - "Active student mentoring program" + - "High nursing program 
pass rates on NCLEX" + + peers: + ipeds_id: "101505" + carnegie_class: "Associate's—High Transfer-High Traditional" + peer_institutions: ["Lawson State CC", "Shelton State CC", "Trenholm State CC"] + state_system: "Alabama Community College System" + governing_board: "ACCS Board of Trustees" + + financial: + in_district_tuition: 4800 + in_state_tuition: 4800 + avg_financial_aid_package: 5200 + percent_receiving_aid: 0.82 + percent_student_loans: 0.25 + cost_of_living_index: 87.3 + textbook_program: "inclusive_access" + tuition_payment_plan: true + emergency_aid_fund: true + + completion: + ipeds_graduation_rate: 0.18 + adjusted_completion_rate: 0.42 + avg_time_to_credential: 3.2 + percent_transfer_out: 0.24 + percent_stop_out_return: 0.15 + top_completion_barriers: + - "developmental_math_sequences" + - "financial_emergencies" + - "work_schedule_conflicts" + + instruction: + student_faculty_ratio: 18 + percent_full_time_faculty: 0.45 + percent_adjunct: 0.55 + avg_class_size: 22 + developmental_ed_model: "corequisite" + lms: "Canvas" + + pipeline: + feeder_high_schools: + - name: "Williamson High School" + percent_of_enrollment: 0.12 + avg_readiness: "below_college_level" + - name: "Murphy High School" + percent_of_enrollment: 0.08 + avg_readiness: "mixed" + percent_ged: 0.11 + percent_dual_enrollment_origin: 0.09 + percent_veterans: 0.07 + percent_career_changers: 0.14 + percent_displaced_workers: 0.05 + percent_international: 0.02 + primary_recruitment_radius_miles: 35 + + technology: + percent_students_with_reliable_wifi: 0.71 + percent_students_with_personal_laptop: 0.64 + campus_device_lending: true + hotspot_lending: true + digital_literacy_required: false + broadband_desert_overlap: true + + access: + campus_count: 4 + campuses: + - name: "Main Campus" + address: "351 N Broad St" + public_transit_accessible: true + - name: "Southwest Campus" + address: "925 Dauphin Island Pkwy" + public_transit_accessible: false + percent_students_commute_30_plus_min: 0.35 + 
public_transit_quality: "limited" + parking_adequate: true + evening_weekend_classes: true + + equity: + known_gaps: + - metric: "gateway_math_pass_rate" + group_a: { name: "Black male students", value: 0.41 } + group_b: { name: "Overall", value: 0.58 } + initiative: "Male Student Success mentoring program" + - metric: "retention" + group_a: { name: "Part-time students", value: 0.38 } + group_b: { name: "Full-time students", value: 0.61 } + initiative: "15-to-Finish advising campaign" + dei_office: true + title_ix_coordinator: true + minority_male_initiative: "Brother 2 Brother" + + interventions: + active: + - name: "Starfish Early Alert" + type: "early_warning" + target: "all students" + trigger: "missed 2+ classes or below C at midterm" + effectiveness: "12% retention lift in pilot cohorts" + - name: "Math Bootcamp" + type: "academic_support" + target: "students placing into developmental math" + timing: "2 weeks before Fall semester" + effectiveness: "participants 2x more likely to pass MAT 100" + - name: "Emergency Micro-Grants" + type: "financial" + target: "students facing unexpected financial hardship" + max_award: 500 + effectiveness: "78% of recipients re-enrolled next term" + planned: + - name: "Proactive advising for 25+ credit students" + launch: "Fall 2026" + + student_life: + percent_working_while_enrolled: 0.72 + percent_working_over_20hrs: 0.48 + percent_single_parents: 0.18 + percent_caregiver_responsibilities: 0.25 + childcare_waitlist: true + student_orgs: 15 + athletics: false + housing_insecurity_rate: 0.14 + food_insecurity_rate: 0.31 + + health: + mental_health_counselor_ratio: "1:1400" + community_health_context: + - "Mobile County has highest diabetes rate in Alabama" + - "Limited mental health providers in service area" + substance_abuse_programs: true + crisis_intervention_protocol: true + + outcomes: + job_placement_rate_6mo: 0.78 + median_salary_after_credential: + associate: 34000 + certificate: 29000 + percent_employed_in_field: 0.65 
+ licensure_pass_rates: + nursing_nclex: 0.89 + welding_aws: 0.92 + emt: 0.85 + transfer_success_rate: 0.71 + employer_satisfaction_rate: 0.88 + + patterns: + high_attrition_points: + - week: 4 + reason: "Financial aid disbursement delays" + - week: 8 + reason: "Midterm performance shock" + - month: "October" + reason: "Hurricane season peak" + registration_peaks: ["April", "July", "November"] + summer_melt_rate: 0.22 + + trends: + enrollment_direction: "declining" + enrollment_5yr_change: -0.12 + completion_direction: "improving" + notable_changes: + - year: 2020 + event: "COVID shift to online — permanent hybrid expansion" + - year: 2022 + event: "Switched to corequisite math model — dev-ed pass rates doubled" + - year: 2023 + event: "Launched early alert system with ML predictions" + + priorities: + strategic_plan_years: "2024-2029" + top_goals: + - "Increase fall-to-fall retention from 42% to 55%" + - "Launch 3 new short-term workforce certificates" + - "Close equity gap in gateway math by 50%" + accreditation_qep_topic: "Guided Pathways implementation" + grant_funded_initiatives: + - name: "Title III Strengthening Institutions" + focus: "Student support services and advising redesign" + end_date: "2027-09-30" + - name: "NSF ATE Grant" + focus: "Advanced manufacturing curriculum" + end_date: "2026-05-31" + + data_caveats: + - "Pre-2020 cohorts lack online/hybrid delivery classification" + - "Race/ethnicity is self-reported; 6% of records are 'Unknown'" + - "GPA data for dual-enrollment students may reflect high school scale" + - "Transfer-out data relies on National Student Clearinghouse match — ~85% match rate" + - "Course enrollment records before 2019 do not include instructor_status" + +database: + main_table: "student_level_with_predictions" + course_table: "course_enrollments" + connection_env: "DATABASE_URL" + +schema: + student_columns: + Cohort: "Cohort year (numeric: 2019, 2020, etc.)" + Cohort_Term: "Term of cohort entry (Fall, Spring, Summer)" + 
Student_GUID: "Unique student identifier" + Institution_ID: "Institution identifier (102030 for Bishop State)" + Gender: "Student gender" + Race: "Student race/ethnicity" + Student_Age: "Age of student (integer)" + First_Gen: "First generation status" + Enrollment_Type: "Type of enrollment" + Enrollment_Intensity_First_Term: "Enrollment intensity (Full-Time, Part-Time)" + Program_of_Study_Year_1: "Program of study in year 1 (CIP code)" + Credential_Type_Sought_Year_1: "Credential type being pursued" + Math_Placement: "Math placement level (C=college-level, R=remedial, N=none)" + Retention: "Retention indicator (0 or 1)" + Persistence: "Persistence indicator (0 or 1)" + GPA_Group_Year_1: "GPA in year 1" + GPA_Group_Term_1: "GPA in term 1" + Number_of_Credits_Attempted_Year_1: "Credits attempted in year 1" + Number_of_Credits_Earned_Year_1: "Credits earned in year 1" + Number_of_Credits_Attempted_Year_2: "Credits attempted in year 2" + Number_of_Credits_Earned_Year_2: "Credits earned in year 2" + Time_to_Credential: "Time to any credential" + retention_probability: "Predicted probability of retention (0-1)" + retention_risk_category: "Risk category (Low/Moderate/High/Critical Risk)" + at_risk_alert: "Early warning alert level (LOW/MODERATE/HIGH/URGENT)" + course_completion_rate: "Course completion rate (0-1)" + passing_rate: "Course passing rate (0-1)" + course_columns: + course_prefix: "Course dept code (MAT, ENG, NUR, CIS, etc.)" + course_number: "Course number (100, 201, etc.)" + course_name: "Full course name" + grade: "Student grade (A, B, C, D, F, W, I, AU, P)" + delivery_method: "Delivery (F=face-to-face, O=online, H=hybrid)" + instructor_status: "Instructor type (FT=full-time, PT=part-time)" + gateway_type: "Gateway (M=math, E=English, N=not a gateway)" + credits_attempted: "Credits attempted (numeric)" + credits_earned: "Credits earned (numeric)" + cohort: "Cohort year as text" + academic_year: "Academic year (e.g. 
2021-22)" + academic_term: "Term (FALL, SPRING, SUMMER)" + ferpa_excluded: + - "Student_GUID" + - "student_guid" + +domain: + programs: + - name: "Nursing (ADN)" + cip: "51.3801" + gateway_courses: ["BIO 201", "MAT 110"] + - name: "Welding Technology" + cip: "48.0508" + gateway_courses: ["WDT 108", "WDT 109"] + - name: "Business Administration" + cip: "52.0201" + gateway_courses: ["MAT 100", "BUS 241"] + - name: "Computer Information Systems" + cip: "11.0101" + gateway_courses: ["CIS 146", "MAT 100"] + - name: "Emergency Medical Technician" + cip: "51.0904" + gateway_courses: ["EMS 100", "BIO 201"] + key_metrics: + - "retention_rate" + - "dfwi_rate" + - "gateway_pass_rate" + - "completion_rate" + - "transfer_rate" + terminology: + credential: "associate degree or certificate" + at_risk: "students flagged by early warning system" + gateway_course: "first college-level course in math or English" + dfwi: "grades of D, F, W, or I (unsuccessful completion)" + +distillation: + teacher_model: "claude-sonnet-4-20250514" + teacher_backend: "anthropic" + local_teacher_model: "qwen3.5:27b" + local_teacher_backend: "ollama" + pairs_per_task: 1500 + +training: + default_model: "qwen3.5:9b" + fallback_model: "qwen3.5:4b" + method: "qlora" + quantization: 4 + lora_rank: 16 + lora_alpha: 32 + epochs: 3 + learning_rate: 1.0e-4 + batch_size: 4 + warmup_steps: 100 + eval_every: 50 + early_stopping_patience: 3 +``` + +- [ ] **Step 3: Write seed_queries.yaml** + +Create `schools/bishop-state/seed_queries.yaml`: +```yaml +# Example queries for training pair generation +# These seed the template-driven portion of distillation. 
+ +explainer: + # Advisor-perspective queries + - query: "MAT 100 and BIO 201 pairing for nursing students" + style: "advisor" + - query: "ENG 101 and HIS 201 co-enrollment outcomes" + style: "advisor" + - query: "High DFW in MAT 110 for part-time evening students" + style: "advisor" + - query: "CIS 146 and MAT 100 pairing for CIS majors" + style: "advisor" + - query: "WDT 108 and WDT 109 sequential outcomes" + style: "advisor" + + # Administrator-perspective queries + - query: "Online vs face-to-face outcomes in gateway math" + style: "administrator" + - query: "Adjunct vs full-time instructor DFW rates in BIO 201" + style: "administrator" + - query: "Summer vs Fall section outcomes for ENG 101" + style: "administrator" + - query: "Developmental math co-enrollment with science courses" + style: "administrator" + - query: "Dual-enrollment student performance in college-level courses" + style: "administrator" + + # Faculty-perspective queries + - query: "EMS 100 and BIO 201 prerequisite outcomes" + style: "faculty" + - query: "MAT 100 withdrawal patterns by week of semester" + style: "faculty" + - query: "Hybrid delivery outcomes in nursing prerequisite courses" + style: "faculty" + +summarizer: + # Retention and completion + - query: "retention rate by race for 2023 cohort" + style: "faculty" + - query: "overall retention trend from 2019 to 2023" + style: "administrator" + - query: "retention rate for first-generation students" + style: "advisor" + - query: "completion rate by enrollment intensity" + style: "administrator" + + # Course performance + - query: "gateway course pass rates by delivery method" + style: "administrator" + - query: "top 10 courses with highest DFW rates" + style: "faculty" + - query: "DFW rates by instructor status in math courses" + style: "administrator" + - query: "course completion rates for online vs face-to-face" + style: "faculty" + + # Demographics and equity + - query: "enrollment by race and gender" + style: "administrator" + - 
query: "GPA distribution for Pell-eligible students" + style: "advisor" + - query: "retention gap between full-time and part-time students" + style: "administrator" + - query: "at-risk student count by program" + style: "advisor" + + # Risk and intervention + - query: "students with URGENT early warning alert by cohort" + style: "advisor" + - query: "average retention probability by math placement" + style: "faculty" + - query: "critical risk students in nursing program" + style: "advisor" +``` + +- [ ] **Step 4: Verify config loads correctly** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -c "from training.config import load_school_config; c = load_school_config('bishop-state'); print(f'Loaded: {c[\"school\"][\"name\"]}')"` +Expected: `Loaded: Bishop State Community College` + +- [ ] **Step 5: Commit** + +```bash +git add schools/ +git commit -m "feat(training): add Bishop State school config and seed queries" +``` + +--- + +## Task 4: Teacher Prompt Templates + +**Files:** +- Create: `training/prompts.py` +- Create: `tests/training/test_prompts.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/training/test_prompts.py`: +```python +"""Tests for training.prompts — teacher prompt templates.""" + +import json +import pytest + +from training.prompts import ( + build_system_prompt, + build_explainer_prompt, + build_summarizer_prompt, + EXPLAINER_STUDENT_SYSTEM, + SUMMARIZER_STUDENT_SYSTEM, + EXPLAINER_SCHEMA, + SUMMARIZER_SCHEMA, +) + + +class TestBuildSystemPrompt: + def test_includes_school_name(self, sample_school_config): + result = build_system_prompt(sample_school_config) + assert "Test Community College" in result + + def test_includes_location(self, sample_school_config): + result = build_system_prompt(sample_school_config) + assert "Test City" in result + assert "Alabama" in result + + def test_includes_demographics(self, sample_school_config): + result = build_system_prompt(sample_school_config) + 
assert "Pell" in result or "pell" in result + + def test_returns_string(self, sample_school_config): + result = build_system_prompt(sample_school_config) + assert isinstance(result, str) + assert len(result) > 100 + + +class TestBuildExplainerPrompt: + def test_includes_course_data(self, sample_school_config, sample_course_pairing_data): + result = build_explainer_prompt(sample_school_config, sample_course_pairing_data) + assert "MAT" in result + assert "BIO" in result + + def test_includes_stats(self, sample_school_config, sample_course_pairing_data): + result = build_explainer_prompt(sample_school_config, sample_course_pairing_data) + assert "0.42" in result or "42" in result + + def test_includes_output_schema(self, sample_school_config, sample_course_pairing_data): + result = build_explainer_prompt(sample_school_config, sample_course_pairing_data) + assert "explanation" in result + assert "structural_factors" in result + assert "advisor_recommendation" in result + + def test_returns_string(self, sample_school_config, sample_course_pairing_data): + result = build_explainer_prompt(sample_school_config, sample_course_pairing_data) + assert isinstance(result, str) + + +class TestBuildSummarizerPrompt: + def test_includes_query(self, sample_school_config, sample_query_result_data): + result = build_summarizer_prompt(sample_school_config, sample_query_result_data) + assert "retention rate by race" in result + + def test_includes_data(self, sample_school_config, sample_query_result_data): + result = build_summarizer_prompt(sample_school_config, sample_query_result_data) + assert "Black" in result + assert "0.41" in result or "41" in result + + def test_includes_output_schema(self, sample_school_config, sample_query_result_data): + result = build_summarizer_prompt(sample_school_config, sample_query_result_data) + assert "summary" in result + assert "key_insights" in result + assert "action_items" in result + + def test_returns_string(self, sample_school_config, 
sample_query_result_data): + result = build_summarizer_prompt(sample_school_config, sample_query_result_data) + assert isinstance(result, str) + + +class TestStudentPrompts: + def test_explainer_student_system_is_concise(self): + assert len(EXPLAINER_STUDENT_SYSTEM) < 500 + assert "JSON" in EXPLAINER_STUDENT_SYSTEM + + def test_summarizer_student_system_is_concise(self): + assert len(SUMMARIZER_STUDENT_SYSTEM) < 500 + assert "JSON" in SUMMARIZER_STUDENT_SYSTEM + + +class TestOutputSchemas: + def test_explainer_schema_has_required_keys(self): + required = {"explanation", "structural_factors", "student_impact", + "advisor_recommendation", "data_limitations", "related_intervention"} + assert required == set(EXPLAINER_SCHEMA.keys()) + + def test_summarizer_schema_has_required_keys(self): + required = {"summary", "key_insights", "context", "action_items", "caveats"} + assert required == set(SUMMARIZER_SCHEMA.keys()) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_prompts.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'training.prompts'` + +- [ ] **Step 3: Write the implementation** + +Create `training/prompts.py`: +```python +"""Teacher prompt templates for the distillation pipeline. + +Provides school-agnostic prompt builders that inject per-school context +from config.yaml to generate high-quality training pairs. 
+""" + +from __future__ import annotations + +import json +from typing import Any + +# --------------------------------------------------------------------------- +# Output schemas — define what the fine-tuned model produces +# --------------------------------------------------------------------------- + +EXPLAINER_SCHEMA = { + "explanation": "2-3 sentence plain-language explanation of the course pairing pattern", + "structural_factors": ["list of institutional or systemic factors driving this pattern"], + "student_impact": "what this means for students taking these courses", + "advisor_recommendation": "one actionable next step for advisors", + "data_limitations": ["caveats about interpreting this data"], + "related_intervention": "existing program that addresses this, or null", +} + +SUMMARIZER_SCHEMA = { + "summary": "2-3 sentence headline finding from the query results", + "key_insights": ["list of notable patterns in the data"], + "context": "how this connects to institutional priorities or known challenges", + "action_items": ["what someone should do with this information"], + "caveats": ["data limitations relevant to this specific query"], +} + +# --------------------------------------------------------------------------- +# Student system prompts (what the fine-tuned model sees at inference) +# --------------------------------------------------------------------------- + +EXPLAINER_STUDENT_SYSTEM = ( + "You are a student success analyst. Given course pairing data, generate a " + "structured JSON explanation. Include: explanation, structural_factors, " + "student_impact, advisor_recommendation, data_limitations, and " + "related_intervention. Respond with ONLY valid JSON." +) + +SUMMARIZER_STUDENT_SYSTEM = ( + "You are a student success analyst. Given a query and its results, generate " + "a structured JSON summary. Include: summary, key_insights, context, " + "action_items, and caveats. Respond with ONLY valid JSON." 
+) + +# --------------------------------------------------------------------------- +# Context builder — extracts relevant sections from school config +# --------------------------------------------------------------------------- + + +def build_system_prompt(config: dict[str, Any]) -> str: + """Build the teacher system prompt with full institutional context. + + Injects school identity, demographics, challenges, interventions, + equity gaps, and priorities from the school config. + + Args: + config: Parsed school config dict. + + Returns: + System prompt string for the teacher model. + """ + school = config["school"] + domain = config["domain"] + + sections = [] + + # Identity + name = school["name"] + location = school.get("location", {}) + city = location.get("city", "") + state = location.get("state", "") + school_type = school.get("type", "institution") + sections.append( + f"You are a student success analyst at {name}, " + f"a {school_type} in {city}, {state}." + ) + + # Designation + designations = school.get("designation", []) + if designations: + sections.append(f"Institutional designations: {', '.join(designations)}.") + + # Enrollment + enrollment = school.get("enrollment", {}) + if enrollment: + parts = [] + if "total_headcount" in enrollment: + parts.append(f"{enrollment['total_headcount']:,} students") + if "percent_part_time" in enrollment: + parts.append(f"{enrollment['percent_part_time']:.0%} part-time") + if "percent_online" in enrollment: + parts.append(f"{enrollment['percent_online']:.0%} online") + if enrollment.get("open_admission"): + parts.append("open admission") + if parts: + sections.append(f"Enrollment profile: {', '.join(parts)}.") + + # Demographics + demographics = school.get("demographics", {}) + if demographics: + parts = [] + for key, label in [ + ("percent_pell_eligible", "Pell-eligible"), + ("percent_first_gen", "first-generation"), + ("percent_adult_learners", "adult learners (25+)"), + ]: + if key in demographics: + 
parts.append(f"{demographics[key]:.0%} {label}") + if parts: + sections.append(f"Student demographics: {', '.join(parts)}.") + + # Programs + programs = domain.get("programs", []) + if programs: + program_names = [p["name"] for p in programs[:5]] + sections.append(f"Key programs: {', '.join(program_names)}.") + + # Challenges + challenges = school.get("challenges", []) + if challenges: + sections.append("Known challenges:\n" + "\n".join(f"- {c}" for c in challenges)) + + # Strengths + strengths = school.get("strengths", []) + if strengths: + sections.append("Institutional strengths:\n" + "\n".join(f"- {s}" for s in strengths)) + + # Equity gaps + equity = school.get("equity", {}) + known_gaps = equity.get("known_gaps", []) + if known_gaps: + gap_lines = [] + for gap in known_gaps: + ga = gap.get("group_a", {}) + gb = gap.get("group_b", {}) + gap_lines.append( + f"- {gap['metric']}: {ga.get('name', '?')} ({ga.get('value', '?')}) " + f"vs {gb.get('name', '?')} ({gb.get('value', '?')})" + ) + sections.append("Known equity gaps:\n" + "\n".join(gap_lines)) + + # Interventions + interventions = school.get("interventions", {}) + active = interventions.get("active", []) + if active: + lines = [] + for i in active: + line = f"- {i['name']} ({i['type']}): {i.get('effectiveness', 'effectiveness unknown')}" + lines.append(line) + sections.append("Active interventions:\n" + "\n".join(lines)) + + # Priorities + priorities = school.get("priorities", {}) + top_goals = priorities.get("top_goals", []) + if top_goals: + sections.append("Strategic priorities:\n" + "\n".join(f"- {g}" for g in top_goals)) + + # Data caveats + caveats = school.get("data_caveats", []) + if caveats: + sections.append("Data caveats:\n" + "\n".join(f"- {c}" for c in caveats)) + + # Completion context + completion = school.get("completion", {}) + if completion: + parts = [] + if "ipeds_graduation_rate" in completion: + parts.append(f"IPEDS grad rate: {completion['ipeds_graduation_rate']:.0%}") + if 
"adjusted_completion_rate" in completion: + parts.append(f"adjusted completion: {completion['adjusted_completion_rate']:.0%}") + barriers = completion.get("top_completion_barriers", []) + if barriers: + parts.append(f"top barriers: {', '.join(b.replace('_', ' ') for b in barriers)}") + if parts: + sections.append(f"Completion context: {'; '.join(parts)}.") + + # Student life + student_life = school.get("student_life", {}) + if student_life: + parts = [] + if "percent_working_over_20hrs" in student_life: + parts.append(f"{student_life['percent_working_over_20hrs']:.0%} working 20+ hrs/wk") + if "food_insecurity_rate" in student_life: + parts.append(f"{student_life['food_insecurity_rate']:.0%} food insecure") + if "percent_single_parents" in student_life: + parts.append(f"{student_life['percent_single_parents']:.0%} single parents") + if parts: + sections.append(f"Student life: {', '.join(parts)}.") + + # Patterns + patterns = school.get("patterns", {}) + attrition_points = patterns.get("high_attrition_points", []) + if attrition_points: + lines = [] + for point in attrition_points: + when = f"week {point['week']}" if "week" in point else point.get("month", "?") + lines.append(f"- {when}: {point['reason']}") + sections.append("Known attrition patterns:\n" + "\n".join(lines)) + + # Workforce + workforce = school.get("workforce", {}) + if workforce: + employers = workforce.get("top_employers", []) + fields = workforce.get("high_demand_fields", []) + if employers or fields: + parts = [] + if employers: + parts.append(f"top employers: {', '.join(employers)}") + if fields: + parts.append(f"high-demand fields: {', '.join(fields)}") + sections.append(f"Workforce context: {'; '.join(parts)}.") + + # Outcomes + outcomes = school.get("outcomes", {}) + if outcomes: + parts = [] + if "job_placement_rate_6mo" in outcomes: + parts.append(f"6-month job placement: {outcomes['job_placement_rate_6mo']:.0%}") + licensure = outcomes.get("licensure_pass_rates", {}) + if licensure: + 
lic_parts = [f"{k}: {v:.0%}" for k, v in licensure.items()] + parts.append(f"licensure pass rates: {', '.join(lic_parts)}") + if parts: + sections.append(f"Outcomes: {'; '.join(parts)}.") + + sections.append("Respond with ONLY valid JSON.") + + return "\n\n".join(sections) + + +# --------------------------------------------------------------------------- +# Explainer prompt +# --------------------------------------------------------------------------- + + +def build_explainer_prompt( + config: dict[str, Any], + course_data: dict[str, Any], +) -> str: + """Build the teacher prompt for generating a course pairing explanation. + + Args: + config: Parsed school config dict. + course_data: Course pairing data dict with keys: course_a, course_b, stats. + + Returns: + User prompt string for the teacher model. + """ + schema_str = json.dumps(EXPLAINER_SCHEMA, indent=2) + data_str = json.dumps(course_data, indent=2, default=str) + + terminology = config.get("domain", {}).get("terminology", {}) + term_lines = "\n".join(f"- {k}: {v}" for k, v in terminology.items()) if terminology else "" + + return f"""Analyze the following course pairing data and explain the pattern. + +COURSE PAIRING DATA: +{data_str} + +{f"TERMINOLOGY:{chr(10)}{term_lines}{chr(10)}" if term_lines else ""} +Generate a JSON response with this exact schema: +{schema_str} + +Guidelines: +- Explain the pattern in plain language accessible to advisors and faculty. +- Connect structural factors to the institution's known challenges and context. +- Make the advisor recommendation specific and actionable. +- Reference existing interventions if relevant. +- Note any data limitations that affect interpretation. 
+- Do NOT speculate beyond what the data shows.""" + + +# --------------------------------------------------------------------------- +# Summarizer prompt +# --------------------------------------------------------------------------- + + +def build_summarizer_prompt( + config: dict[str, Any], + query_data: dict[str, Any], +) -> str: + """Build the teacher prompt for generating a query result summary. + + Args: + config: Parsed school config dict. + query_data: Dict with keys: prompt, data, rowCount, vizType. + + Returns: + User prompt string for the teacher model. + """ + schema_str = json.dumps(SUMMARIZER_SCHEMA, indent=2) + data_str = json.dumps(query_data["data"][:50], indent=2, default=str) + user_query = query_data["prompt"] + row_count = query_data.get("rowCount", len(query_data["data"])) + viz_type = query_data.get("vizType", "table") + + return f"""Summarize the following query results for a non-technical audience +(advisors, administrators, faculty). + +USER QUERY: {user_query} +VISUALIZATION TYPE: {viz_type} +TOTAL ROWS: {row_count} + +RESULTS: +{data_str} + +Generate a JSON response with this exact schema: +{schema_str} + +Guidelines: +- Lead with the most important finding. +- Connect insights to institutional context and priorities. +- Make action items specific to the roles that would see this data. +- Note data limitations relevant to this specific query. 
+- Do NOT hallucinate data points not present in the results.""" +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_prompts.py -v` +Expected: All tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add training/prompts.py tests/training/test_prompts.py +git commit -m "feat(training): teacher prompt templates for explainer and summarizer" +``` + +--- + +## Task 5: Seed Data Generation + +**Files:** +- Create: `training/seed.py` +- Create: `tests/training/test_seed.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/training/test_seed.py`: +```python +"""Tests for training.seed — seed data generation.""" + +import pytest +import yaml +from pathlib import Path +from unittest.mock import patch + +from training.seed import ( + load_seed_queries, + generate_synthetic_course_pairings, + generate_synthetic_query_results, + format_as_chatml, +) + + +class TestLoadSeedQueries: + def test_loads_valid_yaml(self, tmp_path): + seed_file = tmp_path / "seed_queries.yaml" + seed_file.write_text(yaml.dump({ + "explainer": [ + {"query": "MAT 100 and BIO 201", "style": "advisor"}, + ], + "summarizer": [ + {"query": "retention by race", "style": "faculty"}, + ], + })) + + with patch("training.seed.get_school_dir", return_value=tmp_path): + result = load_seed_queries("test-school") + + assert len(result["explainer"]) == 1 + assert len(result["summarizer"]) == 1 + assert result["explainer"][0]["query"] == "MAT 100 and BIO 201" + + def test_returns_empty_on_missing_file(self, tmp_path): + with patch("training.seed.get_school_dir", return_value=tmp_path): + result = load_seed_queries("test-school") + assert result == {"explainer": [], "summarizer": []} + + +class TestGenerateSyntheticCoursePairings: + def test_generates_requested_count(self, sample_school_config): + results = generate_synthetic_course_pairings(sample_school_config, count=5) + assert 
len(results) == 5 + + def test_each_has_required_keys(self, sample_school_config): + results = generate_synthetic_course_pairings(sample_school_config, count=3) + for r in results: + assert "course_a" in r + assert "course_b" in r + assert "stats" in r + assert "prefix" in r["course_a"] + assert "number" in r["course_a"] + + def test_returns_empty_for_zero(self, sample_school_config): + results = generate_synthetic_course_pairings(sample_school_config, count=0) + assert results == [] + + +class TestGenerateSyntheticQueryResults: + def test_generates_requested_count(self, sample_school_config): + results = generate_synthetic_query_results(sample_school_config, count=5) + assert len(results) == 5 + + def test_each_has_required_keys(self, sample_school_config): + results = generate_synthetic_query_results(sample_school_config, count=3) + for r in results: + assert "prompt" in r + assert "data" in r + assert "rowCount" in r + assert "vizType" in r + + def test_returns_empty_for_zero(self, sample_school_config): + results = generate_synthetic_query_results(sample_school_config, count=0) + assert results == [] + + +class TestFormatAsChatML: + def test_format_structure(self): + result = format_as_chatml("system", "user", "assistant") + assert "messages" in result + assert len(result["messages"]) == 3 + assert result["messages"][0] == {"role": "system", "content": "system"} + assert result["messages"][1] == {"role": "user", "content": "user"} + assert result["messages"][2] == {"role": "assistant", "content": "assistant"} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_seed.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'training.seed'` + +- [ ] **Step 3: Write the implementation** + +Create `training/seed.py`: +```python +"""Seed data generation for the distillation pipeline. 
+ +Generates synthetic course pairing data and query results to serve as +inputs for the teacher model during distillation. Also loads template +seed queries from the school's seed_queries.yaml. +""" + +from __future__ import annotations + +import random +from typing import Any + +import yaml + +from training.config import get_school_dir + +# --------------------------------------------------------------------------- +# Common course data for synthetic generation +# --------------------------------------------------------------------------- + +_PREFIXES = ["MAT", "ENG", "BIO", "CIS", "WDT", "HIS", "PSY", "BUS", "NUR", "EMS"] +_NUMBERS = ["100", "101", "110", "201", "202", "210", "241", "246"] +_NAMES = { + "MAT 100": "Intermediate Algebra", + "MAT 110": "Finite Mathematics", + "MAT 201": "Calculus I", + "ENG 101": "English Composition I", + "ENG 102": "English Composition II", + "BIO 201": "Anatomy & Physiology I", + "BIO 202": "Anatomy & Physiology II", + "CIS 146": "Microcomputer Applications", + "CIS 201": "Introduction to Programming", + "WDT 108": "SMAW Fillet/OFC", + "WDT 109": "SMAW Fillet/PAC/CAC", + "HIS 201": "United States History I", + "PSY 200": "General Psychology", + "BUS 241": "Principles of Accounting I", + "NUR 102": "Fundamentals of Nursing", + "EMS 100": "EMT Basic", +} +_DELIVERY_METHODS = ["Face-to-Face", "Online", "Hybrid"] +_GRADES = ["A", "B", "C", "D", "F", "W", "I"] +_VIZ_TYPES = ["bar", "line", "pie", "kpi", "table"] + +_QUERY_TEMPLATES = [ + ("retention rate by {dim} for {year} cohort", "bar"), + ("overall {metric} trend from 2019 to 2023", "line"), + ("{metric} for first-generation students", "kpi"), + ("{metric} by enrollment intensity", "bar"), + ("top 10 courses with highest DFW rates", "table"), + ("{metric} by {dim}", "bar"), + ("students with {alert} early warning alert", "kpi"), + ("{metric} distribution by program", "bar"), + ("{metric} gap between full-time and part-time students", "bar"), + ("at-risk student count by {dim}", 
"pie"), +] + +_DIMS = ["race", "gender", "cohort", "program", "enrollment intensity", "math placement"] +_METRICS = ["retention rate", "completion rate", "GPA", "DFW rate", "pass rate"] +_ALERTS = ["URGENT", "HIGH", "MODERATE"] +_YEARS = ["2019", "2020", "2021", "2022", "2023"] +_RACES = ["Black", "White", "Hispanic", "Asian", "Two or More", "Unknown"] + + +# --------------------------------------------------------------------------- +# Seed query loader +# --------------------------------------------------------------------------- + + +def load_seed_queries(school: str) -> dict[str, list[dict]]: + """Load seed queries from a school's seed_queries.yaml. + + Args: + school: School directory name. + + Returns: + Dict with "explainer" and "summarizer" lists of query dicts. + """ + seed_path = get_school_dir(school) / "seed_queries.yaml" + if not seed_path.exists(): + return {"explainer": [], "summarizer": []} + + with seed_path.open("r", encoding="utf-8") as fh: + data = yaml.safe_load(fh) or {} + + return { + "explainer": data.get("explainer", []), + "summarizer": data.get("summarizer", []), + } + + +# --------------------------------------------------------------------------- +# Synthetic course pairing generation +# --------------------------------------------------------------------------- + + +def _random_course() -> dict[str, str]: + """Generate a random course identifier.""" + prefix = random.choice(_PREFIXES) + number = random.choice(_NUMBERS) + key = f"{prefix} {number}" + name = _NAMES.get(key, f"{prefix} {number} Course") + return {"prefix": prefix, "number": number, "name": name} + + +def _random_stats() -> dict[str, Any]: + """Generate random course pairing statistics.""" + dfwi_a = round(random.uniform(0.15, 0.55), 2) + dfwi_b = round(random.uniform(0.15, 0.55), 2) + co_count = random.randint(20, 200) + co_dfwi = round(random.uniform(min(dfwi_a, dfwi_b), max(dfwi_a, dfwi_b) + 0.1), 2) + co_dfwi = min(co_dfwi, 0.75) + + delivery_breakdown = [] + remaining 
= co_count + for method in _DELIVERY_METHODS: + if method == _DELIVERY_METHODS[-1]: + count = remaining + else: + count = random.randint(5, remaining - 5 * (len(_DELIVERY_METHODS) - len(delivery_breakdown) - 1)) + count = max(count, 1) + remaining -= count + delivery_breakdown.append({ + "method": method, + "count": count, + "dfwi_rate": round(random.uniform(0.15, 0.55), 2), + }) + + return { + "course_a_dfwi": dfwi_a, + "course_b_dfwi": dfwi_b, + "co_enrollment_count": co_count, + "co_enrollment_dfwi": co_dfwi, + "delivery_breakdown": delivery_breakdown, + } + + +def generate_synthetic_course_pairings( + config: dict[str, Any], + count: int, +) -> list[dict[str, Any]]: + """Generate synthetic course pairing data for explainer training. + + Args: + config: Parsed school config dict (used for program-aware generation). + count: Number of pairings to generate. + + Returns: + List of course pairing data dicts. + """ + if count == 0: + return [] + + results = [] + for _ in range(count): + course_a = _random_course() + course_b = _random_course() + while course_b["prefix"] == course_a["prefix"] and course_b["number"] == course_a["number"]: + course_b = _random_course() + results.append({ + "course_a": course_a, + "course_b": course_b, + "stats": _random_stats(), + }) + return results + + +# --------------------------------------------------------------------------- +# Synthetic query result generation +# --------------------------------------------------------------------------- + + +def generate_synthetic_query_results( + config: dict[str, Any], + count: int, +) -> list[dict[str, Any]]: + """Generate synthetic query results for summarizer training. + + Args: + config: Parsed school config dict. + count: Number of query results to generate. + + Returns: + List of query result dicts with prompt, data, rowCount, vizType. 
+ """ + if count == 0: + return [] + + results = [] + for i in range(count): + template, default_viz = _QUERY_TEMPLATES[i % len(_QUERY_TEMPLATES)] + prompt = template.format( + dim=random.choice(_DIMS), + metric=random.choice(_METRICS), + year=random.choice(_YEARS), + alert=random.choice(_ALERTS), + ) + + # Generate plausible result rows + num_rows = random.randint(2, 8) + data = [] + for _ in range(num_rows): + row = { + "Race": random.choice(_RACES), + "value": round(random.uniform(0.15, 0.85), 2), + "count": random.randint(10, 500), + } + data.append(row) + + results.append({ + "prompt": prompt, + "data": data, + "rowCount": num_rows, + "vizType": default_viz, + }) + + return results + + +# --------------------------------------------------------------------------- +# ChatML formatter +# --------------------------------------------------------------------------- + + +def format_as_chatml(system: str, user: str, assistant: str) -> dict: + """Format a (system, user, assistant) triple as a ChatML messages dict. + + Args: + system: The system prompt text. + user: The user message text. + assistant: The assistant response text. + + Returns: + A dict with a "messages" key containing a list of 3 role/content dicts. 
+ """ + return { + "messages": [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + {"role": "assistant", "content": assistant}, + ] + } +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_seed.py -v` +Expected: All tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add training/seed.py tests/training/test_seed.py +git commit -m "feat(training): seed data generation for explainer and summarizer" +``` + +--- + +## Task 6: Distillation Pipeline + +**Files:** +- Create: `training/distill.py` +- Create: `tests/training/test_distill.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/training/test_distill.py`: +```python +"""Tests for training.distill — teacher model distillation.""" + +import json +import pytest +from unittest.mock import patch, MagicMock + +from training.distill import ( + validate_json, + call_teacher, + generate_explainer_pairs, + generate_summarizer_pairs, +) + + +class TestValidateJson: + def test_valid_json(self): + result = validate_json('{"key": "value"}') + assert result == {"key": "value"} + + def test_strips_markdown_fences(self): + result = validate_json('```json\n{"key": "value"}\n```') + assert result == {"key": "value"} + + def test_returns_none_for_invalid(self): + assert validate_json("not json") is None + + def test_returns_none_for_empty(self): + assert validate_json("") is None + assert validate_json(None) is None + + def test_returns_none_for_non_dict(self): + assert validate_json("[1, 2, 3]") is None + + +class TestCallTeacher: + def test_calls_anthropic_backend(self): + mock_client = MagicMock() + mock_message = MagicMock() + mock_message.content = [MagicMock(text='{"result": "ok"}')] + mock_message.usage.input_tokens = 100 + mock_message.usage.output_tokens = 50 + mock_client.messages.create.return_value = mock_message + + with 
patch("training.distill._get_anthropic_client", return_value=mock_client): + result = call_teacher( + system="system prompt", + user="user prompt", + backend="anthropic", + model="claude-sonnet-4-20250514", + ) + + assert result == '{"result": "ok"}' + mock_client.messages.create.assert_called_once() + + def test_calls_ollama_backend(self): + mock_response = {"message": {"content": '{"result": "ok"}'}} + + with patch("training.distill.ollama") as mock_ollama: + mock_ollama.chat.return_value = mock_response + result = call_teacher( + system="system prompt", + user="user prompt", + backend="ollama", + model="qwen3.5:27b", + ) + + assert result == '{"result": "ok"}' + mock_ollama.chat.assert_called_once() + + +class TestGenerateExplainerPairs: + def test_generates_pairs_from_seed_data(self, sample_school_config, sample_course_pairing_data): + mock_response = json.dumps({ + "explanation": "Test explanation", + "structural_factors": ["factor1"], + "student_impact": "impact", + "advisor_recommendation": "recommendation", + "data_limitations": ["caveat"], + "related_intervention": None, + }) + + with patch("training.distill.call_teacher", return_value=mock_response): + pairs = generate_explainer_pairs( + config=sample_school_config, + seed_data=[sample_course_pairing_data], + count=2, + ) + + assert len(pairs) == 2 + assert "messages" in pairs[0] + assert len(pairs[0]["messages"]) == 3 + + def test_skips_invalid_responses(self, sample_school_config, sample_course_pairing_data): + with patch("training.distill.call_teacher", return_value="not json"): + pairs = generate_explainer_pairs( + config=sample_school_config, + seed_data=[sample_course_pairing_data], + count=3, + ) + + assert len(pairs) == 0 + + +class TestGenerateSummarizerPairs: + def test_generates_pairs_from_seed_data(self, sample_school_config, sample_query_result_data): + mock_response = json.dumps({ + "summary": "Test summary", + "key_insights": ["insight1"], + "context": "context", + "action_items": 
["action"], + "caveats": ["caveat"], + }) + + with patch("training.distill.call_teacher", return_value=mock_response): + pairs = generate_summarizer_pairs( + config=sample_school_config, + seed_data=[sample_query_result_data], + count=2, + ) + + assert len(pairs) == 2 + assert "messages" in pairs[0] +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_distill.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'training.distill'` + +- [ ] **Step 3: Write the implementation** + +Create `training/distill.py`: +```python +"""Distillation pipeline — generate ChatML training pairs via a teacher model. + +Supports two backends: + - anthropic: Claude Sonnet via Anthropic API (production quality) + - ollama: Local model via Ollama (free iteration) + +Usage: + python -m training.distill --school bishop-state [--local] +""" + +from __future__ import annotations + +import argparse +import functools +import json +import os +import time +from pathlib import Path +from typing import Any + +from training.config import get_training_data_dir, load_school_config, write_jsonl +from training.prompts import ( + EXPLAINER_STUDENT_SYSTEM, + SUMMARIZER_STUDENT_SYSTEM, + build_explainer_prompt, + build_summarizer_prompt, + build_system_prompt, +) +from training.seed import ( + format_as_chatml, + generate_synthetic_course_pairings, + generate_synthetic_query_results, + load_seed_queries, +) + +# --------------------------------------------------------------------------- +# Cost tracking +# --------------------------------------------------------------------------- + +_COST_PER_M_INPUT = 3.00 +_COST_PER_M_OUTPUT = 15.00 +_total_input_tokens = 0 +_total_output_tokens = 0 +_total_calls = 0 + + +def _track_cost(input_tokens: int, output_tokens: int) -> None: + global _total_input_tokens, _total_output_tokens, _total_calls + _total_input_tokens += input_tokens + 
_total_output_tokens += output_tokens + _total_calls += 1 + + +def _cost_so_far() -> float: + return ( + _total_input_tokens / 1_000_000 * _COST_PER_M_INPUT + + _total_output_tokens / 1_000_000 * _COST_PER_M_OUTPUT + ) + + +def _print_cost_summary() -> None: + cost = _cost_so_far() + print( + f"[cost] {_total_calls} API calls | " + f"{_total_input_tokens:,} in + {_total_output_tokens:,} out tokens | " + f"${cost:.2f} spent so far", + flush=True, + ) + + +# --------------------------------------------------------------------------- +# JSON validation +# --------------------------------------------------------------------------- + + +def validate_json(text: str | None) -> dict | None: + """Strip markdown fences and parse as JSON dict. + + Returns None if text is empty, not valid JSON, or not a dict. + """ + if not text or not isinstance(text, str) or not text.strip(): + return None + + stripped = text.strip() + + if stripped.startswith("```"): + lines = stripped.splitlines() + lines = lines[1:] + if lines and lines[-1].strip() == "```": + lines = lines[:-1] + stripped = "\n".join(lines).strip() + + try: + obj = json.loads(stripped) + except (json.JSONDecodeError, ValueError): + return None + + if not isinstance(obj, dict): + return None + + return obj + + +# --------------------------------------------------------------------------- +# Teacher model caller +# --------------------------------------------------------------------------- + + +@functools.lru_cache(maxsize=1) +def _get_anthropic_client(): + """Return a cached Anthropic client instance.""" + import anthropic + + api_key = os.environ.get("ANTHROPIC_API_KEY") + if not api_key: + raise EnvironmentError( + "ANTHROPIC_API_KEY environment variable is required for Claude distillation." 
+ ) + return anthropic.Anthropic(api_key=api_key) + + +try: + import ollama +except ImportError: + ollama = None # type: ignore[assignment] + + +def call_teacher( + system: str, + user: str, + backend: str, + model: str, +) -> str: + """Call the teacher model and return the response text. + + Args: + system: System prompt. + user: User message. + backend: "anthropic" or "ollama". + model: Model identifier. + + Returns: + The assistant response as a string. + """ + preview = user[:120].replace("\n", " ") + print(f"[api] Calling {model} ({backend}) | {preview}...", flush=True) + + if backend == "anthropic": + client = _get_anthropic_client() + message = client.messages.create( + model=model, + max_tokens=2048, + system=system, + messages=[{"role": "user", "content": user}], + ) + usage = message.usage + _track_cost(usage.input_tokens, usage.output_tokens) + print(f"[api] done {usage.input_tokens}in/{usage.output_tokens}out tokens", flush=True) + if _total_calls % 10 == 0: + _print_cost_summary() + return message.content[0].text + + elif backend == "ollama": + if ollama is None: + raise ImportError("ollama package is required for local teacher. Install with: pip install ollama") + response = ollama.chat( + model=model, + messages=[ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + ) + return response["message"]["content"] + + else: + raise ValueError(f"Unknown backend: {backend!r}. Must be 'anthropic' or 'ollama'.") + + +# --------------------------------------------------------------------------- +# Pair generators +# --------------------------------------------------------------------------- + + +def generate_explainer_pairs( + config: dict[str, Any], + seed_data: list[dict[str, Any]], + count: int, + outfile: Path | None = None, +) -> list[dict]: + """Generate explainer training pairs via teacher model distillation. + + Args: + config: Parsed school config dict. + seed_data: List of course pairing data dicts. 
+ count: Number of pairs to generate. + outfile: If provided, pairs are written incrementally. + + Returns: + List of ChatML pair dicts. + """ + distill_config = config.get("distillation", {}) + backend = distill_config.get("teacher_backend", "anthropic") + model = distill_config.get("teacher_model", "claude-sonnet-4-20250514") + + system_prompt = build_system_prompt(config) + pairs: list[dict] = [] + + fh = None + if outfile is not None: + outfile.parent.mkdir(parents=True, exist_ok=True) + fh = outfile.open("w", encoding="utf-8") + + try: + for idx in range(count): + if idx > 0 and idx % 25 == 0: + time.sleep(1) + + course_data = seed_data[idx % len(seed_data)] + teacher_prompt = build_explainer_prompt(config, course_data) + + try: + response_text = call_teacher(system_prompt, teacher_prompt, backend, model) + except Exception as exc: + print(f"[warn] Teacher call failed for explainer pair {idx}: {exc}", flush=True) + continue + + validated = validate_json(response_text) + if validated is None: + print(f"[warn] Invalid JSON for explainer pair {idx}, skipping.", flush=True) + continue + + student_user = json.dumps(course_data, ensure_ascii=False, default=str) + pair = format_as_chatml( + system=EXPLAINER_STUDENT_SYSTEM, + user=student_user, + assistant=json.dumps(validated, ensure_ascii=False), + ) + pairs.append(pair) + if fh is not None: + fh.write(json.dumps(pair, ensure_ascii=False) + "\n") + fh.flush() + print(f"[explainer] {len(pairs)}/{count} pairs generated", flush=True) + finally: + if fh is not None: + fh.close() + print(f"[explainer] Saved {len(pairs)} pairs to {outfile}", flush=True) + + return pairs + + +def generate_summarizer_pairs( + config: dict[str, Any], + seed_data: list[dict[str, Any]], + count: int, + outfile: Path | None = None, +) -> list[dict]: + """Generate summarizer training pairs via teacher model distillation. + + Args: + config: Parsed school config dict. + seed_data: List of query result data dicts. 
+ count: Number of pairs to generate. + outfile: If provided, pairs are written incrementally. + + Returns: + List of ChatML pair dicts. + """ + distill_config = config.get("distillation", {}) + backend = distill_config.get("teacher_backend", "anthropic") + model = distill_config.get("teacher_model", "claude-sonnet-4-20250514") + + system_prompt = build_system_prompt(config) + pairs: list[dict] = [] + + fh = None + if outfile is not None: + outfile.parent.mkdir(parents=True, exist_ok=True) + fh = outfile.open("w", encoding="utf-8") + + try: + for idx in range(count): + if idx > 0 and idx % 25 == 0: + time.sleep(1) + + query_data = seed_data[idx % len(seed_data)] + teacher_prompt = build_summarizer_prompt(config, query_data) + + try: + response_text = call_teacher(system_prompt, teacher_prompt, backend, model) + except Exception as exc: + print(f"[warn] Teacher call failed for summarizer pair {idx}: {exc}", flush=True) + continue + + validated = validate_json(response_text) + if validated is None: + print(f"[warn] Invalid JSON for summarizer pair {idx}, skipping.", flush=True) + continue + + student_user = json.dumps( + {"prompt": query_data["prompt"], "data": query_data["data"][:50]}, + ensure_ascii=False, + default=str, + ) + pair = format_as_chatml( + system=SUMMARIZER_STUDENT_SYSTEM, + user=student_user, + assistant=json.dumps(validated, ensure_ascii=False), + ) + pairs.append(pair) + if fh is not None: + fh.write(json.dumps(pair, ensure_ascii=False) + "\n") + fh.flush() + print(f"[summarizer] {len(pairs)}/{count} pairs generated", flush=True) + finally: + if fh is not None: + fh.close() + print(f"[summarizer] Saved {len(pairs)} pairs to {outfile}", flush=True) + + return pairs + + +# --------------------------------------------------------------------------- +# CLI entry point +# --------------------------------------------------------------------------- + + +def main(school: str, local: bool = False) -> None: + """Run distillation for a school. 
+ + Args: + school: School directory name. + local: If True, use local Ollama teacher instead of Claude. + """ + config = load_school_config(school) + + if local: + config["distillation"]["teacher_backend"] = config["distillation"].get( + "local_teacher_backend", "ollama" + ) + config["distillation"]["teacher_model"] = config["distillation"].get( + "local_teacher_model", "qwen3.5:27b" + ) + print(f"[distill] Using local teacher: {config['distillation']['teacher_model']}") + else: + print(f"[distill] Using API teacher: {config['distillation']['teacher_model']}") + + pairs_per_task = config["distillation"].get("pairs_per_task", 1500) + data_dir = get_training_data_dir(school) + pairs_dir = data_dir / "pairs" + + # Load seed queries + seed_queries = load_seed_queries(school) + + # Generate synthetic seed data + synthetic_pairings = generate_synthetic_course_pairings(config, count=pairs_per_task) + synthetic_results = generate_synthetic_query_results(config, count=pairs_per_task) + + # Explainer + print(f"\n{'='*60}") + print(f"EXPLAINER — generating {pairs_per_task} pairs") + print(f"{'='*60}") + explainer_pairs = generate_explainer_pairs( + config=config, + seed_data=synthetic_pairings, + count=pairs_per_task, + outfile=pairs_dir / "explainer.jsonl", + ) + + # Summarizer + print(f"\n{'='*60}") + print(f"SUMMARIZER — generating {pairs_per_task} pairs") + print(f"{'='*60}") + summarizer_pairs = generate_summarizer_pairs( + config=config, + seed_data=synthetic_results, + count=pairs_per_task, + outfile=pairs_dir / "summarizer.jsonl", + ) + + print(f"\n{'='*60}") + print("DISTILLATION COMPLETE") + print(f"{'='*60}") + print(f" Explainer: {len(explainer_pairs)} pairs") + print(f" Summarizer: {len(summarizer_pairs)} pairs") + _print_cost_summary() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Generate training pairs via teacher model distillation." 
+ ) + parser.add_argument("--school", required=True, help="School directory name") + parser.add_argument("--local", action="store_true", help="Use local Ollama teacher") + args = parser.parse_args() + main(args.school, local=args.local) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_distill.py -v` +Expected: All tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add training/distill.py tests/training/test_distill.py +git commit -m "feat(training): distillation pipeline with dual teacher backend support" +``` + +--- + +## Task 7: Dataset Preparation + +**Files:** +- Create: `training/prepare.py` +- Create: `tests/training/test_prepare.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/training/test_prepare.py`: +```python +"""Tests for training.prepare — filter, deduplicate, and split.""" + +import json +import pytest + +from training.prepare import ( + filter_invalid_json, + deduplicate_by_jaccard, + jaccard_similarity, + split_dataset, +) + + +class TestFilterInvalidJson: + def test_keeps_valid_pairs(self): + pairs = [ + {"messages": [ + {"role": "system", "content": "sys"}, + {"role": "user", "content": "question"}, + {"role": "assistant", "content": '{"key": "value"}'}, + ]} + ] + result = filter_invalid_json(pairs) + assert len(result) == 1 + + def test_removes_invalid_json_assistant(self): + pairs = [ + {"messages": [ + {"role": "system", "content": "sys"}, + {"role": "user", "content": "question"}, + {"role": "assistant", "content": "not json"}, + ]} + ] + result = filter_invalid_json(pairs) + assert len(result) == 0 + + def test_removes_missing_messages(self): + assert filter_invalid_json([{"no_messages": True}]) == [] + + def test_removes_empty_user(self): + pairs = [ + {"messages": [ + {"role": "system", "content": "sys"}, + {"role": "user", "content": ""}, + {"role": "assistant", "content": '{"key": "value"}'}, + 
]} + ] + result = filter_invalid_json(pairs) + assert len(result) == 0 + + +class TestJaccardSimilarity: + def test_identical_strings(self): + assert jaccard_similarity("hello world", "hello world") == 1.0 + + def test_completely_different(self): + assert jaccard_similarity("hello", "world") == 0.0 + + def test_partial_overlap(self): + result = jaccard_similarity("hello world foo", "hello world bar") + assert 0.0 < result < 1.0 + + def test_empty_string(self): + assert jaccard_similarity("", "hello") == 0.0 + + +class TestDeduplicateByJaccard: + def test_removes_exact_duplicates(self): + pairs = [ + {"messages": [{"role": "user", "content": "same question"}]}, + {"messages": [{"role": "user", "content": "same question"}]}, + {"messages": [{"role": "user", "content": "different question"}]}, + ] + result = deduplicate_by_jaccard(pairs, threshold=1.0) + assert len(result) == 2 + + def test_empty_input(self): + assert deduplicate_by_jaccard([], threshold=1.0) == [] + + def test_preserves_order(self): + pairs = [ + {"messages": [{"role": "user", "content": "first"}]}, + {"messages": [{"role": "user", "content": "second"}]}, + ] + result = deduplicate_by_jaccard(pairs, threshold=1.0) + assert result[0]["messages"][0]["content"] == "first" + + +class TestSplitDataset: + def test_split_ratios(self): + pairs = [{"id": i} for i in range(100)] + splits = split_dataset(pairs, train_ratio=0.8, val_ratio=0.1) + assert len(splits["train"]) == 80 + assert len(splits["val"]) == 10 + assert len(splits["test"]) == 10 + + def test_deterministic(self): + pairs = [{"id": i} for i in range(50)] + split1 = split_dataset(pairs, seed=42) + split2 = split_dataset(pairs, seed=42) + assert split1["train"] == split2["train"] + + def test_empty_input(self): + splits = split_dataset([]) + assert splits == {"train": [], "val": [], "test": []} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest 
tests/training/test_prepare.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'training.prepare'` + +- [ ] **Step 3: Write the implementation** + +Create `training/prepare.py`: +```python +"""Dataset preparation — filter, deduplicate, and split training pairs. + +Adapted from d4bl pipeline. Loads raw JSONL from distillation, applies +quality filtering, removes near-duplicates, and writes 80/10/10 splits. + +Usage: + python -m training.prepare --school bishop-state +""" + +from __future__ import annotations + +import argparse +import json +import random +from pathlib import Path +from typing import Any + +from training.config import ( + JACCARD_THRESHOLD, + TRAIN_RATIO, + VAL_RATIO, + get_training_data_dir, + write_jsonl, +) + + +# --------------------------------------------------------------------------- +# Pure helpers +# --------------------------------------------------------------------------- + + +def jaccard_similarity(a: str, b: str) -> float: + """Compute word-level Jaccard similarity between two strings.""" + words_a = set(a.lower().split()) + words_b = set(b.lower().split()) + if not words_a or not words_b: + return 0.0 + return len(words_a & words_b) / len(words_a | words_b) + + +def _get_user_text(pair: dict[str, Any]) -> str: + """Extract user message content from a ChatML pair.""" + for msg in pair.get("messages", []): + if msg.get("role") == "user": + return msg.get("content", "") + return "" + + +# --------------------------------------------------------------------------- +# Filtering +# --------------------------------------------------------------------------- + + +def filter_invalid_json(pairs: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Keep only pairs with valid structure and JSON-parseable assistant content.""" + valid = [] + for pair in pairs: + messages = pair.get("messages") + if not isinstance(messages, list) or not messages: + continue + if any(not isinstance(msg, dict) for msg in messages): + continue + has_user = 
any( + msg.get("role") == "user" and msg.get("content") + for msg in messages + ) + if not has_user: + continue + assistant_content = None + for msg in messages: + if msg.get("role") == "assistant": + assistant_content = msg.get("content") + break + if not isinstance(assistant_content, str) or not assistant_content: + continue + try: + json.loads(assistant_content) + except (json.JSONDecodeError, ValueError): + continue + valid.append(pair) + return valid + + +# --------------------------------------------------------------------------- +# Deduplication +# --------------------------------------------------------------------------- + + +def deduplicate_by_jaccard( + pairs: list[dict[str, Any]], + threshold: float = JACCARD_THRESHOLD, +) -> list[dict[str, Any]]: + """Remove near-duplicate pairs based on user-message Jaccard similarity.""" + if not pairs: + return pairs + + kept: list[dict[str, Any]] = [pairs[0]] + kept_word_sets: list[set] = [set(_get_user_text(pairs[0]).lower().split())] + + for pair in pairs[1:]: + candidate_words = set(_get_user_text(pair).lower().split()) + is_duplicate = any( + _jaccard_sets(candidate_words, kw) >= threshold + for kw in kept_word_sets + ) + if not is_duplicate: + kept.append(pair) + kept_word_sets.append(candidate_words) + + return kept + + +def _jaccard_sets(a: set, b: set) -> float: + if not a or not b: + return 0.0 + return len(a & b) / len(a | b) + + +# --------------------------------------------------------------------------- +# Splitting +# --------------------------------------------------------------------------- + + +def split_dataset( + pairs: list[dict[str, Any]], + train_ratio: float = TRAIN_RATIO, + val_ratio: float = VAL_RATIO, + seed: int = 42, +) -> dict[str, list[dict[str, Any]]]: + """Shuffle and split pairs into train/val/test with a deterministic seed.""" + if not pairs: + return {"train": [], "val": [], "test": []} + + shuffled = list(pairs) + rng = random.Random(seed) + rng.shuffle(shuffled) + + n = 
len(shuffled) + train_end = round(n * train_ratio) + val_end = train_end + round(n * val_ratio) + + return { + "train": shuffled[:train_end], + "val": shuffled[train_end:val_end], + "test": shuffled[val_end:], + } + + +# --------------------------------------------------------------------------- +# I/O +# --------------------------------------------------------------------------- + + +def _load_pairs(path: Path) -> list[dict[str, Any]]: + """Load newline-delimited JSON from path.""" + pairs = [] + with path.open() as fh: + for line in fh: + line = line.strip() + if line: + pairs.append(json.loads(line)) + return pairs + + +# --------------------------------------------------------------------------- +# Orchestrator +# --------------------------------------------------------------------------- + + +def process_task(school: str, task: str) -> dict[str, int]: + """Load, filter, deduplicate, and split training data for a task. + + Args: + school: School directory name. + task: Task name ("explainer" or "summarizer"). + + Returns: + Dict mapping split name to number of examples written. 
+ """ + data_dir = get_training_data_dir(school) + input_path = data_dir / "pairs" / f"{task}.jsonl" + if not input_path.exists(): + raise FileNotFoundError(f"Pairs file not found: {input_path}") + + pairs = _load_pairs(input_path) + print(f"[{task}] Loaded {len(pairs)} pairs from {input_path}") + + pairs = filter_invalid_json(pairs) + print(f"[{task}] After JSON filter: {len(pairs)} pairs") + + pairs = deduplicate_by_jaccard(pairs, threshold=JACCARD_THRESHOLD) + print(f"[{task}] After deduplication: {len(pairs)} pairs") + + splits = split_dataset(pairs) + + final_dir = data_dir / "final" / task + counts: dict[str, int] = {} + for split_name, split_pairs in splits.items(): + out_path = final_dir / f"{split_name}.jsonl" + n = write_jsonl(split_pairs, out_path) + counts[split_name] = n + print(f"[{task}] Wrote {n} examples to {out_path}") + + return counts + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def main(school: str) -> None: + """Run preparation for all tasks.""" + for task in ("explainer", "summarizer"): + try: + process_task(school, task) + except FileNotFoundError as e: + print(f"[warn] {e} — skipping") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Filter, deduplicate, and split training pairs.") + parser.add_argument("--school", required=True, help="School directory name") + args = parser.parse_args() + main(args.school) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_prepare.py -v` +Expected: All tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add training/prepare.py tests/training/test_prepare.py +git commit -m "feat(training): dataset preparation — filter, dedup, and split" +``` + +--- + +## Task 8: Eval Harness and Ship Criteria + +**Files:** +- Create: 
`training/eval.py` +- Create: `tests/training/test_eval.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/training/test_eval.py`: +```python +"""Tests for training.eval — metrics and ship criteria.""" + +import json +import pytest + +from training.eval import ( + SHIP_CRITERIA, + check_json_validity, + check_schema_adherence, + check_caveat_inclusion, + check_ship_criteria, + ShipDecision, +) + + +class TestCheckJsonValidity: + def test_all_valid(self): + outputs = ['{"key": "value"}', '{"a": 1}'] + assert check_json_validity(outputs) == 1.0 + + def test_some_invalid(self): + outputs = ['{"key": "value"}', "not json", '{"a": 1}'] + assert check_json_validity(outputs) == pytest.approx(2 / 3) + + def test_empty(self): + assert check_json_validity([]) == 0.0 + + +class TestCheckSchemaAdherence: + def test_explainer_all_valid(self, sample_explainer_output): + outputs = [json.dumps(sample_explainer_output)] + assert check_schema_adherence(outputs, "explainer") == 1.0 + + def test_explainer_missing_key(self): + incomplete = json.dumps({"explanation": "test"}) + assert check_schema_adherence([incomplete], "explainer") < 1.0 + + def test_summarizer_all_valid(self, sample_summarizer_output): + outputs = [json.dumps(sample_summarizer_output)] + assert check_schema_adherence(outputs, "summarizer") == 1.0 + + +class TestCheckCaveatInclusion: + def test_all_have_caveats(self, sample_explainer_output): + outputs = [json.dumps(sample_explainer_output)] + assert check_caveat_inclusion(outputs, "explainer") == 1.0 + + def test_missing_caveats(self): + no_caveats = json.dumps({ + "explanation": "test", + "structural_factors": [], + "student_impact": "impact", + "advisor_recommendation": "rec", + "data_limitations": [], + "related_intervention": None, + }) + assert check_caveat_inclusion([no_caveats], "explainer") == 0.0 + + +class TestShipCriteria: + def test_passes_with_good_metrics(self): + metrics = { + "json_validity": 0.98, + "schema_adherence": 0.95, + 
"caveat_inclusion": 0.92, + "factual_grounding": 0.90, + } + decision = check_ship_criteria(metrics, "explainer") + assert decision.decision == "ship" + assert len(decision.blocking_failures) == 0 + + def test_fails_with_low_json_validity(self): + metrics = { + "json_validity": 0.80, + "schema_adherence": 0.95, + "caveat_inclusion": 0.92, + "factual_grounding": 0.90, + } + decision = check_ship_criteria(metrics, "explainer") + assert decision.decision == "no_ship" + assert len(decision.blocking_failures) > 0 + + def test_ship_with_gaps(self): + metrics = { + "json_validity": 0.98, + "schema_adherence": 0.95, + "caveat_inclusion": 0.85, + "factual_grounding": 0.90, + "explanation_quality": 0.30, # Below non-blocking threshold + } + decision = check_ship_criteria(metrics, "explainer") + assert decision.decision in ("ship", "ship_with_gaps") +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_eval.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'training.eval'` + +- [ ] **Step 3: Write the implementation** + +Create `training/eval.py`: +```python +"""Evaluation harness and ship criteria for fine-tuned models. + +Runs a fine-tuned model against held-out test data and checks +whether it meets the minimum quality thresholds for deployment. 
+ +Usage: + python -m training.eval --school bishop-state +""" + +from __future__ import annotations + +import argparse +import json +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +from training.config import get_training_data_dir, load_school_config + +# --------------------------------------------------------------------------- +# Ship criteria thresholds +# --------------------------------------------------------------------------- + +SHIP_CRITERIA: dict[str, dict[str, dict]] = { + "explainer": { + "json_validity": {"min": 0.95, "blocking": True}, + "schema_adherence": {"min": 0.90, "blocking": True}, + "caveat_inclusion": {"min": 0.90, "blocking": True}, + "factual_grounding": {"min": 0.85, "blocking": True}, + "explanation_quality": {"min": 0.35, "blocking": False}, + "actionability": {"min": 0.80, "blocking": False}, + }, + "summarizer": { + "json_validity": {"min": 0.95, "blocking": True}, + "schema_adherence": {"min": 0.90, "blocking": True}, + "caveat_inclusion": {"min": 0.90, "blocking": True}, + "factual_grounding": {"min": 0.85, "blocking": True}, + "explanation_quality": {"min": 0.35, "blocking": False}, + "actionability": {"min": 0.80, "blocking": False}, + }, +} + +_EXPLAINER_REQUIRED_KEYS = { + "explanation", "structural_factors", "student_impact", + "advisor_recommendation", "data_limitations", "related_intervention", +} +_SUMMARIZER_REQUIRED_KEYS = { + "summary", "key_insights", "context", "action_items", "caveats", +} +_CAVEAT_KEY = { + "explainer": "data_limitations", + "summarizer": "caveats", +} + + +# --------------------------------------------------------------------------- +# Data classes +# --------------------------------------------------------------------------- + + +@dataclass +class CriterionFailure: + metric: str + threshold: float + actual: float | None + blocking: bool + + +@dataclass +class ShipDecision: + decision: str # "ship", "no_ship", "ship_with_gaps" + blocking_failures: 
list[CriterionFailure] = field(default_factory=list) + nonblocking_failures: list[CriterionFailure] = field(default_factory=list) + metrics_checked: int = 0 + + +# --------------------------------------------------------------------------- +# Metric computation +# --------------------------------------------------------------------------- + + +def check_json_validity(outputs: list[str]) -> float: + """Compute the fraction of outputs that parse as valid JSON dicts.""" + if not outputs: + return 0.0 + valid = 0 + for out in outputs: + try: + obj = json.loads(out) + if isinstance(obj, dict): + valid += 1 + except (json.JSONDecodeError, ValueError, TypeError): + pass + return valid / len(outputs) + + +def check_schema_adherence(outputs: list[str], task: str) -> float: + """Compute the fraction of outputs with all required keys present.""" + if not outputs: + return 0.0 + + required = _EXPLAINER_REQUIRED_KEYS if task == "explainer" else _SUMMARIZER_REQUIRED_KEYS + adherent = 0 + for out in outputs: + try: + obj = json.loads(out) + if isinstance(obj, dict) and required.issubset(obj.keys()): + adherent += 1 + except (json.JSONDecodeError, ValueError, TypeError): + pass + return adherent / len(outputs) + + +def check_caveat_inclusion(outputs: list[str], task: str) -> float: + """Compute the fraction of outputs with non-empty caveat/limitation fields.""" + if not outputs: + return 0.0 + + caveat_key = _CAVEAT_KEY.get(task, "caveats") + with_caveats = 0 + for out in outputs: + try: + obj = json.loads(out) + if isinstance(obj, dict): + caveats = obj.get(caveat_key, []) + if isinstance(caveats, list) and len(caveats) > 0: + with_caveats += 1 + except (json.JSONDecodeError, ValueError, TypeError): + pass + return with_caveats / len(outputs) + + +def check_factual_grounding(outputs: list[str], inputs: list[str]) -> float: + """Check that outputs reference values present in their corresponding inputs. 
+ + Simple heuristic: extracts numeric values from the input and checks + that at least one appears in the output. + """ + if not outputs or not inputs: + return 0.0 + + import re + + grounded = 0 + for out, inp in zip(outputs, inputs): + numbers_in_input = set(re.findall(r"\d+\.?\d*", inp)) + if not numbers_in_input: + grounded += 1 # No numbers to check against + continue + # Check if at least one input number appears in the output + if any(n in out for n in numbers_in_input): + grounded += 1 + + return grounded / len(outputs) + + +# --------------------------------------------------------------------------- +# Ship criteria checker +# --------------------------------------------------------------------------- + + +def check_ship_criteria( + metrics: dict[str, float], + task: str, +) -> ShipDecision: + """Compare metrics against ship thresholds. + + Args: + metrics: Dict of metric_name → value. + task: "explainer" or "summarizer". + + Returns: + ShipDecision with pass/fail details. + """ + criteria = SHIP_CRITERIA.get(task, {}) + blocking_failures = [] + nonblocking_failures = [] + checked = 0 + + for metric_name, spec in criteria.items(): + actual = metrics.get(metric_name) + if actual is None: + continue + checked += 1 + + threshold = spec.get("min", spec.get("max")) + blocking = spec.get("blocking", True) + + failed = False + if "min" in spec and actual < spec["min"]: + failed = True + if "max" in spec and actual > spec["max"]: + failed = True + + if failed: + failure = CriterionFailure( + metric=metric_name, + threshold=threshold, + actual=actual, + blocking=blocking, + ) + if blocking: + blocking_failures.append(failure) + else: + nonblocking_failures.append(failure) + + if blocking_failures: + decision = "no_ship" + elif nonblocking_failures: + decision = "ship_with_gaps" + else: + decision = "ship" + + return ShipDecision( + decision=decision, + blocking_failures=blocking_failures, + nonblocking_failures=nonblocking_failures, + metrics_checked=checked, + ) 
+ + +# --------------------------------------------------------------------------- +# Test set loader +# --------------------------------------------------------------------------- + + +def load_test_set(path: Path) -> list[dict]: + """Load a ChatML JSONL test set and extract input/expected pairs.""" + results = [] + with path.open() as fh: + for line in fh: + if not line.strip(): + continue + example = json.loads(line) + messages = example["messages"] + user_msg = messages[1]["content"] + assistant_msg = messages[2]["content"] + results.append({ + "input": user_msg, + "expected_raw": assistant_msg, + }) + return results + + +# --------------------------------------------------------------------------- +# Eval runner +# --------------------------------------------------------------------------- + + +def run_eval(school: str, task: str) -> ShipDecision: + """Run evaluation for a school's fine-tuned model on one task. + + Loads the test set, runs inference via Ollama, computes metrics, + and checks ship criteria. + + Args: + school: School directory name. + task: "explainer" or "summarizer". + + Returns: + ShipDecision. + """ + data_dir = get_training_data_dir(school) + test_path = data_dir / "final" / task / "test.jsonl" + + if not test_path.exists(): + raise FileNotFoundError(f"Test set not found: {test_path}") + + test_set = load_test_set(test_path) + print(f"[{task}] Loaded {len(test_set)} test examples from {test_path}") + + config = load_school_config(school) + model_name = f"{school}-{task}:{config['training']['default_model'].split(':')[1]}" + + # Run inference + try: + import ollama as ollama_client + except ImportError: + raise ImportError("ollama package required for evaluation. 
Install with: pip install ollama")

    outputs = []
    inputs = []
    for i, example in enumerate(test_set):
        try:
            response = ollama_client.chat(
                model=model_name,
                messages=[
                    {"role": "user", "content": example["input"]},
                ],
            )
            outputs.append(response["message"]["content"])
            inputs.append(example["input"])
        except Exception as exc:
            print(f"[warn] Inference failed for example {i}: {exc}")
            outputs.append("")
            inputs.append(example["input"])

        if (i + 1) % 10 == 0:
            print(f"[{task}] Evaluated {i + 1}/{len(test_set)} examples", flush=True)

    # Compute metrics
    metrics = {
        "json_validity": check_json_validity(outputs),
        "schema_adherence": check_schema_adherence(outputs, task),
        "caveat_inclusion": check_caveat_inclusion(outputs, task),
        "factual_grounding": check_factual_grounding(outputs, inputs),
    }

    # Print results
    print(f"\n[{task}] Metrics:")
    for name, value in metrics.items():
        threshold_info = SHIP_CRITERIA.get(task, {}).get(name, {})
        threshold = threshold_info.get("min", threshold_info.get("max", "?"))
        if isinstance(threshold, (int, float)):
            status = "PASS" if value >= threshold else "FAIL"
        else:
            status = "PASS"  # No numeric threshold defined for this metric
        print(f"  {name}: {value:.1%} (threshold: {threshold}) {status}")

    decision = check_ship_criteria(metrics, task)
    print(f"\n[{task}] DECISION: {decision.decision.upper()}")
    if decision.blocking_failures:
        for f in decision.blocking_failures:
            print(f"  BLOCKING: {f.metric} = {f.actual:.1%} (need {f.threshold})")
    if decision.nonblocking_failures:
        for f in decision.nonblocking_failures:
            print(f"  WARNING: {f.metric} = {f.actual:.1%} (need {f.threshold})")

    return decision


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def main(school: str) -> None:
    """Run evaluation for all tasks."""
    results = {}
    for task in ("explainer", "summarizer"):
        try:
            results[task] = run_eval(school, 
task) + except FileNotFoundError as e: + print(f"[warn] {e} — skipping") + + print(f"\n{'='*60}") + print("EVALUATION SUMMARY") + print(f"{'='*60}") + all_ship = True + for task, decision in results.items(): + status = decision.decision.upper() + print(f" {task}: {status}") + if decision.decision == "no_ship": + all_ship = False + + if all_ship: + print("\nAll adapters PASS — ready to export.") + else: + print("\nSome adapters FAILED — fix issues before exporting.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Evaluate fine-tuned models against ship criteria.") + parser.add_argument("--school", required=True, help="School directory name") + args = parser.parse_args() + main(args.school) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_eval.py -v` +Expected: All tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add training/eval.py tests/training/test_eval.py +git commit -m "feat(training): eval harness with ship criteria for model quality gates" +``` + +--- + +## Task 9: MLX Fine-Tuning Wrapper + +**Files:** +- Create: `training/finetune.py` + +This task wraps MLX's `mlx_lm` fine-tuning CLI. No unit tests for the actual training (it requires GPU time), but we test the config generation. + +- [ ] **Step 1: Write the implementation** + +Create `training/finetune.py`: +```python +"""Fine-tuning wrapper for MLX QLoRA on Apple Silicon. + +Wraps mlx_lm's LoRA fine-tuning with school-specific config. 
+ +Usage: + python -m training.finetune --school bishop-state --model 9b +""" + +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +from pathlib import Path + +from training.config import get_training_data_dir, load_school_config + +# --------------------------------------------------------------------------- +# Model name mapping +# --------------------------------------------------------------------------- + +_MODEL_MAP = { + "4b": "Qwen/Qwen3.5-4B", + "9b": "Qwen/Qwen3.5-9B", + "27b": "Qwen/Qwen3.5-27B", +} + + +def _resolve_model(model_shorthand: str) -> str: + """Resolve a shorthand like '9b' to a HuggingFace model path.""" + if model_shorthand in _MODEL_MAP: + return _MODEL_MAP[model_shorthand] + return model_shorthand + + +# --------------------------------------------------------------------------- +# Config generation +# --------------------------------------------------------------------------- + + +def build_lora_config(config: dict, task: str, data_dir: Path) -> dict: + """Build the MLX LoRA fine-tuning config dict. + + Args: + config: Parsed school config. + task: "explainer" or "summarizer". + data_dir: Path to the school's training_data directory. + + Returns: + Dict suitable for writing as JSON config for mlx_lm.lora. 
+ """ + training = config.get("training", {}) + final_dir = data_dir / "final" / task + + return { + "train": str(final_dir / "train.jsonl"), + "valid": str(final_dir / "val.jsonl"), + "test": str(final_dir / "test.jsonl"), + "lora_layers": training.get("lora_rank", 16), + "lora_parameters": { + "rank": training.get("lora_rank", 16), + "alpha": training.get("lora_alpha", 32), + "dropout": 0.05, + "scale": training.get("lora_alpha", 32) / training.get("lora_rank", 16), + }, + "learning_rate": training.get("learning_rate", 1e-4), + "batch_size": training.get("batch_size", 4), + "iters": training.get("epochs", 3) * 1000, # Approximate + "val_batches": 25, + "steps_per_eval": training.get("eval_every", 50), + "save_every": 100, + "max_seq_length": 2048, + "grad_checkpoint": True, + } + + +# --------------------------------------------------------------------------- +# Fine-tuning runner +# --------------------------------------------------------------------------- + + +def run_finetune(school: str, model: str = "9b", task: str | None = None) -> None: + """Run MLX LoRA fine-tuning for a school's adapter(s). + + Args: + school: School directory name. + model: Model shorthand ("4b", "9b") or full HF path. + task: Specific task, or None to train both adapters. 
+ """ + config = load_school_config(school) + data_dir = get_training_data_dir(school) + hf_model = _resolve_model(model) + + tasks = [task] if task else ["explainer", "summarizer"] + + for t in tasks: + print(f"\n{'='*60}") + print(f"FINE-TUNING: {t} adapter on {hf_model}") + print(f"{'='*60}") + + adapter_dir = data_dir / "models" / f"qwen3.5-{model}" / t + adapter_dir.mkdir(parents=True, exist_ok=True) + + lora_config = build_lora_config(config, t, data_dir) + config_path = adapter_dir / "lora_config.json" + config_path.write_text(json.dumps(lora_config, indent=2)) + + cmd = [ + sys.executable, "-m", "mlx_lm.lora", + "--model", hf_model, + "--adapter-path", str(adapter_dir), + "--data", str(data_dir / "final" / t), + "--train", + "--batch-size", str(lora_config["batch_size"]), + "--lora-layers", str(lora_config["lora_layers"]), + "--iters", str(lora_config["iters"]), + "--val-batches", str(lora_config["val_batches"]), + "--steps-per-eval", str(lora_config["steps_per_eval"]), + "--save-every", str(lora_config["save_every"]), + "--learning-rate", str(lora_config["learning_rate"]), + "--max-seq-length", str(lora_config["max_seq_length"]), + "--grad-checkpoint", + ] + + print(f"[finetune] Running: {' '.join(cmd[:6])}...") + result = subprocess.run(cmd, cwd=str(data_dir)) + + if result.returncode != 0: + print(f"[finetune] FAILED for {t} — exit code {result.returncode}") + else: + print(f"[finetune] SUCCESS — adapter saved to {adapter_dir}") + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Fine-tune a model for a school via MLX QLoRA.") + parser.add_argument("--school", required=True, help="School directory name") + parser.add_argument("--model", default="9b", help="Model size: 4b, 9b, or HF path") + parser.add_argument("--task", choices=["explainer", "summarizer"], 
help="Train one adapter only") + args = parser.parse_args() + run_finetune(args.school, model=args.model, task=args.task) +``` + +- [ ] **Step 2: Commit** + +```bash +git add training/finetune.py +git commit -m "feat(training): MLX QLoRA fine-tuning wrapper" +``` + +--- + +## Task 10: Ollama Export + +**Files:** +- Create: `training/export.py` + +- [ ] **Step 1: Write the implementation** + +Create `training/export.py`: +```python +"""Export fine-tuned adapters to Ollama for serving. + +Creates an Ollama Modelfile and registers the model. + +Usage: + python -m training.export --school bishop-state +""" + +from __future__ import annotations + +import argparse +import subprocess +import sys +from pathlib import Path + +from training.config import get_training_data_dir, load_school_config + +# --------------------------------------------------------------------------- +# Modelfile generation +# --------------------------------------------------------------------------- + +_MODELFILE_TEMPLATE = """FROM {base_model} +ADAPTER {adapter_path} + +PARAMETER temperature 0.3 +PARAMETER top_p 0.9 +PARAMETER num_ctx 4096 + +SYSTEM {system_prompt} +""" + + +def generate_modelfile( + base_model: str, + adapter_path: Path, + system_prompt: str, +) -> str: + """Generate an Ollama Modelfile string. + + Args: + base_model: Base model name (e.g. "qwen3.5:9b"). + adapter_path: Path to the LoRA adapter directory. + system_prompt: System prompt to bake into the model. + + Returns: + Modelfile content string. 
+ """ + return _MODELFILE_TEMPLATE.format( + base_model=base_model, + adapter_path=str(adapter_path), + system_prompt=json.dumps(system_prompt), + ) + + +# --------------------------------------------------------------------------- +# Registration +# --------------------------------------------------------------------------- + +import json + +from training.prompts import EXPLAINER_STUDENT_SYSTEM, SUMMARIZER_STUDENT_SYSTEM + +_SYSTEM_PROMPTS = { + "explainer": EXPLAINER_STUDENT_SYSTEM, + "summarizer": SUMMARIZER_STUDENT_SYSTEM, +} + + +def export_model(school: str, task: str, model: str = "9b") -> bool: + """Export a fine-tuned adapter to Ollama. + + Args: + school: School directory name. + task: "explainer" or "summarizer". + model: Model size shorthand. + + Returns: + True if registration succeeded. + """ + data_dir = get_training_data_dir(school) + adapter_dir = data_dir / "models" / f"qwen3.5-{model}" / task + + if not adapter_dir.exists(): + print(f"[export] Adapter not found: {adapter_dir}") + return False + + base_model = f"qwen3.5:{model}" + ollama_name = f"{school}-{task}:{model}" + system_prompt = _SYSTEM_PROMPTS.get(task, "") + + modelfile_content = generate_modelfile(base_model, adapter_dir, system_prompt) + modelfile_path = adapter_dir / "Modelfile" + modelfile_path.write_text(modelfile_content) + print(f"[export] Wrote Modelfile to {modelfile_path}") + + # Register with Ollama + cmd = ["ollama", "create", ollama_name, "-f", str(modelfile_path)] + print(f"[export] Registering: {' '.join(cmd)}") + + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + if result.returncode == 0: + print(f"[export] Registered: {ollama_name}") + return True + else: + print(f"[export] FAILED: {result.stderr}") + return False + except FileNotFoundError: + print("[export] Ollama CLI not found. 
Install from https://ollama.com") + return False + except subprocess.TimeoutExpired: + print("[export] Ollama create timed out after 5 minutes") + return False + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def main(school: str) -> None: + """Export all adapters for a school.""" + config = load_school_config(school) + model = config["training"]["default_model"].split(":")[1] + + results = {} + for task in ("explainer", "summarizer"): + results[task] = export_model(school, task, model=model) + + print(f"\n{'='*60}") + print("EXPORT SUMMARY") + print(f"{'='*60}") + for task, success in results.items(): + status = "OK" if success else "FAILED" + print(f" {school}-{task}:{model} — {status}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Export fine-tuned models to Ollama.") + parser.add_argument("--school", required=True, help="School directory name") + args = parser.parse_args() + main(args.school) +``` + +- [ ] **Step 2: Commit** + +```bash +git add training/export.py +git commit -m "feat(training): Ollama model export and registration" +``` + +--- + +## Task 11: Dashboard Model Client + +**Files:** +- Create: `codebenders-dashboard/lib/model-client.ts` + +- [ ] **Step 1: Write the implementation** + +Create `codebenders-dashboard/lib/model-client.ts`: +```typescript +/** + * Model client adapter — routes inference to Ollama (fine-tuned) or + * OpenAI (fallback) based on MODEL_BACKEND env var. 
+ */
+
+import { generateText } from "ai"
+import { createOpenAI } from "@ai-sdk/openai"
+
+const MODEL_BACKEND = process.env.MODEL_BACKEND || "openai"
+const SCHOOL_CODE = process.env.SCHOOL_CODE || "bishop-state"
+const OLLAMA_BASE_URL = process.env.OLLAMA_BASE_URL || "http://localhost:11434"
+
+const openai = createOpenAI({
+  apiKey: process.env.OPENAI_API_KEY || "",
+})
+
+interface ModelResponse {
+  text: string
+}
+
+async function callOllama(model: string, prompt: string): Promise<string> {
+  const response = await fetch(`${OLLAMA_BASE_URL}/api/generate`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({
+      model,
+      prompt,
+      stream: false,
+      options: {
+        temperature: 0.3,
+        num_predict: 1024,
+      },
+    }),
+  })
+
+  if (!response.ok) {
+    throw new Error(`Ollama error: ${response.status} ${response.statusText}`)
+  }
+
+  const data = await response.json()
+  return data.response
+}
+
+async function callOpenAI(prompt: string, maxTokens: number): Promise<string> {
+  const result = await generateText({
+    model: openai("gpt-4o-mini"),
+    prompt,
+    maxTokens,
+  })
+  return result.text
+}
+
+/**
+ * Generate a course pairing explanation.
+ *
+ * Routes to the school's fine-tuned explainer model via Ollama,
+ * or falls back to OpenAI GPT-4o-mini.
+ */
+export async function generateExplanation(
+  prompt: string,
+  maxTokens: number = 320,
+): Promise<string> {
+  if (MODEL_BACKEND === "ollama") {
+    const modelSize = process.env.MODEL_SIZE || "9b"
+    const model = `${SCHOOL_CODE}-explainer:${modelSize}`
+    return callOllama(model, prompt)
+  }
+  return callOpenAI(prompt, maxTokens)
+}
+
+/**
+ * Generate a query result summary.
+ *
+ * Routes to the school's fine-tuned summarizer model via Ollama,
+ * or falls back to OpenAI GPT-4o-mini. 
+ */ +export async function generateSummary( + prompt: string, + maxTokens: number = 200, +): Promise { + if (MODEL_BACKEND === "ollama") { + const modelSize = process.env.MODEL_SIZE || "9b" + const model = `${SCHOOL_CODE}-summarizer:${modelSize}` + return callOllama(model, prompt) + } + return callOpenAI(prompt, maxTokens) +} +``` + +- [ ] **Step 2: Commit** + +```bash +git add codebenders-dashboard/lib/model-client.ts +git commit -m "feat(dashboard): model client adapter for Ollama/OpenAI routing" +``` + +--- + +## Task 12: Integrate Model Client into API Routes + +**Files:** +- Modify: `codebenders-dashboard/app/api/courses/explain-pairing/route.ts` +- Modify: `codebenders-dashboard/app/api/query-summary/route.ts` + +- [ ] **Step 1: Update explain-pairing route** + +In `codebenders-dashboard/app/api/courses/explain-pairing/route.ts`, replace the inline OpenAI call with the model client. + +Find the import section and add: +```typescript +import { generateExplanation } from "@/lib/model-client" +``` + +Find the `generateText` call block (approximately lines 192-196) and replace: +```typescript +// Before: +const { text } = await generateText({ + model: openai("gpt-4o-mini"), + prompt: llmPrompt, + maxTokens: 320, +}) + +// After: +const text = await generateExplanation(llmPrompt, 320) +``` + +Remove the now-unused inline OpenAI client imports if they become unreferenced after this change. + +- [ ] **Step 2: Update query-summary route** + +In `codebenders-dashboard/app/api/query-summary/route.ts`, replace the inline OpenAI call with the model client. + +Add import: +```typescript +import { generateSummary } from "@/lib/model-client" +``` + +Find the `generateText` call (approximately lines 50-54) and replace: +```typescript +// Before: +const { text } = await generateText({ + model: openai("gpt-4o-mini"), + prompt: llmPrompt, + maxTokens: 200, +}) + +// After: +const text = await generateSummary(llmPrompt, 200) +``` + +Remove unused inline OpenAI client imports. 
+ +- [ ] **Step 3: Verify dashboard builds** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon/codebenders-dashboard && npm run build` +Expected: Build succeeds with no TypeScript errors + +- [ ] **Step 4: Commit** + +```bash +git add codebenders-dashboard/app/api/courses/explain-pairing/route.ts \ + codebenders-dashboard/app/api/query-summary/route.ts +git commit -m "feat(dashboard): route explain-pairing and query-summary through model client" +``` + +--- + +## Task 13: Run All Tests and Final Verification + +- [ ] **Step 1: Run full Python test suite** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/ -v` +Expected: All tests PASS + +- [ ] **Step 2: Verify dashboard builds** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon/codebenders-dashboard && npm run build` +Expected: Build succeeds + +- [ ] **Step 3: Verify pipeline CLI entry points** + +Run: +```bash +cd /Users/william-meroxa/Development/codebenders-datathon +venv/bin/python -m training.distill --help +venv/bin/python -m training.prepare --help +venv/bin/python -m training.finetune --help +venv/bin/python -m training.eval --help +venv/bin/python -m training.export --help +``` +Expected: Each prints usage without errors + +- [ ] **Step 4: Verify config loads end-to-end** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -c "from training.config import load_school_config; c = load_school_config('bishop-state'); print(f'School: {c[\"school\"][\"name\"]}'); print(f'Programs: {len(c[\"domain\"][\"programs\"])}'); print(f'Student columns: {len(c[\"schema\"][\"student_columns\"])}'); print(f'Course columns: {len(c[\"schema\"][\"course_columns\"])}')"` +Expected: Prints school name, program count, and column counts without errors From 29af1058a06b9dcd0276ea153e49da46b666bfb9 Mon Sep 17 00:00:00 2001 From: William Hill Date: Wed, 1 Apr 2026 19:33:38 -0400 Subject: [PATCH 05/15] docs: add 
design spec for SIS deep-link feature (#78) --- .../specs/2026-04-01-sis-deep-link-design.md | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 docs/superpowers/specs/2026-04-01-sis-deep-link-design.md diff --git a/docs/superpowers/specs/2026-04-01-sis-deep-link-design.md b/docs/superpowers/specs/2026-04-01-sis-deep-link-design.md new file mode 100644 index 0000000..c26a8e5 --- /dev/null +++ b/docs/superpowers/specs/2026-04-01-sis-deep-link-design.md @@ -0,0 +1,119 @@ +# SIS Deep-Link from Student Detail View — Design Spec + +**Date:** 2026-04-01 +**Issue:** #78 +**Scope:** Proof of concept / demo +**Branch:** `feature/sis-deep-link` (from `main`) + +## Summary + +Add a FERPA-compliant "Open in SIS" button to the student detail page that constructs a deep-link URL to the institution's Student Information System. Identity resolution happens server-side — the browser never receives the SIS student ID. This POC validates the architecture with sample data and a configurable demo URL. + +## Architecture + +``` +Browser (student detail page) + │ + ├─ GET /api/students/[guid]/sis-link + │ │ + │ ├─ Role check (x-user-role header, admin/advisor/ir only) + │ ├─ Query guid_sis_map table for sis_id + │ ├─ Build URL: SIS_BASE_URL?SIS_ID_PARAM= + │ ├─ Append audit log entry (GUID + role, never sis_id) + │ └─ Return { url } or 404 + │ + └─ window.open(url, "_blank") +``` + +The SIS ID never reaches the client. The audit log records access by GUID and role only. + +## 1. Database — `guid_sis_map` Table + +Table in the existing Postgres database: + +```sql +CREATE TABLE guid_sis_map ( + student_guid TEXT PRIMARY KEY, + sis_id TEXT NOT NULL +); +``` + +A seed script picks ~20 random GUIDs from `student_level_with_predictions` and assigns fake SIS IDs (`BSC-100001` through `BSC-100020`). This demonstrates both the happy path (button works) and the fallback (no mapping → disabled button with tooltip). + +## 2. 
Environment Configuration + +Two server-only env vars in `.env.local` (no `NEXT_PUBLIC_` prefix): + +```env +# SIS deep-link (leave blank to hide the button entirely) +SIS_BASE_URL=https://sis-demo.example.com/students +# Query param name the SIS expects (default: id) +SIS_ID_PARAM=id +``` + +When `SIS_BASE_URL` is unset, the API returns 404 and the UI hides the button — the feature is effectively disabled. + +## 3. API Route — `GET /api/students/[guid]/sis-link` + +**File:** `codebenders-dashboard/app/api/students/[guid]/sis-link/route.ts` + +**Behavior:** + +| Condition | Response | +|-----------|----------| +| `SIS_BASE_URL` unset | 404 `{ url: null }` | +| Role not in `admin, advisor, ir` | 403 `{ error: "Forbidden" }` | +| No mapping in `guid_sis_map` | 404 `{ url: null }` | +| Mapping found | 200 `{ url: "https://sis-demo.example.com/students?id=BSC-100001" }` | + +**Role gating:** Reads `x-user-role` header injected by existing middleware. No changes to `lib/roles.ts` needed — the `/api/students` prefix is already gated to `admin`, `advisor`, `ir`. + +**Audit logging:** Appends to `logs/query-history.jsonl`: + +```json +{ "event": "sis_link_accessed", "guid": "", "role": "advisor", "timestamp": "2026-04-01T12:00:00.000Z" } +``` + +The `sis_id` is never logged. + +## 4. UI — "Open in SIS" Button + +**File:** `codebenders-dashboard/app/students/[guid]/page.tsx` + +**Placement:** In the student header card, alongside the existing alert/readiness badges. + +**Visibility logic** (determined by the API response on page load): + +| API result | Button state | +|------------|--------------| +| 200 with URL | Visible and clickable — opens URL in new tab | +| 404 (no mapping) | Visible but disabled — tooltip: "No SIS record linked for this student" | +| 403 or fetch error | Hidden entirely | + +Uses existing `Button` from shadcn/ui and `ExternalLink` icon from lucide-react. No new component file needed. 
+ +## Files Changed + +| File | Change | +|------|--------| +| `operations/seed_guid_sis_map.py` | New — seed script for demo data | +| `codebenders-dashboard/.env.local` | Add `SIS_BASE_URL`, `SIS_ID_PARAM` | +| `codebenders-dashboard/app/api/students/[guid]/sis-link/route.ts` | New — server-side SIS URL builder | +| `codebenders-dashboard/app/students/[guid]/page.tsx` | Add "Open in SIS" button with fetch logic | + +## Out of Scope + +- Row-Level Security on `guid_sis_map` (not needed for POC) +- Real institution SIS integration (demo uses placeholder URL) +- `.env.example` file (can be added later) +- Supabase Edge Function alternative + +## Acceptance Criteria (from issue #78) + +- [x] `SIS_BASE_URL` env var controls whether the button appears (hidden when blank) +- [x] Button only visible to Advisor + IR + Admin roles +- [x] SIS ID is never stored in `student_level_with_predictions` or `llm_recommendations` +- [x] SIS ID is never returned by any public API endpoint (only pre-built URL returned) +- [x] Every deep-link access is logged (GUID + role, not SIS ID) +- [x] Button shows a graceful fallback if no mapping exists for a GUID +- [x] Works with any SIS that accepts a query-param student ID in a URL From 8b89b7fa5472124e2c1a5ec322b78546d9ec64d9 Mon Sep 17 00:00:00 2001 From: William Hill Date: Wed, 1 Apr 2026 19:36:55 -0400 Subject: [PATCH 06/15] docs: add implementation plan for SIS deep-link feature (#78) --- .../plans/2026-04-01-sis-deep-link.md | 455 ++++++++++++++++++ 1 file changed, 455 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-01-sis-deep-link.md diff --git a/docs/superpowers/plans/2026-04-01-sis-deep-link.md b/docs/superpowers/plans/2026-04-01-sis-deep-link.md new file mode 100644 index 0000000..128856b --- /dev/null +++ b/docs/superpowers/plans/2026-04-01-sis-deep-link.md @@ -0,0 +1,455 @@ +# SIS Deep-Link Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development 
(recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a FERPA-compliant "Open in SIS" button to the student detail page that constructs a deep-link URL server-side, keeping the SIS student ID out of the browser. + +**Architecture:** A new `guid_sis_map` table maps anonymized GUIDs to SIS IDs. A new API route (`GET /api/students/[guid]/sis-link`) performs the lookup, builds the URL server-side, logs access, and returns only the constructed URL. The student detail page fetches this endpoint and renders the button accordingly. + +**Tech Stack:** Next.js 16 (App Router), PostgreSQL (pg driver), shadcn/ui, Tailwind CSS, Python (psycopg2 for seed script) + +--- + +## File Structure + +| File | Action | Responsibility | +|------|--------|----------------| +| `operations/seed_guid_sis_map.py` | Create | Create table + seed ~20 demo mappings | +| `codebenders-dashboard/.env.local` | Modify | Add `SIS_BASE_URL`, `SIS_ID_PARAM` | +| `codebenders-dashboard/app/api/students/[guid]/sis-link/route.ts` | Create | Server-side SIS URL builder + audit log | +| `codebenders-dashboard/app/students/[guid]/page.tsx` | Modify | Add "Open in SIS" button | + +--- + +### Task 1: Create `guid_sis_map` Table and Seed Demo Data + +**Files:** +- Create: `operations/seed_guid_sis_map.py` + +- [ ] **Step 1: Write the seed script** + +Create `operations/seed_guid_sis_map.py` using the existing `db_config` and `psycopg2` pattern from `operations/db_utils.py`: + +```python +""" +Seed guid_sis_map Table +======================== +Creates the guid_sis_map table and populates it with ~20 demo mappings +for POC/demo purposes. Maps real Student_GUIDs to fake SIS IDs. 
+""" + +import psycopg2 +from psycopg2.extras import RealDictCursor +from .db_config import DB_CONFIG + + +def seed_guid_sis_map(): + """Create guid_sis_map table and seed with demo data.""" + connection = psycopg2.connect( + host=DB_CONFIG['host'], + user=DB_CONFIG['user'], + password=DB_CONFIG['password'], + dbname=DB_CONFIG['database'], + port=DB_CONFIG['port'], + cursor_factory=RealDictCursor + ) + cursor = connection.cursor() + + # Create table + cursor.execute(""" + CREATE TABLE IF NOT EXISTS guid_sis_map ( + student_guid TEXT PRIMARY KEY, + sis_id TEXT NOT NULL + ); + """) + print("✓ guid_sis_map table created/verified") + + # Pick ~20 random GUIDs from student_level_with_predictions + cursor.execute(""" + SELECT "Student_GUID" + FROM student_level_with_predictions + ORDER BY RANDOM() + LIMIT 20 + """) + guids = [row['Student_GUID'] for row in cursor.fetchall()] + + if not guids: + print("✗ No students found in student_level_with_predictions") + cursor.close() + connection.close() + return False + + # Clear existing demo data and insert fresh mappings + cursor.execute("DELETE FROM guid_sis_map") + + for i, guid in enumerate(guids, start=100001): + sis_id = f"BSC-{i}" + cursor.execute( + "INSERT INTO guid_sis_map (student_guid, sis_id) VALUES (%s, %s)", + (guid, sis_id) + ) + + connection.commit() + print(f"✓ Seeded {len(guids)} GUID → SIS ID mappings (BSC-100001 .. 
BSC-{100000 + len(guids)})") + + # Verify + cursor.execute("SELECT COUNT(*) AS count FROM guid_sis_map") + count = cursor.fetchone()['count'] + print(f"✓ Verified: {count} records in guid_sis_map") + + cursor.close() + connection.close() + return True + + +if __name__ == "__main__": + print("=" * 60) + print("SEEDING guid_sis_map TABLE") + print("=" * 60) + seed_guid_sis_map() +``` + +- [ ] **Step 2: Run the seed script** + +Run: +```bash +cd /Users/william-meroxa/Development/codebenders-datathon +source venv/bin/activate +python -m operations.seed_guid_sis_map +``` + +Expected output: +``` +============================================================ +SEEDING guid_sis_map TABLE +============================================================ +✓ Connected to database: postgres +✓ guid_sis_map table created/verified +✓ Seeded 20 GUID → SIS ID mappings (BSC-100001 .. BSC-100020) +✓ Verified: 20 records in guid_sis_map +``` + +- [ ] **Step 3: Commit** + +```bash +git add operations/seed_guid_sis_map.py +git commit -m "feat(#78): add guid_sis_map seed script for SIS deep-link POC" +``` + +--- + +### Task 2: Add Environment Variables + +**Files:** +- Modify: `codebenders-dashboard/.env.local` (append at end) + +- [ ] **Step 1: Add SIS env vars to `.env.local`** + +Append to the end of `codebenders-dashboard/.env.local`: + +```env + +# SIS Deep-Link Configuration (leave SIS_BASE_URL blank to disable) +SIS_BASE_URL=https://sis-demo.example.com/students +SIS_ID_PARAM=id +``` + +- [ ] **Step 2: Commit** + +```bash +cd /Users/william-meroxa/Development/codebenders-datathon +git add codebenders-dashboard/.env.local +git commit -m "feat(#78): add SIS deep-link env vars" +``` + +Note: `.env.local` is already gitignored. If it is, skip the commit for this file — the env vars are documented in the design spec and the API route defaults `SIS_ID_PARAM` to `"id"`. 
+ +--- + +### Task 3: Create SIS Link API Route + +**Files:** +- Create: `codebenders-dashboard/app/api/students/[guid]/sis-link/route.ts` + +- [ ] **Step 1: Create the API route** + +Create `codebenders-dashboard/app/api/students/[guid]/sis-link/route.ts`: + +```typescript +import { type NextRequest, NextResponse } from "next/server" +import { mkdir, appendFile } from "fs/promises" +import path from "path" +import { getPool } from "@/lib/db" +import type { Role } from "@/lib/roles" + +const ALLOWED_ROLES: Role[] = ["admin", "advisor", "ir"] + +const LOGS_DIR = path.join(process.cwd(), "logs") +const LOG_FILE = path.join(LOGS_DIR, "query-history.jsonl") + +export async function GET( + request: NextRequest, + { params }: { params: Promise<{ guid: string }> } +) { + // Feature disabled if SIS_BASE_URL is not configured + const sisBaseUrl = process.env.SIS_BASE_URL + if (!sisBaseUrl) { + return NextResponse.json({ url: null }, { status: 404 }) + } + + // Role check + const role = request.headers.get("x-user-role") as Role | null + if (!role || !ALLOWED_ROLES.includes(role)) { + return NextResponse.json({ error: "Forbidden" }, { status: 403 }) + } + + const { guid } = await params + if (!guid) { + return NextResponse.json({ error: "Missing student GUID" }, { status: 400 }) + } + + try { + // Look up SIS ID from mapping table + const pool = getPool() + const result = await pool.query( + "SELECT sis_id FROM guid_sis_map WHERE student_guid = $1 LIMIT 1", + [guid] + ) + + if (result.rows.length === 0) { + return NextResponse.json({ url: null }, { status: 404 }) + } + + // Build URL server-side — SIS ID never reaches the client + const sisIdParam = process.env.SIS_ID_PARAM || "id" + const sisId = result.rows[0].sis_id + const url = `${sisBaseUrl}?${encodeURIComponent(sisIdParam)}=${encodeURIComponent(sisId)}` + + // Audit log — GUID and role only, never the SIS ID + const logEntry = { + event: "sis_link_accessed", + guid, + role, + timestamp: new Date().toISOString(), + } + 
await mkdir(LOGS_DIR, { recursive: true }) + await appendFile(LOG_FILE, JSON.stringify(logEntry) + "\n", "utf8") + + return NextResponse.json({ url }) + } catch (error) { + console.error("SIS link lookup error:", error) + return NextResponse.json( + { error: "Failed to look up SIS link" }, + { status: 500 } + ) + } +} +``` + +- [ ] **Step 2: Verify the route loads** + +Start the dev server and test the endpoint: + +```bash +cd /Users/william-meroxa/Development/codebenders-datathon/codebenders-dashboard +npm run dev +``` + +Then in another terminal, test with curl (this will return 403 without auth headers, which confirms the route loads and the role check works): + +```bash +curl -s http://localhost:3000/api/students/test-guid/sis-link | jq . +``` + +Expected: `{ "error": "Forbidden" }` with status 403. + +- [ ] **Step 3: Commit** + +```bash +cd /Users/william-meroxa/Development/codebenders-datathon +git add codebenders-dashboard/app/api/students/\[guid\]/sis-link/route.ts +git commit -m "feat(#78): add SIS deep-link API route with audit logging" +``` + +--- + +### Task 4: Add "Open in SIS" Button to Student Detail Page + +**Files:** +- Modify: `codebenders-dashboard/app/students/[guid]/page.tsx` + +- [ ] **Step 1: Add SIS link state and fetch logic** + +In `codebenders-dashboard/app/students/[guid]/page.tsx`, add imports for `ExternalLink` at the top alongside the existing lucide-react imports: + +```typescript +import { ArrowLeft, ExternalLink, ShieldCheck } from "lucide-react" +``` + +Add a `Tooltip` import from shadcn/ui (if available) or we'll use the `title` attribute for the POC. 
+ +Add new state variables inside the `StudentDetailPage` component, after the existing `error` state: + +```typescript +const [sisLink, setSisLink] = useState(null) +const [sisStatus, setSisStatus] = useState<"loading" | "available" | "unavailable" | "hidden">("loading") +``` + +Add a second `useEffect` after the existing one that fetches student data, to fetch the SIS link: + +```typescript +useEffect(() => { + if (!guid) return + fetch(`/api/students/${encodeURIComponent(guid)}/sis-link`) + .then(r => { + if (r.status === 403) { + setSisStatus("hidden") + return null + } + if (r.status === 404) { + setSisStatus("unavailable") + return null + } + if (!r.ok) { + setSisStatus("hidden") + return null + } + return r.json() + }) + .then(data => { + if (data?.url) { + setSisLink(data.url) + setSisStatus("available") + } + }) + .catch(() => setSisStatus("hidden")) +}, [guid]) +``` + +- [ ] **Step 2: Add the button to the student header** + +In the same file, find the badges `
` in the student header (the `
` that contains the alert and readiness badges). Add the SIS button before the badges: + +Replace this block (around line 181): +```tsx +
+ {student.at_risk_alert && ( +``` + +With: +```tsx +
+ {sisStatus === "available" && sisLink && ( + + )} + {sisStatus === "unavailable" && ( + + )} + {student.at_risk_alert && ( +``` + +- [ ] **Step 3: Verify in the browser** + +1. Start the dev server: `npm run dev` +2. Navigate to a student detail page for a GUID that was seeded in `guid_sis_map` +3. Verify the "Open in SIS" button appears and clicking it opens the demo URL in a new tab +4. Navigate to a student NOT in `guid_sis_map` +5. Verify the button appears disabled with the tooltip text + +To find a seeded GUID for testing: +```bash +cd /Users/william-meroxa/Development/codebenders-datathon +source venv/bin/activate +python -c " +import psycopg2 +from operations.db_config import DB_CONFIG +conn = psycopg2.connect(**DB_CONFIG) +cur = conn.cursor() +cur.execute('SELECT student_guid FROM guid_sis_map LIMIT 3') +for row in cur.fetchall(): + print(row[0]) +cur.close() +conn.close() +" +``` + +- [ ] **Step 4: Verify audit log entry** + +After clicking the button, check that an audit entry was written: + +```bash +tail -1 codebenders-dashboard/logs/query-history.jsonl +``` + +Expected: a JSON line like: +```json +{"event":"sis_link_accessed","guid":"","role":"advisor","timestamp":"2026-04-01T..."} +``` + +- [ ] **Step 5: Commit** + +```bash +cd /Users/william-meroxa/Development/codebenders-datathon +git add codebenders-dashboard/app/students/\[guid\]/page.tsx +git commit -m "feat(#78): add Open in SIS button to student detail page" +``` + +--- + +### Task 5: Final Verification + +- [ ] **Step 1: Run lint** + +```bash +cd /Users/william-meroxa/Development/codebenders-datathon/codebenders-dashboard +npm run lint +``` + +Expected: no new warnings or errors. + +- [ ] **Step 2: Run build** + +```bash +cd /Users/william-meroxa/Development/codebenders-datathon/codebenders-dashboard +npm run build +``` + +Expected: build succeeds with no type errors. 
+ +- [ ] **Step 3: End-to-end walkthrough** + +Verify all acceptance criteria from issue #78: + +| Criterion | How to verify | +|-----------|---------------| +| `SIS_BASE_URL` controls button | Remove the env var, restart dev server, confirm button is hidden | +| Button only for admin/advisor/ir | Log in as a leadership/faculty user, confirm button is hidden | +| SIS ID never in public API | Check Network tab — `/sis-link` returns `{ url }` only, not the raw SIS ID | +| SIS ID never in student data | Check `/api/students/[guid]` response — no `sis_id` field | +| Deep-link access logged | Check `logs/query-history.jsonl` for `sis_link_accessed` entries | +| Graceful fallback | Visit a student without a mapping — disabled button with tooltip | +| Works with any SIS URL | Change `SIS_BASE_URL` in env, verify the URL changes | + +- [ ] **Step 4: Final commit (if any lint/build fixes needed)** + +```bash +git add -p +git commit -m "fix(#78): address lint/build issues" +``` From 7e2bd031292d03b66cb2079ec05ae8d71112ae5c Mon Sep 17 00:00:00 2001 From: William Hill Date: Wed, 1 Apr 2026 19:46:52 -0400 Subject: [PATCH 07/15] feat(#78): add guid_sis_map seed script for SIS deep-link POC --- operations/seed_guid_sis_map.py | 76 +++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 operations/seed_guid_sis_map.py diff --git a/operations/seed_guid_sis_map.py b/operations/seed_guid_sis_map.py new file mode 100644 index 0000000..a544757 --- /dev/null +++ b/operations/seed_guid_sis_map.py @@ -0,0 +1,76 @@ +""" +Seed guid_sis_map Table +======================== +Creates the guid_sis_map table and populates it with ~20 demo mappings +for POC/demo purposes. Maps real Student_GUIDs to fake SIS IDs. 
+""" + +import psycopg2 +from psycopg2.extras import RealDictCursor +from .db_config import DB_CONFIG + + +def seed_guid_sis_map(): + """Create guid_sis_map table and seed with demo data.""" + connection = psycopg2.connect( + host=DB_CONFIG['host'], + user=DB_CONFIG['user'], + password=DB_CONFIG['password'], + dbname=DB_CONFIG['database'], + port=DB_CONFIG['port'], + cursor_factory=RealDictCursor + ) + cursor = connection.cursor() + + # Create table + cursor.execute(""" + CREATE TABLE IF NOT EXISTS guid_sis_map ( + student_guid TEXT PRIMARY KEY, + sis_id TEXT NOT NULL + ); + """) + print("✓ guid_sis_map table created/verified") + + # Pick ~20 random GUIDs from student_level_with_predictions + cursor.execute(""" + SELECT "Student_GUID" + FROM student_level_with_predictions + ORDER BY RANDOM() + LIMIT 20 + """) + guids = [row['Student_GUID'] for row in cursor.fetchall()] + + if not guids: + print("✗ No students found in student_level_with_predictions") + cursor.close() + connection.close() + return False + + # Clear existing demo data and insert fresh mappings + cursor.execute("DELETE FROM guid_sis_map") + + for i, guid in enumerate(guids, start=100001): + sis_id = f"BSC-{i}" + cursor.execute( + "INSERT INTO guid_sis_map (student_guid, sis_id) VALUES (%s, %s)", + (guid, sis_id) + ) + + connection.commit() + print(f"✓ Seeded {len(guids)} GUID → SIS ID mappings (BSC-100001 .. 
BSC-{100000 + len(guids)})") + + # Verify + cursor.execute("SELECT COUNT(*) AS count FROM guid_sis_map") + count = cursor.fetchone()['count'] + print(f"✓ Verified: {count} records in guid_sis_map") + + cursor.close() + connection.close() + return True + + +if __name__ == "__main__": + print("=" * 60) + print("SEEDING guid_sis_map TABLE") + print("=" * 60) + seed_guid_sis_map() From 2e7b9e0df5f97514aaa37f2510b732f77c95775f Mon Sep 17 00:00:00 2001 From: William Hill Date: Wed, 1 Apr 2026 19:49:53 -0400 Subject: [PATCH 08/15] fix(#78): use get_connection helper and add error handling to seed script --- operations/seed_guid_sis_map.py | 95 ++++++++++++++++----------------- 1 file changed, 46 insertions(+), 49 deletions(-) diff --git a/operations/seed_guid_sis_map.py b/operations/seed_guid_sis_map.py index a544757..c15c2c6 100644 --- a/operations/seed_guid_sis_map.py +++ b/operations/seed_guid_sis_map.py @@ -5,68 +5,65 @@ for POC/demo purposes. Maps real Student_GUIDs to fake SIS IDs. """ -import psycopg2 -from psycopg2.extras import RealDictCursor -from .db_config import DB_CONFIG +from .db_utils import get_connection def seed_guid_sis_map(): """Create guid_sis_map table and seed with demo data.""" - connection = psycopg2.connect( - host=DB_CONFIG['host'], - user=DB_CONFIG['user'], - password=DB_CONFIG['password'], - dbname=DB_CONFIG['database'], - port=DB_CONFIG['port'], - cursor_factory=RealDictCursor - ) + connection = get_connection() cursor = connection.cursor() - # Create table - cursor.execute(""" - CREATE TABLE IF NOT EXISTS guid_sis_map ( - student_guid TEXT PRIMARY KEY, - sis_id TEXT NOT NULL - ); - """) - print("✓ guid_sis_map table created/verified") + try: + # Create table + cursor.execute(""" + CREATE TABLE IF NOT EXISTS guid_sis_map ( + student_guid TEXT PRIMARY KEY, + sis_id TEXT NOT NULL + ); + """) + print("✓ guid_sis_map table created/verified") - # Pick ~20 random GUIDs from student_level_with_predictions - cursor.execute(""" - SELECT 
"Student_GUID" - FROM student_level_with_predictions - ORDER BY RANDOM() - LIMIT 20 - """) - guids = [row['Student_GUID'] for row in cursor.fetchall()] + # Pick ~20 random GUIDs from student_level_with_predictions + cursor.execute(""" + SELECT "Student_GUID" + FROM student_level_with_predictions + ORDER BY RANDOM() + LIMIT 20 + """) + guids = [row['Student_GUID'] for row in cursor.fetchall()] - if not guids: - print("✗ No students found in student_level_with_predictions") - cursor.close() - connection.close() - return False + if not guids: + print("✗ No students found in student_level_with_predictions") + return False + + # Clear existing demo data and insert fresh mappings + cursor.execute("DELETE FROM guid_sis_map") - # Clear existing demo data and insert fresh mappings - cursor.execute("DELETE FROM guid_sis_map") + for i, guid in enumerate(guids, start=100001): + sis_id = f"BSC-{i}" + cursor.execute( + "INSERT INTO guid_sis_map (student_guid, sis_id) VALUES (%s, %s)", + (guid, sis_id) + ) - for i, guid in enumerate(guids, start=100001): - sis_id = f"BSC-{i}" - cursor.execute( - "INSERT INTO guid_sis_map (student_guid, sis_id) VALUES (%s, %s)", - (guid, sis_id) - ) + connection.commit() + print(f"✓ Seeded {len(guids)} GUID → SIS ID mappings (BSC-100001 .. BSC-{100000 + len(guids)})") - connection.commit() - print(f"✓ Seeded {len(guids)} GUID → SIS ID mappings (BSC-100001 .. 
BSC-{100000 + len(guids)})") + # Verify + cursor.execute("SELECT COUNT(*) AS count FROM guid_sis_map") + count = cursor.fetchone()['count'] + print(f"✓ Verified: {count} records in guid_sis_map") - # Verify - cursor.execute("SELECT COUNT(*) AS count FROM guid_sis_map") - count = cursor.fetchone()['count'] - print(f"✓ Verified: {count} records in guid_sis_map") + return True - cursor.close() - connection.close() - return True + except Exception as e: + connection.rollback() + print(f"✗ Failed to seed guid_sis_map: {e}") + return False + + finally: + cursor.close() + connection.close() if __name__ == "__main__": From b70cdda96acbf85071ede663339b4c3c444a7df0 Mon Sep 17 00:00:00 2001 From: William Hill Date: Wed, 1 Apr 2026 20:02:42 -0400 Subject: [PATCH 09/15] feat(#78): add SIS deep-link API route with audit logging --- .../app/api/students/[guid]/sis-link/route.ts | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 codebenders-dashboard/app/api/students/[guid]/sis-link/route.ts diff --git a/codebenders-dashboard/app/api/students/[guid]/sis-link/route.ts b/codebenders-dashboard/app/api/students/[guid]/sis-link/route.ts new file mode 100644 index 0000000..c01091d --- /dev/null +++ b/codebenders-dashboard/app/api/students/[guid]/sis-link/route.ts @@ -0,0 +1,68 @@ +import { type NextRequest, NextResponse } from "next/server" +import { mkdir, appendFile } from "fs/promises" +import path from "path" +import { getPool } from "@/lib/db" +import type { Role } from "@/lib/roles" + +const ALLOWED_ROLES: Role[] = ["admin", "advisor", "ir"] + +const LOGS_DIR = path.join(process.cwd(), "logs") +const LOG_FILE = path.join(LOGS_DIR, "query-history.jsonl") + +export async function GET( + request: NextRequest, + { params }: { params: Promise<{ guid: string }> } +) { + // Feature disabled if SIS_BASE_URL is not configured + const sisBaseUrl = process.env.SIS_BASE_URL + if (!sisBaseUrl) { + return NextResponse.json({ url: null }, { status: 404 }) + } + + // 
Role check + const role = request.headers.get("x-user-role") as Role | null + if (!role || !ALLOWED_ROLES.includes(role)) { + return NextResponse.json({ error: "Forbidden" }, { status: 403 }) + } + + const { guid } = await params + if (!guid) { + return NextResponse.json({ error: "Missing student GUID" }, { status: 400 }) + } + + try { + // Look up SIS ID from mapping table + const pool = getPool() + const result = await pool.query( + "SELECT sis_id FROM guid_sis_map WHERE student_guid = $1 LIMIT 1", + [guid] + ) + + if (result.rows.length === 0) { + return NextResponse.json({ url: null }, { status: 404 }) + } + + // Build URL server-side — SIS ID never reaches the client + const sisIdParam = process.env.SIS_ID_PARAM || "id" + const sisId = result.rows[0].sis_id + const url = `${sisBaseUrl}?${encodeURIComponent(sisIdParam)}=${encodeURIComponent(sisId)}` + + // Audit log — GUID and role only, never the SIS ID + const logEntry = { + event: "sis_link_accessed", + guid, + role, + timestamp: new Date().toISOString(), + } + await mkdir(LOGS_DIR, { recursive: true }) + await appendFile(LOG_FILE, JSON.stringify(logEntry) + "\n", "utf8") + + return NextResponse.json({ url }) + } catch (error) { + console.error("SIS link lookup error:", error) + return NextResponse.json( + { error: "Failed to look up SIS link" }, + { status: 500 } + ) + } +} From c06d4f82e55dd8bb082be5c9ee85d8a2b897ac19 Mon Sep 17 00:00:00 2001 From: William Hill Date: Wed, 1 Apr 2026 20:06:09 -0400 Subject: [PATCH 10/15] fix(#78): use canAccess pattern and isolate audit log writes --- .../app/api/students/[guid]/sis-link/route.ts | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/codebenders-dashboard/app/api/students/[guid]/sis-link/route.ts b/codebenders-dashboard/app/api/students/[guid]/sis-link/route.ts index c01091d..c389cde 100644 --- a/codebenders-dashboard/app/api/students/[guid]/sis-link/route.ts +++ 
b/codebenders-dashboard/app/api/students/[guid]/sis-link/route.ts @@ -2,9 +2,7 @@ import { type NextRequest, NextResponse } from "next/server" import { mkdir, appendFile } from "fs/promises" import path from "path" import { getPool } from "@/lib/db" -import type { Role } from "@/lib/roles" - -const ALLOWED_ROLES: Role[] = ["admin", "advisor", "ir"] +import { canAccess, type Role } from "@/lib/roles" const LOGS_DIR = path.join(process.cwd(), "logs") const LOG_FILE = path.join(LOGS_DIR, "query-history.jsonl") @@ -21,7 +19,7 @@ export async function GET( // Role check const role = request.headers.get("x-user-role") as Role | null - if (!role || !ALLOWED_ROLES.includes(role)) { + if (!role || !canAccess("/api/students", role)) { return NextResponse.json({ error: "Forbidden" }, { status: 403 }) } @@ -30,6 +28,8 @@ export async function GET( return NextResponse.json({ error: "Missing student GUID" }, { status: 400 }) } + let url: string + try { // Look up SIS ID from mapping table const pool = getPool() @@ -45,19 +45,7 @@ export async function GET( // Build URL server-side — SIS ID never reaches the client const sisIdParam = process.env.SIS_ID_PARAM || "id" const sisId = result.rows[0].sis_id - const url = `${sisBaseUrl}?${encodeURIComponent(sisIdParam)}=${encodeURIComponent(sisId)}` - - // Audit log — GUID and role only, never the SIS ID - const logEntry = { - event: "sis_link_accessed", - guid, - role, - timestamp: new Date().toISOString(), - } - await mkdir(LOGS_DIR, { recursive: true }) - await appendFile(LOG_FILE, JSON.stringify(logEntry) + "\n", "utf8") - - return NextResponse.json({ url }) + url = `${sisBaseUrl}?${encodeURIComponent(sisIdParam)}=${encodeURIComponent(sisId)}` } catch (error) { console.error("SIS link lookup error:", error) return NextResponse.json( @@ -65,4 +53,20 @@ export async function GET( { status: 500 } ) } + + // Audit log — GUID and role only, never the SIS ID + const logEntry = { + event: "sis_link_accessed", + guid, + role, + timestamp: 
new Date().toISOString(), + } + try { + await mkdir(LOGS_DIR, { recursive: true }) + await appendFile(LOG_FILE, JSON.stringify(logEntry) + "\n", "utf8") + } catch (auditErr) { + console.error("SIS audit log write failed:", auditErr) + } + + return NextResponse.json({ url }) } From f7cd37e2fd819095cc80c1aa36691a9b54f584f7 Mon Sep 17 00:00:00 2001 From: William Hill Date: Wed, 1 Apr 2026 20:07:23 -0400 Subject: [PATCH 11/15] feat(#78): add Open in SIS button to student detail page --- .../app/students/[guid]/page.tsx | 54 ++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/codebenders-dashboard/app/students/[guid]/page.tsx b/codebenders-dashboard/app/students/[guid]/page.tsx index 90d3a10..b4f7e7f 100644 --- a/codebenders-dashboard/app/students/[guid]/page.tsx +++ b/codebenders-dashboard/app/students/[guid]/page.tsx @@ -2,7 +2,7 @@ import { useEffect, useState } from "react" import { useParams, useRouter } from "next/navigation" -import { ArrowLeft, ShieldCheck } from "lucide-react" +import { ArrowLeft, ExternalLink, ShieldCheck } from "lucide-react" import { Button } from "@/components/ui/button" import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card" @@ -88,6 +88,8 @@ export default function StudentDetailPage() { const [student, setStudent] = useState(null) const [loading, setLoading] = useState(true) const [error, setError] = useState(null) + const [sisLink, setSisLink] = useState(null) + const [sisStatus, setSisStatus] = useState<"loading" | "available" | "unavailable" | "hidden">("loading") useEffect(() => { if (!guid) return @@ -102,6 +104,33 @@ export default function StudentDetailPage() { .catch(e => { setError(e.message); setLoading(false) }) }, [guid]) + useEffect(() => { + if (!guid) return + fetch(`/api/students/${encodeURIComponent(guid)}/sis-link`) + .then(r => { + if (r.status === 403) { + setSisStatus("hidden") + return null + } + if (r.status === 404) { + setSisStatus("unavailable") + return null + 
} + if (!r.ok) { + setSisStatus("hidden") + return null + } + return r.json() + }) + .then(data => { + if (data?.url) { + setSisLink(data.url) + setSisStatus("available") + } + }) + .catch(() => setSisStatus("hidden")) + }, [guid]) + // ─── Loading skeleton ──────────────────────────────────────────────────── if (loading) { @@ -179,6 +208,29 @@ export default function StudentDetailPage() {
+ {sisStatus === "available" && sisLink && ( + + )} + {sisStatus === "unavailable" && ( + + )} {student.at_risk_alert && ( Date: Wed, 1 Apr 2026 20:17:15 -0400 Subject: [PATCH 12/15] fix(#78): handle loading state, null URL edge case, and button styling --- codebenders-dashboard/app/students/[guid]/page.tsx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/codebenders-dashboard/app/students/[guid]/page.tsx b/codebenders-dashboard/app/students/[guid]/page.tsx index b4f7e7f..98a1527 100644 --- a/codebenders-dashboard/app/students/[guid]/page.tsx +++ b/codebenders-dashboard/app/students/[guid]/page.tsx @@ -126,6 +126,8 @@ export default function StudentDetailPage() { if (data?.url) { setSisLink(data.url) setSisStatus("available") + } else if (data !== null) { + setSisStatus("unavailable") } }) .catch(() => setSisStatus("hidden")) @@ -208,6 +210,9 @@ export default function StudentDetailPage() {
+ {sisStatus === "loading" && ( +
+ )} {sisStatus === "available" && sisLink && ( - )} - {sisStatus === "unavailable" && ( -