Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
ffd9ab2
fix: enforce Codex 1024-char description limit + auto-heal stale inst…
garrytan Mar 23, 2026
f4bbfaa
feat: CI evals on Ubicloud — 12 parallel runners + Docker image (v0.1…
garrytan Mar 23, 2026
6f1bdb6
feat: Wave 3 — community bug fixes & platform support (v0.11.6.0) (#359)
garrytan Mar 24, 2026
2c5ae38
feat: triple-voice multi-model review in /autoplan (v0.11.12.0) (#424)
garrytan Mar 24, 2026
dc5e053
feat: worktree isolation for E2E tests + infrastructure elegance (v0.…
garrytan Mar 24, 2026
3501f5d
fix: Windows browse — health-check-first ensureServer, detached start…
garrytan Mar 24, 2026
6156122
test: E2E tests for plan review report and Codex offering (v0.11.15.0…
garrytan Mar 24, 2026
64d5a3e
fix: Supabase telemetry security lockdown (v0.11.16.0) (#460)
garrytan Mar 24, 2026
3703320
fix: verify-rls.sh matches deployed policy (inserts allowed, HTTP par…
garrytan Mar 24, 2026
2b85b1d
fix: random UUID installation_id + verify-rls.sh edge cases (v0.11.16…
garrytan Mar 24, 2026
315c172
feat: 2-tier E2E test system — granular touchfiles + gate/periodic sp…
garrytan Mar 24, 2026
8500136
feat: remove trigger guard + proactive opt-out prompt (#457)
garrytan Mar 25, 2026
7e0b879
feat: test coverage gate + plan completion audit + auto-verification …
garrytan Mar 25, 2026
70c51d5
feat: universal 'one decision per question' AskUserQuestion rule (v0.…
garrytan Mar 25, 2026
9870a4e
fix: Windows browse — stdio array format for Bun compatibility (v0.11…
garrytan Mar 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
interface:
display_name: "gstack-setup-browser-cookies"
short_description: "Import cookies from your real browser (Comet, Chrome, Arc, Brave, Edge) into the headless browse session. Opens an..."
short_description: "Import cookies from your real Chromium browser into the headless browse session. Opens an interactive picker UI..."
default_prompt: "Use gstack-setup-browser-cookies for this task."
policy:
allow_implicit_invocation: true
4 changes: 4 additions & 0 deletions .github/actionlint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
self-hosted-runner:
labels:
- ubicloud-standard-2
- ubicloud-standard-8
63 changes: 63 additions & 0 deletions .github/docker/Dockerfile.ci
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# gstack CI eval runner — pre-baked toolchain + deps
# Rebuild weekly via ci-image.yml, on Dockerfile changes, or on lockfile changes
FROM ubuntu:24.04

ENV DEBIAN_FRONTEND=noninteractive

# System deps
RUN apt-get update && apt-get install -y --no-install-recommends \
git curl unzip ca-certificates jq bc gpg \
&& rm -rf /var/lib/apt/lists/*

# GitHub CLI
RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg \
| gpg --dearmor -o /usr/share/keyrings/githubcli-archive-keyring.gpg \
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" \
| tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
&& apt-get update && apt-get install -y --no-install-recommends gh \
&& rm -rf /var/lib/apt/lists/*

# Node.js 22 LTS (needed for claude CLI)
RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \
&& apt-get install -y --no-install-recommends nodejs \
&& rm -rf /var/lib/apt/lists/*

# Bun (install to /usr/local so non-root users can access it)
ENV BUN_INSTALL="/usr/local"
RUN curl -fsSL https://bun.sh/install | bash

# Claude CLI
RUN npm i -g @anthropic-ai/claude-code

# Playwright system deps (Chromium) — needed for browse E2E tests
RUN npx playwright install-deps chromium

# Pre-install dependencies (cached layer — only rebuilds when package.json changes)
COPY package.json /workspace/
WORKDIR /workspace
RUN bun install && rm -rf /tmp/*

# Install Playwright Chromium to a shared location accessible by all users
ENV PLAYWRIGHT_BROWSERS_PATH=/opt/playwright-browsers
RUN npx playwright install chromium \
&& chmod -R a+rX /opt/playwright-browsers

# Verify everything works
RUN bun --version && node --version && claude --version && jq --version && gh --version \
&& npx playwright --version

# At runtime: checkout overwrites /workspace, but node_modules persists
# if we move it out of the way and symlink back
# Save node_modules + package.json snapshot for cache validation at runtime
RUN mv /workspace/node_modules /opt/node_modules_cache \
&& cp /workspace/package.json /opt/node_modules_cache/.package.json

# Claude CLI refuses --dangerously-skip-permissions as root.
# Create a non-root user for eval runs (GH Actions overrides USER, so
# the workflow must set options.user or use gosu/su-exec at runtime).
RUN useradd -m -s /bin/bash runner \
&& chmod -R a+rX /opt/node_modules_cache \
&& mkdir -p /home/runner/.gstack && chown -R runner:runner /home/runner/.gstack \
&& chmod 1777 /tmp \
&& mkdir -p /home/runner/.bun && chown -R runner:runner /home/runner/.bun \
&& chmod -R 1777 /tmp
8 changes: 8 additions & 0 deletions .github/workflows/actionlint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
name: Workflow Lint
on: [push, pull_request]
jobs:
actionlint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: rhysd/actionlint@v1.7.11
40 changes: 40 additions & 0 deletions .github/workflows/ci-image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
name: Build CI Image
on:
# Rebuild weekly (Monday 6am UTC) to pick up CLI updates
schedule:
- cron: '0 6 * * 1'
# Rebuild on Dockerfile or lockfile changes
push:
branches: [main]
paths:
- '.github/docker/Dockerfile.ci'
- 'package.json'
# Manual trigger
workflow_dispatch:

jobs:
build:
runs-on: ubicloud-standard-2
permissions:
contents: read
packages: write
steps:
- uses: actions/checkout@v4

# Copy lockfile + package.json into Docker build context
- run: cp package.json .github/docker/

- uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- uses: docker/build-push-action@v6
with:
context: .github/docker
file: .github/docker/Dockerfile.ci
push: true
tags: |
ghcr.io/${{ github.repository }}/ci:latest
ghcr.io/${{ github.repository }}/ci:${{ github.sha }}
129 changes: 129 additions & 0 deletions .github/workflows/evals-periodic.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
name: Periodic Evals
on:
schedule:
- cron: '0 6 * * 1' # Monday 6 AM UTC
workflow_dispatch:

concurrency:
group: evals-periodic
cancel-in-progress: true

env:
IMAGE: ghcr.io/${{ github.repository }}/ci
EVALS_TIER: periodic
EVALS_ALL: 1 # Ignore diff — run all periodic tests

jobs:
build-image:
runs-on: ubicloud-standard-2
permissions:
contents: read
packages: write
outputs:
image-tag: ${{ steps.meta.outputs.tag }}
steps:
- uses: actions/checkout@v4

- id: meta
run: echo "tag=${{ env.IMAGE }}:${{ hashFiles('.github/docker/Dockerfile.ci', 'package.json') }}" >> "$GITHUB_OUTPUT"

- uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Check if image exists
id: check
run: |
if docker manifest inspect ${{ steps.meta.outputs.tag }} > /dev/null 2>&1; then
echo "exists=true" >> "$GITHUB_OUTPUT"
else
echo "exists=false" >> "$GITHUB_OUTPUT"
fi
- if: steps.check.outputs.exists == 'false'
run: cp package.json .github/docker/

- if: steps.check.outputs.exists == 'false'
uses: docker/build-push-action@v6
with:
context: .github/docker
file: .github/docker/Dockerfile.ci
push: true
tags: |
${{ steps.meta.outputs.tag }}
${{ env.IMAGE }}:latest
evals:
runs-on: ubicloud-standard-2
needs: build-image
container:
image: ${{ needs.build-image.outputs.image-tag }}
credentials:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
options: --user runner
timeout-minutes: 25
strategy:
fail-fast: false
matrix:
suite:
- name: e2e-plan
file: test/skill-e2e-plan.test.ts
- name: e2e-design
file: test/skill-e2e-design.test.ts
- name: e2e-qa-bugs
file: test/skill-e2e-qa-bugs.test.ts
- name: e2e-qa-workflow
file: test/skill-e2e-qa-workflow.test.ts
- name: e2e-review
file: test/skill-e2e-review.test.ts
- name: e2e-workflow
file: test/skill-e2e-workflow.test.ts
- name: e2e-routing
file: test/skill-routing-e2e.test.ts
- name: e2e-codex
file: test/codex-e2e.test.ts
- name: e2e-gemini
file: test/gemini-e2e.test.ts
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Fix bun temp
run: |
mkdir -p /home/runner/.cache/bun
{
echo "BUN_INSTALL_CACHE_DIR=/home/runner/.cache/bun"
echo "BUN_TMPDIR=/home/runner/.cache/bun"
echo "TMPDIR=/home/runner/.cache"
} >> "$GITHUB_ENV"
- name: Restore deps
run: |
if [ -d /opt/node_modules_cache ] && diff -q /opt/node_modules_cache/.package.json package.json >/dev/null 2>&1; then
ln -s /opt/node_modules_cache node_modules
else
bun install
fi
- run: bun run build

- name: Run ${{ matrix.suite.name }}
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
EVALS_CONCURRENCY: "40"
PLAYWRIGHT_BROWSERS_PATH: /opt/playwright-browsers
run: EVALS=1 bun test --retry 2 --concurrent --max-concurrency 40 ${{ matrix.suite.file }}

- name: Upload eval results
if: always()
uses: actions/upload-artifact@v4
with:
name: eval-periodic-${{ matrix.suite.name }}
path: ~/.gstack-dev/evals/*.json
retention-days: 90
Loading