From 0cb1942b73967b8d31aa4817b459ab5769c6721f Mon Sep 17 00:00:00 2001 From: John Myers <9696606+johntmyers@users.noreply.github.com> Date: Mon, 16 Mar 2026 11:14:57 -0700 Subject: [PATCH 1/5] perf(cluster): speed up local deploy loop and harden fast path Improve local DX by adding profile-aware fast Docker builds, deploy/baseline reporting, and resilient cluster bootstrap/deploy behavior so iterative redeploys are faster and more predictable on macOS/Linux. Made-with: Cursor --- Cargo.toml | 8 + architecture/build-containers.md | 17 ++ architecture/gateway-single-node.md | 2 +- crates/openshell-cli/src/main.rs | 13 +- crates/openshell-cli/src/run.rs | 31 ++-- crates/openshell-sandbox/src/proxy.rs | 2 +- deploy/docker/Dockerfile.cluster | 35 +++- deploy/docker/Dockerfile.gateway | 22 ++- scripts/bin/openshell | 13 +- tasks/cluster.toml | 16 ++ tasks/scripts/cluster-baseline.sh | 210 ++++++++++++++++++++++++ tasks/scripts/cluster-bootstrap.sh | 49 +++++- tasks/scripts/cluster-deploy-fast.sh | 127 +++++++++++++- tasks/scripts/cluster.sh | 8 + tasks/scripts/docker-build-cluster.sh | 50 ++++++ tasks/scripts/docker-build-component.sh | 13 +- 16 files changed, 582 insertions(+), 34 deletions(-) create mode 100755 tasks/scripts/cluster-baseline.sh diff --git a/Cargo.toml b/Cargo.toml index 215862cf..20949608 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -131,3 +131,11 @@ strip = true [profile.dev] # Faster compile times for dev builds debug = 1 + +[profile.local-fast] +# Local-only profile for faster dockerized inner-loop builds. +inherits = "dev" +opt-level = 1 +debug = 0 +codegen-units = 256 +incremental = true diff --git a/architecture/build-containers.md b/architecture/build-containers.md index 705b00d6..f2c807b6 100644 --- a/architecture/build-containers.md +++ b/architecture/build-containers.md @@ -37,6 +37,10 @@ This pulls `ghcr.io/nvidia/openshell-community/sandboxes/:latest`. `mise run cluster` is the primary development command. 
It bootstraps a cluster if one doesn't exist, then performs incremental deploys for subsequent runs. +For local (non-CI) Docker builds, OpenShell defaults to the Cargo profile +`local-fast` to reduce rebuild latency. CI keeps `release` builds by default. +Set `OPENSHELL_CARGO_PROFILE=release` locally when you need release-equivalent binaries. + The incremental deploy (`cluster-deploy-fast.sh`) fingerprints local Git changes and only rebuilds components whose files have changed: | Changed files | Rebuild triggered | @@ -58,3 +62,16 @@ mise run cluster -- supervisor # rebuild supervisor only mise run cluster -- chart # helm upgrade only mise run cluster -- all # rebuild everything ``` + +To baseline local compile and image build latency before optimization work: + +```bash +mise run cluster:baseline # cold + warm build timings +mise run cluster:baseline:full # same plus `mise run cluster` deploy timing +mise run cluster:baseline:warm # warm-only build timings +mise run cluster:baseline:warm:full # warm-only + deploy +``` + +Reports are written to `.cache/perf/` as both CSV and markdown. + +Each `mise run cluster` invocation also emits a deploy transaction report to `.cache/deploy-reports/<tx-id>.md`, including selected actions (gateway rebuild, supervisor update, helm upgrade), fingerprints, and per-step durations. diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index 679bc338..22ceff2c 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -143,7 +143,7 @@ flowchart LR The `deploy_gateway_with_logs` variant accepts an `FnMut(String)` callback for progress reporting. The CLI wraps this in a `GatewayDeployLogPanel` for interactive terminals. -**Pre-deploy check** (CLI layer in `gateway_start`): In interactive terminals, `check_existing_deployment` inspects whether a container or volume already exists. If found, the user is prompted to destroy and recreate or reuse the existing gateway. 
+**Pre-deploy check** (CLI layer in `gateway_start`): `check_existing_deployment` inspects whether a container or volume already exists. In interactive terminals, the user is prompted to destroy and recreate or reuse the existing gateway. In non-interactive mode, the command fails unless `--recreate` or `--reuse-ok` is provided explicitly. ### 2) Image readiness diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 84a323b5..0548d561 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -742,11 +742,18 @@ enum GatewayCommands { /// Destroy and recreate the gateway from scratch if one already exists. /// - /// Without this flag, an interactive prompt asks whether to recreate; - /// in non-interactive mode the existing gateway is reused silently. + /// Without this flag, an interactive prompt asks whether to recreate. + /// In non-interactive mode, the command fails unless `--reuse-ok` is set. #[arg(long)] recreate: bool, + /// Reuse an existing gateway in non-interactive mode. + /// + /// Use this in automation when you intentionally want idempotent reuse. + /// Conflicts with `--recreate`. + #[arg(long, conflicts_with = "recreate")] + reuse_ok: bool, + /// Listen on plaintext HTTP instead of mTLS. 
/// /// Use when the gateway sits behind a reverse proxy (e.g., Cloudflare @@ -1445,6 +1452,7 @@ async fn main() -> Result<()> { port, gateway_host, recreate, + reuse_ok, plaintext, disable_gateway_auth, registry_username, @@ -1458,6 +1466,7 @@ async fn main() -> Result<()> { port, gateway_host.as_deref(), recreate, + reuse_ok, plaintext, disable_gateway_auth, registry_username.as_deref(), diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 052a7de2..e55d683f 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1315,6 +1315,7 @@ pub async fn gateway_admin_deploy( port: u16, gateway_host: Option<&str>, recreate: bool, + reuse_ok: bool, disable_tls: bool, disable_gateway_auth: bool, registry_username: Option<&str>, @@ -1334,21 +1335,22 @@ pub async fn gateway_admin_deploy( }); // Check whether a gateway already exists. If so, prompt the user (unless - // --recreate was passed or we're in non-interactive mode). + // --recreate was passed). Non-interactive mode now fails by default unless + // --reuse-ok is explicitly set. let mut should_recreate = recreate; if let Some(existing) = openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? { + let status = if existing.container_running { + "running" + } else if existing.container_exists { + "stopped" + } else { + "volume only" + }; if !should_recreate { let interactive = std::io::stdin().is_terminal() && std::io::stderr().is_terminal(); if interactive { - let status = if existing.container_running { - "running" - } else if existing.container_exists { - "stopped" - } else { - "volume only" - }; eprintln!(); eprintln!( "{} Gateway '{name}' already exists ({status}).", @@ -1371,10 +1373,17 @@ pub async fn gateway_admin_deploy( eprintln!("Keeping existing gateway."); return Ok(()); } - } else { - // Non-interactive mode: reuse existing gateway silently. 
- eprintln!("Gateway '{name}' already exists, reusing."); + } else if reuse_ok { + eprintln!("Gateway '{name}' already exists ({status}), reusing (--reuse-ok)."); return Ok(()); + } else { + return Err(miette::miette!( + "Gateway '{name}' already exists ({status}).\n\ + Non-interactive mode requires explicit intent.\n\ + Re-run with one of:\n\ + --reuse-ok # keep existing gateway\n\ + --recreate # destroy and redeploy" + )); } } } diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 44234f66..1398f675 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -123,7 +123,7 @@ impl ProxyHandle { /// The proxy uses OPA for network decisions with process-identity binding /// via `/proc/net/tcp`. All connections are evaluated through OPA policy. #[allow(clippy::too_many_arguments)] - pub async fn start_with_bind_addr( + pub(crate) async fn start_with_bind_addr( policy: &ProxyPolicy, bind_addr: Option, opa_engine: Arc, diff --git a/deploy/docker/Dockerfile.cluster b/deploy/docker/Dockerfile.cluster index 49e29a98..2692c6cd 100644 --- a/deploy/docker/Dockerfile.cluster +++ b/deploy/docker/Dockerfile.cluster @@ -77,6 +77,7 @@ FROM --platform=$BUILDPLATFORM rust:1.88-slim AS supervisor-builder ARG TARGETARCH ARG BUILDARCH ARG OPENSHELL_CARGO_VERSION +ARG OPENSHELL_CARGO_PROFILE=release ARG CARGO_TARGET_CACHE_SCOPE=default ARG SCCACHE_MEMCACHED_ENDPOINT @@ -121,7 +122,14 @@ COPY proto/ proto/ RUN --mount=type=cache,id=cargo-registry-supervisor-${TARGETARCH},sharing=locked,target=/usr/local/cargo/registry \ --mount=type=cache,id=cargo-target-supervisor-${TARGETARCH}-${CARGO_TARGET_CACHE_SCOPE},sharing=locked,target=/build/target \ --mount=type=cache,id=sccache-supervisor-${TARGETARCH},sharing=locked,target=/tmp/sccache \ - . cross-build.sh && cargo_cross_build -p openshell-sandbox 2>/dev/null || true + . 
cross-build.sh && \ + if [ "${OPENSHELL_CARGO_PROFILE}" = "release" ]; then \ + cargo_cross_build --release -p openshell-sandbox 2>/dev/null || true; \ + elif [ "${OPENSHELL_CARGO_PROFILE}" = "dev" ]; then \ + cargo_cross_build -p openshell-sandbox 2>/dev/null || true; \ + else \ + cargo_cross_build --profile "${OPENSHELL_CARGO_PROFILE}" -p openshell-sandbox 2>/dev/null || true; \ + fi # Copy actual source code COPY crates/ crates/ @@ -139,9 +147,28 @@ RUN --mount=type=cache,id=cargo-registry-supervisor-${TARGETARCH},sharing=locked if [ -n "${OPENSHELL_CARGO_VERSION:-}" ]; then \ sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${OPENSHELL_CARGO_VERSION}"'"/}' Cargo.toml; \ fi && \ - cargo_cross_build --release -p openshell-sandbox && \ - mkdir -p /build/out && \ - cp "$(cross_output_dir release)/openshell-sandbox" /build/out/ + if [ "${OPENSHELL_CARGO_PROFILE}" = "release" ]; then \ + cargo_cross_build --release -p openshell-sandbox && \ + mkdir -p /build/out && \ + cp "$(cross_output_dir release)/openshell-sandbox" /build/out/; \ + elif [ "${OPENSHELL_CARGO_PROFILE}" = "dev" ]; then \ + cargo_cross_build -p openshell-sandbox && \ + mkdir -p /build/out && \ + cp "$(cross_output_dir debug)/openshell-sandbox" /build/out/; \ + else \ + cargo_cross_build --profile "${OPENSHELL_CARGO_PROFILE}" -p openshell-sandbox && \ + mkdir -p /build/out && \ + cp "$(cross_output_dir "${OPENSHELL_CARGO_PROFILE}")/openshell-sandbox" /build/out/; \ + fi + +# --------------------------------------------------------------------------- +# Stage 1e: Minimal export stage for local supervisor extraction +# --------------------------------------------------------------------------- +# Exporting directly from supervisor-builder with --output type=local copies the +# full build filesystem (including target cache) and is very slow on macOS. +# This scratch stage contains only the final binary. 
+FROM scratch AS supervisor-export +COPY --from=supervisor-builder /build/out/openshell-sandbox /openshell-sandbox # --------------------------------------------------------------------------- # Stage 2: Install NVIDIA container toolkit on Ubuntu diff --git a/deploy/docker/Dockerfile.gateway b/deploy/docker/Dockerfile.gateway index 05d2a46f..1ffec15e 100644 --- a/deploy/docker/Dockerfile.gateway +++ b/deploy/docker/Dockerfile.gateway @@ -11,6 +11,7 @@ FROM --platform=$BUILDPLATFORM rust:1.88-slim AS builder ARG TARGETARCH ARG BUILDARCH ARG OPENSHELL_CARGO_VERSION +ARG OPENSHELL_CARGO_PROFILE=release ARG CARGO_TARGET_CACHE_SCOPE=default # Install build dependencies @@ -55,7 +56,14 @@ COPY proto/ proto/ RUN --mount=type=cache,id=cargo-registry-gateway-${TARGETARCH},sharing=locked,target=/usr/local/cargo/registry \ --mount=type=cache,id=cargo-target-gateway-${TARGETARCH}-${CARGO_TARGET_CACHE_SCOPE},sharing=locked,target=/build/target \ --mount=type=cache,id=sccache-gateway-${TARGETARCH},sharing=locked,target=/tmp/sccache \ - . cross-build.sh && cargo_cross_build --release -p openshell-server 2>/dev/null || true + . 
cross-build.sh && \ + if [ "${OPENSHELL_CARGO_PROFILE}" = "release" ]; then \ + cargo_cross_build --release -p openshell-server 2>/dev/null || true; \ + elif [ "${OPENSHELL_CARGO_PROFILE}" = "dev" ]; then \ + cargo_cross_build -p openshell-server 2>/dev/null || true; \ + else \ + cargo_cross_build --profile "${OPENSHELL_CARGO_PROFILE}" -p openshell-server 2>/dev/null || true; \ + fi # Copy actual source code COPY crates/ crates/ @@ -75,8 +83,16 @@ RUN --mount=type=cache,id=cargo-registry-gateway-${TARGETARCH},sharing=locked,ta if [ -n "${OPENSHELL_CARGO_VERSION:-}" ]; then \ sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${OPENSHELL_CARGO_VERSION}"'"/}' Cargo.toml; \ fi && \ - cargo_cross_build --release -p openshell-server && \ - cp "$(cross_output_dir release)/openshell-server" /build/openshell-server + if [ "${OPENSHELL_CARGO_PROFILE}" = "release" ]; then \ + cargo_cross_build --release -p openshell-server && \ + cp "$(cross_output_dir release)/openshell-server" /build/openshell-server; \ + elif [ "${OPENSHELL_CARGO_PROFILE}" = "dev" ]; then \ + cargo_cross_build -p openshell-server && \ + cp "$(cross_output_dir debug)/openshell-server" /build/openshell-server; \ + else \ + cargo_cross_build --profile "${OPENSHELL_CARGO_PROFILE}" -p openshell-server && \ + cp "$(cross_output_dir "${OPENSHELL_CARGO_PROFILE}")/openshell-server" /build/openshell-server; \ + fi # Stage 2: Runtime (uses target platform) # NVIDIA hardened Ubuntu base for supply chain consistency. diff --git a/scripts/bin/openshell b/scripts/bin/openshell index 8b8a9c21..5171bd9c 100755 --- a/scripts/bin/openshell +++ b/scripts/bin/openshell @@ -25,8 +25,12 @@ else # Current HEAD commit (detects branch switches, pulls, rebases) current_head=$(git rev-parse HEAD 2>/dev/null || echo "unknown") - # Collect dirty (modified, staged, untracked) files - mapfile -t changed_files < <( + # Collect dirty (modified, staged, untracked) files. 
+ # Use a bash-3-compatible read loop (macOS default bash has no mapfile). + changed_files=() + while IFS= read -r path; do + changed_files+=("$path") + done < <( { git diff --name-only 2>/dev/null git diff --name-only --cached 2>/dev/null @@ -95,7 +99,10 @@ if [[ "$needs_build" == "1" ]]; then cd "$PROJECT_ROOT" new_head=$(git rev-parse HEAD 2>/dev/null || echo "unknown") # Recompute fingerprint of remaining dirty files (build may not change them) - mapfile -t post_files < <( + post_files=() + while IFS= read -r path; do + post_files+=("$path") + done < <( { git diff --name-only 2>/dev/null git diff --name-only --cached 2>/dev/null diff --git a/tasks/cluster.toml b/tasks/cluster.toml index debda04c..7bc516e9 100644 --- a/tasks/cluster.toml +++ b/tasks/cluster.toml @@ -34,3 +34,19 @@ hide = true description = "Tag and push gateway image to pull registry" run = "tasks/scripts/cluster-push-component.sh gateway" hide = true + +["cluster:baseline"] +description = "Capture cold/warm baseline timings for local builds" +run = "tasks/scripts/cluster-baseline.sh --mode both" + +["cluster:baseline:full"] +description = "Capture baseline timings including cluster deploy" +run = "tasks/scripts/cluster-baseline.sh --mode both --with-deploy" + +["cluster:baseline:warm"] +description = "Capture warm-path baseline timings for local builds" +run = "tasks/scripts/cluster-baseline.sh --mode warm" + +["cluster:baseline:warm:full"] +description = "Capture warm-path baseline timings including cluster deploy" +run = "tasks/scripts/cluster-baseline.sh --mode warm --with-deploy" diff --git a/tasks/scripts/cluster-baseline.sh b/tasks/scripts/cluster-baseline.sh new file mode 100755 index 00000000..6c6cf98a --- /dev/null +++ b/tasks/scripts/cluster-baseline.sh @@ -0,0 +1,210 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0
+
+set -uo pipefail  # errexit (-e) is not enabled: run_step counts failed steps in FAILURES and the run continues
+
+MODE="both"
+WITH_DEPLOY=0
+OUTPUT_DIR="${PERF_OUTPUT_DIR:-.cache/perf}"
+RUN_ID="$(date +%Y%m%d-%H%M%S)"
+FAILURES=0  # number of failed steps; a non-zero count makes the script exit 1 at the end
+CARGO_BUILD_PROFILE="${OPENSHELL_CARGO_PROFILE:-local-fast}"
+
+usage() {  # Print the CLI help text to stdout.
+  cat <<'EOF'
+Usage: tasks/scripts/cluster-baseline.sh [options]
+
+Capture local baseline timings for:
+  - CLI compile (cargo build -p openshell-cli)
+  - Gateway image build
+  - Supervisor-only build stage
+  - Cluster image build
+  - Optional cluster deploy
+
+Options:
+  --mode <cold|warm|both>  Which measurement passes to run (default: both)
+  --with-deploy            Include `mise run --skip-deps cluster`
+  --output-dir <dir>       Output directory for CSV/markdown (default: .cache/perf)
+  -h, --help               Show this help text
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --mode)
+      MODE="${2:-}"
+      shift 2 || { echo "Missing value for --mode" >&2; usage >&2; exit 1; }  # a failed shift consumes nothing; without this guard the loop never ends
+      ;;
+    --with-deploy)
+      WITH_DEPLOY=1
+      shift
+      ;;
+    --output-dir)
+      OUTPUT_DIR="${2:-}"
+      shift 2 || { echo "Missing value for --output-dir" >&2; usage >&2; exit 1; }  # same guard as --mode
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+if [[ "${MODE}" != "cold" && "${MODE}" != "warm" && "${MODE}" != "both" ]]; then
+  echo "Invalid --mode value: ${MODE}" >&2
+  exit 1
+fi
+
+mkdir -p "${OUTPUT_DIR}" || { echo "Cannot create output directory: '${OUTPUT_DIR}'" >&2; exit 1; }  # errexit is off, so fail explicitly
+
+CSV_FILE="${OUTPUT_DIR}/cluster-baseline-${RUN_ID}.csv"
+SUMMARY_FILE="${OUTPUT_DIR}/cluster-baseline-${RUN_ID}.md"
+echo "run,category,step,status,duration_s" > "${CSV_FILE}"
+
+normalize_arch() {  # Map x86_64/aarch64 to amd64/arm64; any other value is echoed unchanged.
+  case "$1" in
+    x86_64) echo "amd64" ;;
+    aarch64) echo "arm64" ;;
+    *) echo "$1" ;;
+  esac
+}
+
+record_result() {  # Append one row (run, category, step, status, duration_s) to CSV_FILE.
+  local run_label=$1
+  local category=$2
+  local step=$3
+  local status=$4
+  local duration_s=$5
+
+  echo "${run_label},${category},${step},${status},${duration_s}" >> "${CSV_FILE}"
+}
+
+run_step() {  # Args: run_label category step command [VAR=value ...]; times the command and records ok/fail.
+  local run_label=$1
+  local category=$2
+  local step=$3
+  local command=$4
+  shift 4  # the remaining arguments are VAR=value pairs handed to env below
+
+  local start_s end_s duration_s status
+  start_s=$(date +%s)
+
+  echo ""
+  echo "[${run_label}] ${category}/${step}"
+  echo "  ${command}"
+
+  if env "$@" bash -lc "${command}"; then
+    status="ok"
+  else
+    status="fail"
+    FAILURES=$((FAILURES + 1))
+  fi
+
+  end_s=$(date +%s)
+  duration_s=$((end_s - start_s))  # whole-second resolution (date +%s)
+  record_result "${run_label}" "${category}" "${step}" "${status}" "${duration_s}"
+}
+
+run_pass() {  # Args: run_label scope_seed; runs every measured step once, using caches keyed by scope_seed.
+  local run_label=$1
+  local scope_seed=$2
+
+  local cli_target_dir gateway_cache_dir cluster_cache_dir supervisor_output_dir
+  local image_tag docker_arch
+  local supervisor_version_arg supervisor_profile_arg cargo_version
+
+  cli_target_dir=".cache/perf/target-${scope_seed}"
+  gateway_cache_dir=".cache/perf/buildkit-gateway-${scope_seed}"
+  cluster_cache_dir=".cache/perf/buildkit-cluster-${scope_seed}"
+  supervisor_output_dir="${OUTPUT_DIR}/supervisor-${scope_seed}"
+  image_tag="perf-${scope_seed}"
+  docker_arch="$(normalize_arch "$(docker version --format '{{.Server.Arch}}')")"
+  supervisor_version_arg=""
+  supervisor_profile_arg=" --build-arg OPENSHELL_CARGO_PROFILE=${CARGO_BUILD_PROFILE}"
+  cargo_version=$(uv run python tasks/scripts/release.py get-version --cargo 2>/dev/null || true)  # best effort: empty on failure
+  if [[ -n "${cargo_version}" ]]; then
+    supervisor_version_arg=" --build-arg OPENSHELL_CARGO_VERSION=${cargo_version}"
+  fi
+
+  if [[ "${run_label}" == "cold" ]]; then
+    rm -rf "${cli_target_dir}" "${gateway_cache_dir}" "${cluster_cache_dir}" "${supervisor_output_dir}"  # cold pass: drop this seed's cache/output dirs first
+  fi
+
+  run_step "${run_label}" "rust" "cli_debug" \
+    "cargo build -p openshell-cli" \
+    "CARGO_TARGET_DIR=${cli_target_dir}"
+
+  run_step "${run_label}" "docker" "gateway_image" \
+    "tasks/scripts/docker-build-component.sh gateway" \
+    "RUST_TOOLCHAIN_SCOPE=${scope_seed}" \
+    "DOCKER_BUILD_CACHE_DIR=${gateway_cache_dir}" \
+    "OPENSHELL_CARGO_PROFILE=${CARGO_BUILD_PROFILE}" \
+    "IMAGE_TAG=${image_tag}"
+
+  run_step "${run_label}" "docker" "supervisor_stage" \
+    "docker buildx build --file deploy/docker/Dockerfile.cluster --target supervisor-export --build-arg BUILDARCH=${docker_arch} --build-arg TARGETARCH=${docker_arch} --build-arg CARGO_TARGET_CACHE_SCOPE=${scope_seed}${supervisor_profile_arg}${supervisor_version_arg} --output type=local,dest=${supervisor_output_dir} --platform linux/${docker_arch} ." \
+    "DOCKER_BUILD_CACHE_DIR=${cluster_cache_dir}" \
+    "IMAGE_TAG=${image_tag}"
+
+  run_step "${run_label}" "docker" "cluster_image" \
+    "tasks/scripts/docker-build-cluster.sh" \
+    "DOCKER_BUILD_CACHE_DIR=${cluster_cache_dir}" \
+    "OPENSHELL_CARGO_PROFILE=${CARGO_BUILD_PROFILE}" \
+    "IMAGE_TAG=${image_tag}"
+
+  if [[ "${WITH_DEPLOY}" == "1" ]]; then
+    run_step "${run_label}" "deploy" "cluster_task" \
+      "mise run --skip-deps cluster" \
+      "OPENSHELL_CARGO_PROFILE=${CARGO_BUILD_PROFILE}"
+  fi
+}
+
+SCOPE_SEED="baseline-${RUN_ID}"  # per-run seed, so cold passes get cache dirs no earlier run has used
+
+case "${MODE}" in
+  cold)
+    run_pass "cold" "${SCOPE_SEED}"
+    ;;
+  warm)
+    run_pass "warm" "baseline-warm"  # constant seed: warm-only runs resolve to the same cache dirs on every invocation
+    ;;
+  both)
+    run_pass "cold" "${SCOPE_SEED}"
+    run_pass "warm" "${SCOPE_SEED}"  # same seed as the cold pass, so it reuses what the cold pass just built
+    ;;
+esac
+
+{  # Render the CSV rows as a markdown summary.
+  echo "# Cluster Baseline Report"
+  echo ""
+  echo "- run_id: \`${RUN_ID}\`"
+  echo "- mode: \`${MODE}\`"
+  echo "- include_deploy: \`${WITH_DEPLOY}\`"
+  echo "- cargo_build_profile: \`${CARGO_BUILD_PROFILE}\`"
+  echo "- csv: \`${CSV_FILE}\`"
+  echo ""
+  echo "| run | category | step | status | duration_s |"
+  echo "|---|---|---|---|---|"
+  tail -n +2 "${CSV_FILE}" | while IFS=, read -r run_label category step status duration_s; do
+    echo "| ${run_label} | ${category} | ${step} | ${status} | ${duration_s} |"
+  done
+} > "${SUMMARY_FILE}"
+
+echo ""
+echo "Baseline report written:"
+echo "  ${SUMMARY_FILE}"
+echo "  ${CSV_FILE}"
+
+if [[ "${FAILURES}" -gt 0 ]]; then
+  echo "Completed with ${FAILURES} failed step(s)." >&2
+  exit 1
+fi
+
+echo "Completed successfully."
diff --git a/tasks/scripts/cluster-bootstrap.sh b/tasks/scripts/cluster-bootstrap.sh index f354daea..364476cc 100755 --- a/tasks/scripts/cluster-bootstrap.sh +++ b/tasks/scripts/cluster-bootstrap.sh @@ -16,6 +16,16 @@ if [ "${MODE}" != "build" ] && [ "${MODE}" != "fast" ]; then exit 1 fi +DEPLOY_TX_ID=${DEPLOY_TX_ID:-"tx-$(date +%Y%m%d-%H%M%S)-$RANDOM"} +DEPLOY_REPORT_DIR=${DEPLOY_REPORT_DIR:-.cache/deploy-reports} +DEPLOY_REPORT_FILE="${DEPLOY_REPORT_DIR}/${DEPLOY_TX_ID}.md" +overall_start=$(date +%s) +recreated_cluster=0 +pushed_gateway=0 +built_cluster_image=0 + +mkdir -p "${DEPLOY_REPORT_DIR}" + if [ -n "${IMAGE_TAG:-}" ]; then IMAGE_TAG=${IMAGE_TAG} else @@ -122,8 +132,8 @@ is_local_registry_host() { } registry_reachable() { - curl -4 -fsS --max-time 2 "http://127.0.0.1:5000/v2/" >/dev/null 2>&1 || \ - curl -4 -fsS --max-time 2 "http://localhost:5000/v2/" >/dev/null 2>&1 + curl -fsS --max-time 2 "http://127.0.0.1:5000/v2/" >/dev/null 2>&1 || \ + curl -fsS --max-time 2 "http://localhost:5000/v2/" >/dev/null 2>&1 } wait_for_registry_ready() { @@ -172,12 +182,23 @@ ensure_local_registry() { return fi + # Docker Desktop occasionally leaves published ports in a bad state after + # daemon/network churn. Recreate once before failing hard. + echo "Local registry probe failed; recreating registry container and retrying..." >&2 + docker rm -f "${LOCAL_REGISTRY_CONTAINER}" >/dev/null 2>&1 || true + docker run -d --restart=always --name "${LOCAL_REGISTRY_CONTAINER}" -p 5000:5000 registry:2 >/dev/null + + if wait_for_registry_ready 20 1; then + return + fi + if registry_reachable; then return fi echo "Error: local registry is not reachable at ${REGISTRY_HOST}." >&2 echo " Ensure a registry is running on port 5000 (e.g. docker run -d --name openshell-local-registry -p 5000:5000 registry:2)." 
>&2 + echo " Active docker context: $(docker context show 2>/dev/null || echo unknown)" >&2 docker ps -a >&2 || true docker logs "${LOCAL_REGISTRY_CONTAINER}" >&2 || true exit 1 @@ -217,6 +238,7 @@ if [ "${MODE}" = "fast" ]; then if docker inspect "${CONTAINER_NAME}" >/dev/null 2>&1 || docker volume inspect "${VOLUME_NAME}" >/dev/null 2>&1; then echo "Recreating cluster '${CLUSTER_NAME}' from scratch..." openshell gateway destroy --name "${CLUSTER_NAME}" + recreated_cluster=1 fi fi @@ -224,6 +246,7 @@ if [ "${SKIP_IMAGE_PUSH:-}" = "1" ]; then echo "Skipping image push (SKIP_IMAGE_PUSH=1; images already in registry)." elif [ "${MODE}" = "build" ] || [ "${MODE}" = "fast" ]; then tasks/scripts/cluster-push-component.sh gateway + pushed_gateway=1 fi # Build the cluster image so it contains the latest Helm chart, manifests, @@ -231,6 +254,7 @@ fi # always starts with the correct chart version. if [ "${SKIP_CLUSTER_IMAGE_BUILD:-}" != "1" ]; then tasks/scripts/docker-build-cluster.sh + built_cluster_image=1 fi # In fast/build modes, use the locally-built cluster image rather than the @@ -268,5 +292,26 @@ fi DEPLOY_FAST_STATE_FILE=${DEPLOY_FAST_STATE_FILE:-.cache/cluster-deploy-fast.state} rm -f "${DEPLOY_FAST_STATE_FILE}" +overall_end=$(date +%s) +total_duration=$((overall_end - overall_start)) + +cat > "${DEPLOY_REPORT_FILE}" </dev/null 2>&1; then + sha256sum "$1" | awk '{print substr($1, 1, 16)}' + else + shasum -a 256 "$1" | awk '{print substr($1, 1, 16)}' + fi +} + +sha256_16_stdin() { + if command -v sha256sum >/dev/null 2>&1; then + sha256sum | awk '{print substr($1, 1, 16)}' + else + shasum -a 256 | awk '{print substr($1, 1, 16)}' + fi +} + +detect_rust_scope() { + local dockerfile=$1 + local rust_from + rust_from=$(grep -E '^FROM --platform=\$BUILDPLATFORM rust:[^ ]+' "$dockerfile" | head -n1 | sed -E 's/^FROM --platform=\$BUILDPLATFORM rust:([^ ]+).*/\1/' || true) + if [[ -n "${rust_from}" ]]; then + echo "rust-${rust_from}" + return + fi + + if grep -q 
"rustup.rs" "$dockerfile"; then + echo "rustup-stable" + return + fi + + echo "no-rust" +} + +change_detection_duration=0 +builds_duration=0 +supervisor_duration=0 +image_push_duration=0 +helm_upgrade_duration=0 +gateway_rollout_duration=0 +total_duration=0 + +mkdir -p "${DEPLOY_REPORT_DIR}" + if ! docker ps -q --filter "name=^${CONTAINER_NAME}$" --filter "health=healthy" | grep -q .; then echo "Error: Cluster container '${CONTAINER_NAME}' is not running or not healthy." echo "Start the cluster first with: mise run cluster" @@ -48,6 +101,7 @@ build_gateway=0 build_supervisor=0 needs_helm_upgrade=0 explicit_target=0 +targets_requested="${*:-auto}" previous_gateway_fingerprint="" previous_supervisor_fingerprint="" @@ -88,7 +142,9 @@ fi declare -a changed_files=() detect_start=$(date +%s) -mapfile -t changed_files < <( +while IFS= read -r path; do + changed_files+=("${path}") +done < <( { git diff --name-only git diff --name-only --cached @@ -97,6 +153,8 @@ mapfile -t changed_files < <( ) detect_end=$(date +%s) log_duration "Change detection" "${detect_start}" "${detect_end}" +change_detection_duration=$((detect_end - detect_start)) +changed_files_count=${#changed_files[@]} # Track the cluster container ID so we can detect when the cluster was # recreated (e.g. via bootstrap). A new container means the k3s state is @@ -288,6 +346,7 @@ echo "Fast deploy plan:" echo " build gateway: ${build_gateway}" echo " build supervisor: ${build_supervisor}" echo " helm upgrade: ${needs_helm_upgrade}" +echo " cargo profile: ${CARGO_BUILD_PROFILE}" if [[ "${explicit_target}" == "0" && "${build_gateway}" == "0" && "${build_supervisor}" == "0" && "${needs_helm_upgrade}" == "0" && "${DEPLOY_FAST_MODE}" != "full" ]]; then echo "No new local changes since last deploy." 
@@ -316,8 +375,8 @@ if [[ "${build_supervisor}" == "1" ]]; then CLUSTER_ARCH=$(docker image inspect --format '{{.Architecture}}' "${_cluster_image}" 2>/dev/null || echo "amd64") # Build the supervisor binary using docker buildx with a lightweight build. - # We use the same cross-build.sh helpers as the full cluster image but only - # compile openshell-sandbox, then extract the binary via --output. + # Use the dedicated supervisor-export target to avoid copying the entire + # builder filesystem (target cache) back to the host. SUPERVISOR_BUILD_DIR=$(mktemp -d) trap 'rm -rf "${SUPERVISOR_BUILD_DIR}"' EXIT @@ -332,11 +391,19 @@ if [[ "${build_supervisor}" == "1" ]]; then fi fi + SUPERVISOR_PROFILE_ARGS=(--build-arg "OPENSHELL_CARGO_PROFILE=${CARGO_BUILD_PROFILE}") + SUPERVISOR_RUST_SCOPE=${RUST_TOOLCHAIN_SCOPE:-$(detect_rust_scope "deploy/docker/Dockerfile.cluster")} + SUPERVISOR_LOCK_HASH=$(sha256_16 Cargo.lock) + SUPERVISOR_SCOPE_INPUT="v2|cluster|base|${SUPERVISOR_LOCK_HASH}|${SUPERVISOR_RUST_SCOPE}|${CARGO_BUILD_PROFILE}" + SUPERVISOR_CACHE_SCOPE=$(printf '%s' "${SUPERVISOR_SCOPE_INPUT}" | sha256_16_stdin) + docker buildx build \ --file deploy/docker/Dockerfile.cluster \ - --target supervisor-builder \ + --target supervisor-export \ --build-arg "BUILDARCH=$(docker version --format '{{.Server.Arch}}')" \ --build-arg "TARGETARCH=${CLUSTER_ARCH}" \ + --build-arg "CARGO_TARGET_CACHE_SCOPE=${SUPERVISOR_CACHE_SCOPE}" \ + ${SUPERVISOR_PROFILE_ARGS[@]+"${SUPERVISOR_PROFILE_ARGS[@]}"} \ ${SUPERVISOR_VERSION_ARGS[@]+"${SUPERVISOR_VERSION_ARGS[@]}"} \ --output "type=local,dest=${SUPERVISOR_BUILD_DIR}" \ --platform "linux/${CLUSTER_ARCH}" \ @@ -344,17 +411,19 @@ if [[ "${build_supervisor}" == "1" ]]; then # Copy the built binary into the running k3s container docker exec "${CONTAINER_NAME}" mkdir -p /opt/openshell/bin - docker cp "${SUPERVISOR_BUILD_DIR}/build/out/openshell-sandbox" \ + docker cp "${SUPERVISOR_BUILD_DIR}/openshell-sandbox" \ 
"${CONTAINER_NAME}:/opt/openshell/bin/openshell-sandbox" docker exec "${CONTAINER_NAME}" chmod 755 /opt/openshell/bin/openshell-sandbox built_components+=("supervisor") supervisor_end=$(date +%s) log_duration "Supervisor build + deploy" "${supervisor_start}" "${supervisor_end}" + supervisor_duration=$((supervisor_end - supervisor_start)) fi build_end=$(date +%s) log_duration "Builds" "${build_start}" "${build_end}" +builds_duration=$((build_end - build_start)) # Push rebuilt gateway image to local registry. declare -a pushed_images=() @@ -373,6 +442,7 @@ if [[ "${#pushed_images[@]}" -gt 0 ]]; then done push_end=$(date +%s) log_duration "Image push" "${push_start}" "${push_end}" + image_push_duration=$((push_end - push_start)) fi # Evict rebuilt gateway image from k3s containerd cache so new pods pull @@ -428,6 +498,7 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then ${helm_wait_args}" helm_end=$(date +%s) log_duration "Helm upgrade" "${helm_start}" "${helm_end}" + helm_upgrade_duration=$((helm_end - helm_start)) fi if [[ "${build_gateway}" == "1" ]]; then @@ -444,6 +515,7 @@ if [[ "${build_gateway}" == "1" ]]; then fi rollout_end=$(date +%s) log_duration "Gateway rollout" "${rollout_start}" "${rollout_end}" + gateway_rollout_duration=$((rollout_end - rollout_start)) fi if [[ "${build_supervisor}" == "1" ]]; then @@ -465,5 +537,48 @@ fi overall_end=$(date +%s) log_duration "Total deploy" "${overall_start}" "${overall_end}" +total_duration=$((overall_end - overall_start)) + +cat > "${DEPLOY_REPORT_FILE}" </dev/null 2>&1; then + sha256sum "$1" | awk '{print substr($1, 1, 16)}' + else + shasum -a 256 "$1" | awk '{print substr($1, 1, 16)}' + fi +} + +sha256_16_stdin() { + if command -v sha256sum >/dev/null 2>&1; then + sha256sum | awk '{print substr($1, 1, 16)}' + else + shasum -a 256 | awk '{print substr($1, 1, 16)}' + fi +} + +detect_rust_scope() { + local dockerfile="$1" + local rust_from + rust_from=$(grep -E '^FROM --platform=\$BUILDPLATFORM rust:[^ ]+' 
"$dockerfile" | head -n1 | sed -E 's/^FROM --platform=\$BUILDPLATFORM rust:([^ ]+).*/\1/' || true) + if [[ -n "${rust_from}" ]]; then + echo "rust-${rust_from}" + return + fi + + if grep -q "rustup.rs" "$dockerfile"; then + echo "rustup-stable" + return + fi + + echo "no-rust" +} + IMAGE_TAG=${IMAGE_TAG:-dev} IMAGE_NAME="openshell/cluster" if [[ -n "${IMAGE_REGISTRY:-}" ]]; then @@ -65,6 +98,16 @@ elif [[ "${DOCKER_PLATFORM:-}" == *","* ]]; then OUTPUT_FLAG="--push" fi +CARGO_BUILD_PROFILE="${OPENSHELL_CARGO_PROFILE:-}" +if [[ -z "${CARGO_BUILD_PROFILE}" ]]; then + if [[ -n "${CI:-}" ]]; then + CARGO_BUILD_PROFILE="release" + else + CARGO_BUILD_PROFILE="local-fast" + fi +fi +PROFILE_ARGS=(--build-arg "OPENSHELL_CARGO_PROFILE=${CARGO_BUILD_PROFILE}") + # Compute cargo version from git tags (same scheme as docker-build-component.sh). VERSION_ARGS=() if [[ -n "${OPENSHELL_CARGO_VERSION:-}" ]]; then @@ -76,11 +119,18 @@ else fi fi +LOCK_HASH=$(sha256_16 Cargo.lock) +RUST_SCOPE=${RUST_TOOLCHAIN_SCOPE:-$(detect_rust_scope "deploy/docker/Dockerfile.cluster")} +CACHE_SCOPE_INPUT="v2|cluster|base|${LOCK_HASH}|${RUST_SCOPE}|${CARGO_BUILD_PROFILE}" +CARGO_TARGET_CACHE_SCOPE=$(printf '%s' "${CACHE_SCOPE_INPUT}" | sha256_16_stdin) + docker buildx build \ ${BUILDER_ARGS[@]+"${BUILDER_ARGS[@]}"} \ ${DOCKER_PLATFORM:+--platform ${DOCKER_PLATFORM}} \ ${CACHE_ARGS[@]+"${CACHE_ARGS[@]}"} \ + ${PROFILE_ARGS[@]+"${PROFILE_ARGS[@]}"} \ ${VERSION_ARGS[@]+"${VERSION_ARGS[@]}"} \ + --build-arg "CARGO_TARGET_CACHE_SCOPE=${CARGO_TARGET_CACHE_SCOPE}" \ -f deploy/docker/Dockerfile.cluster \ -t ${IMAGE_NAME}:${IMAGE_TAG} \ ${K3S_VERSION:+--build-arg K3S_VERSION=${K3S_VERSION}} \ diff --git a/tasks/scripts/docker-build-component.sh b/tasks/scripts/docker-build-component.sh index f20d7295..daad0977 100755 --- a/tasks/scripts/docker-build-component.sh +++ b/tasks/scripts/docker-build-component.sh @@ -133,6 +133,16 @@ if [[ -n "${SCCACHE_MEMCACHED_ENDPOINT:-}" ]]; then SCCACHE_ARGS=(--build-arg 
"SCCACHE_MEMCACHED_ENDPOINT=${SCCACHE_MEMCACHED_ENDPOINT}") fi +CARGO_BUILD_PROFILE="${OPENSHELL_CARGO_PROFILE:-}" +if [[ -z "${CARGO_BUILD_PROFILE}" ]]; then + if [[ -n "${CI:-}" ]]; then + CARGO_BUILD_PROFILE="release" + else + CARGO_BUILD_PROFILE="local-fast" + fi +fi +PROFILE_ARGS=(--build-arg "OPENSHELL_CARGO_PROFILE=${CARGO_BUILD_PROFILE}") + VERSION_ARGS=() if [[ -n "${OPENSHELL_CARGO_VERSION:-}" ]]; then VERSION_ARGS=(--build-arg "OPENSHELL_CARGO_VERSION=${OPENSHELL_CARGO_VERSION}") @@ -143,7 +153,7 @@ fi LOCK_HASH=$(sha256_16 Cargo.lock) RUST_SCOPE=${RUST_TOOLCHAIN_SCOPE:-$(detect_rust_scope "${DOCKERFILE}")} -CACHE_SCOPE_INPUT="v1|${COMPONENT}|${VARIANT:-base}|${LOCK_HASH}|${RUST_SCOPE}" +CACHE_SCOPE_INPUT="v2|${COMPONENT}|${VARIANT:-base}|${LOCK_HASH}|${RUST_SCOPE}|${CARGO_BUILD_PROFILE}" CARGO_TARGET_CACHE_SCOPE=$(printf '%s' "${CACHE_SCOPE_INPUT}" | sha256_16_stdin) docker buildx build \ @@ -151,6 +161,7 @@ docker buildx build \ ${DOCKER_PLATFORM:+--platform ${DOCKER_PLATFORM}} \ ${CACHE_ARGS[@]+"${CACHE_ARGS[@]}"} \ ${SCCACHE_ARGS[@]+"${SCCACHE_ARGS[@]}"} \ + ${PROFILE_ARGS[@]+"${PROFILE_ARGS[@]}"} \ ${VERSION_ARGS[@]+"${VERSION_ARGS[@]}"} \ --build-arg "CARGO_TARGET_CACHE_SCOPE=${CARGO_TARGET_CACHE_SCOPE}" \ -f "${DOCKERFILE}" \ From 511237780fb71a13a05910604a303f35966472c4 Mon Sep 17 00:00:00 2001 From: John Myers <9696606+johntmyers@users.noreply.github.com> Date: Mon, 16 Mar 2026 12:00:42 -0700 Subject: [PATCH 2/5] perf(cluster): skip gateway reconcile on unchanged image digest Avoid unnecessary helm upgrades and gateway rollouts when an explicit gateway deploy produces the same digest, and keep explicit-target state writes deterministic so subsequent auto deploys do not drift. 
Made-with: Cursor --- architecture/build-containers.md | 4 +- tasks/scripts/cluster-deploy-fast.sh | 119 ++++++++++++++++++++++++--- 2 files changed, 111 insertions(+), 12 deletions(-) diff --git a/architecture/build-containers.md b/architecture/build-containers.md index f2c807b6..5b4d0c54 100644 --- a/architecture/build-containers.md +++ b/architecture/build-containers.md @@ -52,9 +52,9 @@ The incremental deploy (`cluster-deploy-fast.sh`) fingerprints local Git changes When no local changes are detected, the command is a no-op. -**Gateway updates** are pushed to a local registry and the StatefulSet is restarted. **Supervisor updates** are copied directly into the running cluster container via `docker cp` — new sandbox pods pick up the updated binary immediately through the hostPath mount, with no image rebuild or cluster restart required. +**Gateway updates** are pushed to a local registry and normally restart the StatefulSet. If the pushed digest already matches the running gateway image digest, fast deploy now skips Helm+rollout to avoid unnecessary restarts. **Supervisor updates** are copied directly into the running cluster container via `docker cp` — new sandbox pods pick up the updated binary immediately through the hostPath mount, with no image rebuild or cluster restart required. -Fingerprints are stored in `.cache/cluster-deploy-fast.state`. You can also target specific components explicitly: +Fingerprints are stored in `.cache/cluster-deploy-fast.state`. Explicit target deploys update only the reconciled component fingerprints so subsequent auto deploys stay deterministic. 
You can also target specific components explicitly: ```bash mise run cluster -- gateway # rebuild gateway only diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh index 76bbbdbe..e1504998 100755 --- a/tasks/scripts/cluster-deploy-fast.sh +++ b/tasks/scripts/cluster-deploy-fast.sh @@ -80,6 +80,14 @@ image_push_duration=0 helm_upgrade_duration=0 gateway_rollout_duration=0 total_duration=0 +gateway_deployed_digest="" +gateway_target_digest="" +skip_gateway_reconcile=0 +skip_gateway_reconcile_reason="" + +state_gateway_fingerprint="" +state_supervisor_fingerprint="" +state_helm_fingerprint="" mkdir -p "${DEPLOY_REPORT_DIR}" @@ -94,6 +102,42 @@ cluster_exec() { docker exec "${CONTAINER_NAME}" sh -c "KUBECONFIG=/etc/rancher/k3s/k3s.yaml $*" } +# Best-effort: find the currently running gateway image digest from pod status. +get_deployed_gateway_digest() { + local image_id + + while IFS= read -r image_id; do + case "${image_id}" in + *"${IMAGE_REPO_BASE}/gateway@"*) + printf '%s\n' "${image_id##*@}" + return 0 + ;; + esac + done < <( + cluster_exec "kubectl get pods -n openshell -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.imageID}{\"\\n\"}{end}{end}'" 2>/dev/null || true + ) + + return 1 +} + +# Best-effort: resolve the pushed gateway tag to a digest from local docker metadata. +get_tagged_gateway_digest() { + local repo_digest + + while IFS= read -r repo_digest; do + case "${repo_digest}" in + "${IMAGE_REPO_BASE}/gateway@"*) + printf '%s\n' "${repo_digest##*@}" + return 0 + ;; + esac + done < <( + docker image inspect --format '{{range .RepoDigests}}{{println .}}{{end}}' "${IMAGE_REPO_BASE}/gateway:${IMAGE_TAG}" 2>/dev/null || true + ) + + return 1 +} + # Path inside the container where the chart is copied for helm upgrades. 
CONTAINER_CHART_DIR=/tmp/openshell-chart @@ -102,6 +146,7 @@ build_supervisor=0 needs_helm_upgrade=0 explicit_target=0 targets_requested="${*:-auto}" +helm_requested_by_gateway_build=0 previous_gateway_fingerprint="" previous_supervisor_fingerprint="" @@ -338,8 +383,9 @@ fi # Always run helm upgrade when the gateway image is rebuilt so that # the image tag and pull policy are set correctly. -if [[ "${build_gateway}" == "1" ]]; then +if [[ "${build_gateway}" == "1" && "${needs_helm_upgrade}" == "0" ]]; then needs_helm_upgrade=1 + helm_requested_by_gateway_build=1 fi echo "Fast deploy plan:" @@ -445,9 +491,28 @@ if [[ "${#pushed_images[@]}" -gt 0 ]]; then image_push_duration=$((push_end - push_start)) fi +# If the pushed gateway digest is already running, skip gateway-only reconcile. +# This avoids unnecessary helm+rollout churn when an explicit gateway build +# produces the same digest. +if [[ "${build_gateway}" == "1" ]]; then + gateway_deployed_digest=$(get_deployed_gateway_digest || true) + gateway_target_digest=$(get_tagged_gateway_digest || true) + + if [[ -n "${gateway_target_digest}" && -n "${gateway_deployed_digest}" && "${gateway_target_digest}" == "${gateway_deployed_digest}" ]]; then + skip_gateway_reconcile=1 + skip_gateway_reconcile_reason="digest_already_deployed" + echo "Gateway digest ${gateway_target_digest} is already deployed." + + if [[ "${helm_requested_by_gateway_build}" == "1" && "${FORCE_HELM_UPGRADE}" != "1" ]]; then + needs_helm_upgrade=0 + echo "Skipping helm upgrade (it was only requested for gateway image refresh)." + fi + fi +fi + # Evict rebuilt gateway image from k3s containerd cache so new pods pull # the updated image from the registry. -if [[ "${build_gateway}" == "1" ]]; then +if [[ "${build_gateway}" == "1" && "${skip_gateway_reconcile}" == "0" ]]; then echo "Evicting stale gateway image from k3s..." 
docker exec "${CONTAINER_NAME}" crictl rmi "${IMAGE_REPO_BASE}/gateway:${IMAGE_TAG}" >/dev/null 2>&1 || true fi @@ -501,7 +566,7 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then helm_upgrade_duration=$((helm_end - helm_start)) fi -if [[ "${build_gateway}" == "1" ]]; then +if [[ "${build_gateway}" == "1" && "${skip_gateway_reconcile}" == "0" ]]; then rollout_start=$(date +%s) echo "Restarting gateway to pick up updated image..." if cluster_exec "kubectl get statefulset/openshell -n openshell" >/dev/null 2>&1; then @@ -516,6 +581,8 @@ if [[ "${build_gateway}" == "1" ]]; then rollout_end=$(date +%s) log_duration "Gateway rollout" "${rollout_start}" "${rollout_end}" gateway_rollout_duration=$((rollout_end - rollout_start)) +elif [[ "${build_gateway}" == "1" ]]; then + echo "Skipping gateway rollout (digest unchanged)." fi if [[ "${build_supervisor}" == "1" ]]; then @@ -524,16 +591,37 @@ if [[ "${build_supervisor}" == "1" ]]; then echo "New sandbox pods will use the updated binary immediately (hostPath mount)." fi -if [[ "${explicit_target}" == "0" ]]; then - mkdir -p "$(dirname "${DEPLOY_FAST_STATE_FILE}")" - cat > "${DEPLOY_FAST_STATE_FILE}" < "${DEPLOY_FAST_STATE_FILE}" < "${DEPLOY_REPORT_FILE}" < Date: Mon, 16 Mar 2026 12:29:01 -0700 Subject: [PATCH 3/5] perf(cluster): narrow docker invalidation for faster warm rebuilds Stop touching gateway and supervisor main sources in Docker builds so unrelated edits keep incremental cache hits while retaining proto/build.rs invalidation safety. 
Made-with: Cursor --- architecture/build-containers.md | 5 +++++ crates/openshell-core/build.rs | 14 ++++++++++++-- deploy/docker/Dockerfile.cluster | 8 +++++--- deploy/docker/Dockerfile.gateway | 6 +++--- 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/architecture/build-containers.md b/architecture/build-containers.md index 5b4d0c54..f6eb28ec 100644 --- a/architecture/build-containers.md +++ b/architecture/build-containers.md @@ -41,6 +41,11 @@ For local (non-CI) Docker builds, OpenShell defaults to the Cargo profile `local-fast` to reduce rebuild latency. CI keeps `release` builds by default. Set `OPENSHELL_CARGO_PROFILE=release` locally when you need release-equivalent binaries. +The Dockerfiles keep the proto/build-script invalidation touch, but they no +longer touch gateway/supervisor main sources. That preserves incremental reuse +for unrelated rebuilds while still forcing protobuf regeneration safety when +needed. + The incremental deploy (`cluster-deploy-fast.sh`) fingerprints local Git changes and only rebuilds components whose files have changed: | Changed files | Rebuild triggered | diff --git a/crates/openshell-core/build.rs b/crates/openshell-core/build.rs index c06702c5..b1c4c43b 100644 --- a/crates/openshell-core/build.rs +++ b/crates/openshell-core/build.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::env; +use std::path::Path; fn main() -> Result<(), Box> { // --- Git-derived version --- @@ -9,8 +10,17 @@ fn main() -> Result<(), Box> { // builds where .git is absent, this silently does nothing and the binary // falls back to CARGO_PKG_VERSION (which is already sed-patched by the // build pipeline). - println!("cargo:rerun-if-changed=../../.git/HEAD"); - println!("cargo:rerun-if-changed=../../.git/refs/tags"); + // In Docker builds we do not copy .git into the context, so registering + // missing rerun paths can force unnecessary build script churn. 
+ if Path::new("../../.git/HEAD").exists() { + println!("cargo:rerun-if-changed=../../.git/HEAD"); + } + if Path::new("../../.git/refs/tags").exists() { + println!("cargo:rerun-if-changed=../../.git/refs/tags"); + } + if Path::new("../../.git/packed-refs").exists() { + println!("cargo:rerun-if-changed=../../.git/packed-refs"); + } if let Some(version) = git_version() { println!("cargo:rustc-env=OPENSHELL_GIT_VERSION={version}"); diff --git a/deploy/docker/Dockerfile.cluster b/deploy/docker/Dockerfile.cluster index 2692c6cd..7b2a3755 100644 --- a/deploy/docker/Dockerfile.cluster +++ b/deploy/docker/Dockerfile.cluster @@ -134,9 +134,11 @@ RUN --mount=type=cache,id=cargo-registry-supervisor-${TARGETARCH},sharing=locked # Copy actual source code COPY crates/ crates/ -# Touch source files to ensure they're rebuilt (not the cached dummy) -RUN touch crates/openshell-sandbox/src/main.rs \ - crates/openshell-core/build.rs \ +# Touch build.rs and proto files to force proto code regeneration when the +# cargo target cache mount retains stale OUT_DIR artifacts from prior builds. +# Do not touch supervisor sources here; that defeats incremental reuse for +# unrelated changes and makes inner-loop builds slower. +RUN touch crates/openshell-core/build.rs \ proto/*.proto # Build the supervisor binary diff --git a/deploy/docker/Dockerfile.gateway b/deploy/docker/Dockerfile.gateway index 1ffec15e..e1371666 100644 --- a/deploy/docker/Dockerfile.gateway +++ b/deploy/docker/Dockerfile.gateway @@ -68,11 +68,11 @@ RUN --mount=type=cache,id=cargo-registry-gateway-${TARGETARCH},sharing=locked,ta # Copy actual source code COPY crates/ crates/ -# Touch source files to ensure they're rebuilt (not the cached dummy). # Touch build.rs and proto files to force proto code regeneration when the # cargo target cache mount retains stale OUT_DIR artifacts from prior builds. 
-RUN touch crates/openshell-server/src/main.rs \ - crates/openshell-core/build.rs \ +# Do not touch service sources here; that defeats incremental reuse for +# unrelated changes and makes inner-loop builds slower. +RUN touch crates/openshell-core/build.rs \ proto/*.proto # Build the actual application From 8483b8e327793607358934aaa1919c4dc63c99b2 Mon Sep 17 00:00:00 2001 From: John Myers <9696606+johntmyers@users.noreply.github.com> Date: Mon, 16 Mar 2026 13:49:06 -0700 Subject: [PATCH 4/5] perf(cluster): add bounded readiness gate and supervisor reconcile Make fast deploy paths use bounded readiness checks and rollout timeouts, and add deterministic supervisor pod reconcile behavior so local deploy outcomes are reliable without regressing steady-state no-op speed. Made-with: Cursor --- architecture/build-containers.md | 6 +- tasks/scripts/cluster-deploy-fast.sh | 245 ++++++++++++++++++++++++++- 2 files changed, 245 insertions(+), 6 deletions(-) diff --git a/architecture/build-containers.md b/architecture/build-containers.md index f6eb28ec..7c877b74 100644 --- a/architecture/build-containers.md +++ b/architecture/build-containers.md @@ -57,7 +57,11 @@ The incremental deploy (`cluster-deploy-fast.sh`) fingerprints local Git changes When no local changes are detected, the command is a no-op. -**Gateway updates** are pushed to a local registry and normally restart the StatefulSet. If the pushed digest already matches the running gateway image digest, fast deploy now skips Helm+rollout to avoid unnecessary restarts. **Supervisor updates** are copied directly into the running cluster container via `docker cp` — new sandbox pods pick up the updated binary immediately through the hostPath mount, with no image rebuild or cluster restart required. +**Gateway updates** are pushed to a local registry and normally restart the StatefulSet. If the pushed digest already matches the running gateway image digest, fast deploy now skips Helm+rollout to avoid unnecessary restarts. 
+ +**Supervisor updates** are copied directly into the running cluster container via `docker cp`. By default (`DEPLOY_FAST_SUPERVISOR_RECONCILE=rolling-delete`), fast deploy restarts running sandbox pods one-by-one with bounded waits so they deterministically pick up the new supervisor binary. Set `DEPLOY_FAST_SUPERVISOR_RECONCILE=none` to keep current pods untouched until they naturally restart. + +All fast deploy paths finish with a bounded readiness gate (`DEPLOY_FAST_READINESS_TIMEOUT_SECONDS`, default `90`) that validates Kubernetes `readyz`, gateway workload readiness, and supervisor binary presence before writing state. Fingerprints are stored in `.cache/cluster-deploy-fast.state`. Explicit target deploys update only the reconciled component fingerprints so subsequent auto deploys stay deterministic. You can also target specific components explicitly: diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh index e1504998..8e0e2407 100755 --- a/tasks/scripts/cluster-deploy-fast.sh +++ b/tasks/scripts/cluster-deploy-fast.sh @@ -26,6 +26,10 @@ fi DEPLOY_FAST_MODE=${DEPLOY_FAST_MODE:-auto} FORCE_HELM_UPGRADE=${FORCE_HELM_UPGRADE:-0} DEPLOY_FAST_HELM_WAIT=${DEPLOY_FAST_HELM_WAIT:-0} +DEPLOY_FAST_READINESS_TIMEOUT_SECONDS=${DEPLOY_FAST_READINESS_TIMEOUT_SECONDS:-90} +DEPLOY_FAST_READINESS_POLL_SECONDS=${DEPLOY_FAST_READINESS_POLL_SECONDS:-2} +DEPLOY_FAST_SUPERVISOR_RECONCILE=${DEPLOY_FAST_SUPERVISOR_RECONCILE:-rolling-delete} +DEPLOY_FAST_SUPERVISOR_RECONCILE_TIMEOUT_SECONDS=${DEPLOY_FAST_SUPERVISOR_RECONCILE_TIMEOUT_SECONDS:-90} DEPLOY_FAST_STATE_FILE=${DEPLOY_FAST_STATE_FILE:-.cache/cluster-deploy-fast.state} DEPLOY_TX_ID=${DEPLOY_TX_ID:-"tx-$(date +%Y%m%d-%H%M%S)-$RANDOM"} DEPLOY_REPORT_DIR=${DEPLOY_REPORT_DIR:-.cache/deploy-reports} @@ -79,11 +83,20 @@ supervisor_duration=0 image_push_duration=0 helm_upgrade_duration=0 gateway_rollout_duration=0 +supervisor_reconcile_duration=0 +readiness_duration=0 total_duration=0 
gateway_deployed_digest="" gateway_target_digest="" skip_gateway_reconcile=0 skip_gateway_reconcile_reason="" +readiness_gate_status="pending" +readiness_failure_reason="" +supervisor_reconcile_performed=0 +supervisor_reconcile_restarted_pods=0 +supervisor_reconcile_waits=0 +supervisor_reconcile_expected_running=0 +supervisor_reconcile_failure_reason="" state_gateway_fingerprint="" state_supervisor_fingerprint="" @@ -91,6 +104,15 @@ state_helm_fingerprint="" mkdir -p "${DEPLOY_REPORT_DIR}" +case "${DEPLOY_FAST_SUPERVISOR_RECONCILE}" in + rolling-delete|none) + ;; + *) + echo "Error: DEPLOY_FAST_SUPERVISOR_RECONCILE must be 'rolling-delete' or 'none' (got '${DEPLOY_FAST_SUPERVISOR_RECONCILE}')." + exit 1 + ;; +esac + if ! docker ps -q --filter "name=^${CONTAINER_NAME}$" --filter "health=healthy" | grep -q .; then echo "Error: Cluster container '${CONTAINER_NAME}' is not running or not healthy." echo "Start the cluster first with: mise run cluster" @@ -102,6 +124,186 @@ cluster_exec() { docker exec "${CONTAINER_NAME}" sh -c "KUBECONFIG=/etc/rancher/k3s/k3s.yaml $*" } +gateway_workload_kind() { + if cluster_exec "kubectl get statefulset/openshell -n openshell" >/dev/null 2>&1; then + echo "statefulset" + return + fi + if cluster_exec "kubectl get deployment/openshell -n openshell" >/dev/null 2>&1; then + echo "deployment" + return + fi + echo "" +} + +wait_for_gateway_ready() { + local timeout_s=${1:-90} + local poll_s=${2:-2} + local deadline + local workload_kind + local desired_replicas + local ready_replicas + local reason="unknown" + + deadline=$(( $(date +%s) + timeout_s )) + + while true; do + if ! cluster_exec "kubectl get --raw='/readyz'" >/dev/null 2>&1; then + reason="apiserver_not_ready" + elif ! 
docker exec "${CONTAINER_NAME}" test -x /opt/openshell/bin/openshell-sandbox >/dev/null 2>&1; then + reason="supervisor_binary_missing" + else + workload_kind=$(gateway_workload_kind) + if [[ -z "${workload_kind}" ]]; then + reason="gateway_workload_missing" + else + desired_replicas=$(cluster_exec "kubectl get ${workload_kind}/openshell -n openshell -o jsonpath='{.spec.replicas}'" 2>/dev/null || true) + ready_replicas=$(cluster_exec "kubectl get ${workload_kind}/openshell -n openshell -o jsonpath='{.status.readyReplicas}'" 2>/dev/null || true) + if [[ -z "${desired_replicas}" ]]; then + desired_replicas=1 + fi + if [[ -z "${ready_replicas}" ]]; then + ready_replicas=0 + fi + if [[ "${ready_replicas}" -ge "${desired_replicas}" ]]; then + return 0 + fi + reason="gateway_not_ready:${ready_replicas}/${desired_replicas}" + fi + fi + + if [[ $(date +%s) -ge "${deadline}" ]]; then + readiness_failure_reason="${reason}" + return 1 + fi + + sleep "${poll_s}" + done +} + +list_running_sandbox_pods() { + cluster_exec "kubectl get pods -n openshell -l 'openshell.ai/managed-by=openshell,openshell.ai/sandbox-id' --field-selector=status.phase=Running -o jsonpath='{range .items[*]}{.metadata.name}{\"\\n\"}{end}'" 2>/dev/null || true +} + +count_running_sandbox_pods() { + local markers + markers=$(cluster_exec "kubectl get pods -n openshell -l 'openshell.ai/managed-by=openshell,openshell.ai/sandbox-id' --field-selector=status.phase=Running -o jsonpath='{range .items[*]}x{end}'" 2>/dev/null || true) + if [[ -z "${markers}" ]]; then + echo "0" + return + fi + echo "${#markers}" +} + +wait_for_running_sandboxes_ready() { + local expected_count=$1 + local timeout_s=${2:-90} + local poll_s=${3:-2} + local deadline + local running_count + + if [[ "${expected_count}" -le 0 ]]; then + return 0 + fi + + deadline=$(( $(date +%s) + timeout_s )) + + while true; do + running_count=$(count_running_sandbox_pods) + if [[ "${running_count}" -ge "${expected_count}" ]]; then + if cluster_exec 
"kubectl wait --for=condition=Ready pod -n openshell -l 'openshell.ai/managed-by=openshell,openshell.ai/sandbox-id' --field-selector=status.phase=Running --timeout=1s" >/dev/null 2>&1; then + return 0 + fi + supervisor_reconcile_failure_reason="sandbox_pods_not_ready" + else + supervisor_reconcile_failure_reason="sandbox_pod_count:${running_count}/${expected_count}" + fi + + if [[ $(date +%s) -ge "${deadline}" ]]; then + if [[ -z "${supervisor_reconcile_failure_reason}" ]]; then + supervisor_reconcile_failure_reason="sandbox_reconcile_timeout" + fi + return 1 + fi + + sleep "${poll_s}" + done +} + +reconcile_supervisor_pods() { + local mode=$1 + local timeout_s=$2 + local poll_s=$3 + local reconcile_start + local reconcile_end + local pod_name + local -a sandbox_pods=() + + if [[ "${mode}" == "none" ]]; then + echo "Supervisor reconcile mode is 'none'; skipping running sandbox pod restart." + return 0 + fi + + while IFS= read -r pod_name; do + if [[ -n "${pod_name}" ]]; then + sandbox_pods+=("${pod_name}") + fi + done < <(list_running_sandbox_pods) + + supervisor_reconcile_expected_running=${#sandbox_pods[@]} + if [[ "${supervisor_reconcile_expected_running}" -eq 0 ]]; then + echo "No running sandbox pods found for supervisor reconcile." + return 0 + fi + + supervisor_reconcile_performed=1 + reconcile_start=$(date +%s) + echo "Reconciling ${supervisor_reconcile_expected_running} running sandbox pod(s) after supervisor update..." + + for pod_name in "${sandbox_pods[@]}"; do + echo "Restarting sandbox pod ${pod_name}..." + cluster_exec "kubectl delete pod ${pod_name} -n openshell --wait=true --timeout=${timeout_s}s" >/dev/null + supervisor_reconcile_restarted_pods=$((supervisor_reconcile_restarted_pods + 1)) + + if ! wait_for_running_sandboxes_ready "${supervisor_reconcile_expected_running}" "${timeout_s}" "${poll_s}"; then + echo "Error: sandbox pod readiness did not recover after restarting ${pod_name} (${supervisor_reconcile_failure_reason})." 
+ return 1 + fi + supervisor_reconcile_waits=$((supervisor_reconcile_waits + 1)) + done + + reconcile_end=$(date +%s) + log_duration "Supervisor pod reconcile" "${reconcile_start}" "${reconcile_end}" + supervisor_reconcile_duration=$((reconcile_end - reconcile_start)) + return 0 +} + +run_readiness_gate() { + local gate_start + local gate_end + + gate_start=$(date +%s) + echo "Running readiness gate..." + + if ! wait_for_gateway_ready "${DEPLOY_FAST_READINESS_TIMEOUT_SECONDS}" "${DEPLOY_FAST_READINESS_POLL_SECONDS}"; then + gate_end=$(date +%s) + readiness_duration=$((gate_end - gate_start)) + readiness_gate_status="failed" + if [[ -z "${readiness_failure_reason}" ]]; then + readiness_failure_reason="unknown" + fi + echo "Error: readiness gate failed (${readiness_failure_reason})." + cluster_exec "kubectl get pods -n openshell -o wide" || true + return 1 + fi + + gate_end=$(date +%s) + readiness_duration=$((gate_end - gate_start)) + readiness_gate_status="passed" + echo "Readiness gate passed." + return 0 +} + # Best-effort: find the currently running gateway image digest from pod status. get_deployed_gateway_digest() { local image_id @@ -393,6 +595,8 @@ echo " build gateway: ${build_gateway}" echo " build supervisor: ${build_supervisor}" echo " helm upgrade: ${needs_helm_upgrade}" echo " cargo profile: ${CARGO_BUILD_PROFILE}" +echo " readiness timeout:${DEPLOY_FAST_READINESS_TIMEOUT_SECONDS}s" +echo " supervisor reconcile: ${DEPLOY_FAST_SUPERVISOR_RECONCILE}" if [[ "${explicit_target}" == "0" && "${build_gateway}" == "0" && "${build_supervisor}" == "0" && "${needs_helm_upgrade}" == "0" && "${DEPLOY_FAST_MODE}" != "full" ]]; then echo "No new local changes since last deploy." 
@@ -461,6 +665,11 @@ if [[ "${build_supervisor}" == "1" ]]; then "${CONTAINER_NAME}:/opt/openshell/bin/openshell-sandbox" docker exec "${CONTAINER_NAME}" chmod 755 /opt/openshell/bin/openshell-sandbox + reconcile_supervisor_pods \ + "${DEPLOY_FAST_SUPERVISOR_RECONCILE}" \ + "${DEPLOY_FAST_SUPERVISOR_RECONCILE_TIMEOUT_SECONDS}" \ + "${DEPLOY_FAST_READINESS_POLL_SECONDS}" + built_components+=("supervisor") supervisor_end=$(date +%s) log_duration "Supervisor build + deploy" "${supervisor_start}" "${supervisor_end}" @@ -521,6 +730,7 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then helm_start=$(date +%s) echo "Upgrading helm release..." helm_wait_args="" + helm_timeout_args="--timeout ${DEPLOY_FAST_READINESS_TIMEOUT_SECONDS}s" if [[ "${DEPLOY_FAST_HELM_WAIT}" == "1" ]]; then helm_wait_args="--wait" fi @@ -560,7 +770,8 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then --set server.tls.clientTlsSecretName=openshell-client-tls \ --set server.sshHandshakeSecret=${SSH_HANDSHAKE_SECRET} \ ${HOST_GATEWAY_ARGS} \ - ${helm_wait_args}" + ${helm_wait_args} \ + ${helm_timeout_args}" helm_end=$(date +%s) log_duration "Helm upgrade" "${helm_start}" "${helm_end}" helm_upgrade_duration=$((helm_end - helm_start)) @@ -571,10 +782,10 @@ if [[ "${build_gateway}" == "1" && "${skip_gateway_reconcile}" == "0" ]]; then echo "Restarting gateway to pick up updated image..." 
if cluster_exec "kubectl get statefulset/openshell -n openshell" >/dev/null 2>&1; then cluster_exec "kubectl rollout restart statefulset/openshell -n openshell" - cluster_exec "kubectl rollout status statefulset/openshell -n openshell" + cluster_exec "kubectl rollout status statefulset/openshell -n openshell --timeout=${DEPLOY_FAST_READINESS_TIMEOUT_SECONDS}s" elif cluster_exec "kubectl get deployment/openshell -n openshell" >/dev/null 2>&1; then cluster_exec "kubectl rollout restart deployment/openshell -n openshell" - cluster_exec "kubectl rollout status deployment/openshell -n openshell" + cluster_exec "kubectl rollout status deployment/openshell -n openshell --timeout=${DEPLOY_FAST_READINESS_TIMEOUT_SECONDS}s" else echo "Warning: no openshell workload found to roll out in namespace 'openshell'." fi @@ -587,10 +798,18 @@ fi if [[ "${build_supervisor}" == "1" ]]; then echo "Supervisor binary updated on cluster node." - echo "Existing sandbox pods will use the new binary on next restart." - echo "New sandbox pods will use the updated binary immediately (hostPath mount)." + if [[ "${DEPLOY_FAST_SUPERVISOR_RECONCILE}" == "none" ]]; then + echo "Running sandbox pods keep their current supervisor until they restart." + elif [[ "${supervisor_reconcile_performed}" == "1" ]]; then + echo "Reconciled ${supervisor_reconcile_restarted_pods} running sandbox pod(s) to pick up the new supervisor." + else + echo "No running sandbox pods required reconcile." + fi + echo "New sandbox pods use the updated binary immediately (hostPath mount)." fi +run_readiness_gate + # Keep deploy state aligned even when explicit targets are used. # For explicit runs, update only the components we actually reconciled so auto # mode remains accurate for untouched components. 
@@ -649,6 +868,15 @@ cat > "${DEPLOY_REPORT_FILE}" < "${DEPLOY_REPORT_FILE}" < Date: Mon, 16 Mar 2026 15:01:29 -0700 Subject: [PATCH 5/5] perf(cluster): detect infra drift and escalate to bootstrap Extend fast deploy classification to track cluster infrastructure paths and route those changes through full cluster bootstrap so local redeploys remain deterministic while preserving fast no-op runs. Made-with: Cursor --- architecture/build-containers.md | 1 + tasks/scripts/cluster-deploy-fast.sh | 79 ++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/architecture/build-containers.md b/architecture/build-containers.md index 7c877b74..774f4296 100644 --- a/architecture/build-containers.md +++ b/architecture/build-containers.md @@ -54,6 +54,7 @@ The incremental deploy (`cluster-deploy-fast.sh`) fingerprints local Git changes | `crates/openshell-server/*`, `Dockerfile.gateway` | Gateway | | `crates/openshell-sandbox/*`, `crates/openshell-policy/*` | Supervisor | | `deploy/helm/openshell/*` | Helm upgrade | +| `Dockerfile.cluster`, cluster entrypoint/healthcheck, kube manifests, bootstrap scripts | Full cluster bootstrap | When no local changes are detected, the command is a no-op. 
diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh index 8e0e2407..56816890 100755 --- a/tasks/scripts/cluster-deploy-fast.sh +++ b/tasks/scripts/cluster-deploy-fast.sh @@ -101,6 +101,7 @@ supervisor_reconcile_failure_reason="" state_gateway_fingerprint="" state_supervisor_fingerprint="" state_helm_fingerprint="" +state_cluster_infra_fingerprint="" mkdir -p "${DEPLOY_REPORT_DIR}" @@ -353,9 +354,14 @@ helm_requested_by_gateway_build=0 previous_gateway_fingerprint="" previous_supervisor_fingerprint="" previous_helm_fingerprint="" +previous_cluster_infra_fingerprint="" current_gateway_fingerprint="" current_supervisor_fingerprint="" current_helm_fingerprint="" +current_cluster_infra_fingerprint="" +cluster_infra_worktree_change=0 +requires_cluster_bootstrap=0 +cluster_bootstrap_reason="" if [[ "$#" -gt 0 ]]; then explicit_target=1 @@ -427,6 +433,9 @@ if [[ -f "${DEPLOY_FAST_STATE_FILE}" ]]; then helm) previous_helm_fingerprint=${value} ;; + cluster_infra) + previous_cluster_infra_fingerprint=${value} + ;; esac done < "${DEPLOY_FAST_STATE_FILE}" @@ -434,6 +443,7 @@ if [[ -f "${DEPLOY_FAST_STATE_FILE}" ]]; then previous_gateway_fingerprint="" previous_supervisor_fingerprint="" previous_helm_fingerprint="" + previous_cluster_infra_fingerprint="" fi # Invalidate gateway and helm fingerprints when the cluster container has @@ -499,6 +509,27 @@ matches_helm() { esac } +matches_cluster_infra() { + local path=$1 + case "${path}" in + deploy/docker/Dockerfile.cluster|deploy/docker/cluster-entrypoint.sh|deploy/docker/cluster-healthcheck.sh) + return 0 + ;; + deploy/kube/manifests/*|deploy/kube/gpu-manifests/*) + return 0 + ;; + tasks/scripts/cluster-bootstrap.sh|tasks/scripts/docker-build-cluster.sh) + return 0 + ;; + crates/openshell-bootstrap/*) + return 0 + ;; + *) + return 1 + ;; + esac +} + compute_fingerprint() { local component=$1 local payload="" @@ -519,6 +550,9 @@ compute_fingerprint() { helm) committed_trees=$(git ls-tree HEAD 
deploy/helm/openshell/ 2>/dev/null || true) ;; + cluster_infra) + committed_trees=$(git ls-tree HEAD deploy/docker/Dockerfile.cluster deploy/docker/cluster-entrypoint.sh deploy/docker/cluster-healthcheck.sh deploy/kube/manifests/ deploy/kube/gpu-manifests/ tasks/scripts/cluster-bootstrap.sh tasks/scripts/docker-build-cluster.sh crates/openshell-bootstrap/ 2>/dev/null || true) + ;; esac if [[ -n "${committed_trees}" ]]; then payload+="${committed_trees}"$'\n' @@ -542,6 +576,11 @@ compute_fingerprint() { continue fi ;; + cluster_infra) + if ! matches_cluster_infra "${path}"; then + continue + fi + ;; esac if [[ -e "${path}" ]]; then @@ -562,6 +601,38 @@ compute_fingerprint() { current_gateway_fingerprint=$(compute_fingerprint gateway) current_supervisor_fingerprint=$(compute_fingerprint supervisor) current_helm_fingerprint=$(compute_fingerprint helm) +current_cluster_infra_fingerprint=$(compute_fingerprint cluster_infra) + +for path in "${changed_files[@]}"; do + if matches_cluster_infra "${path}"; then + cluster_infra_worktree_change=1 + break + fi +done + +if [[ "${cluster_infra_worktree_change}" == "1" ]]; then + requires_cluster_bootstrap=1 + cluster_bootstrap_reason="cluster_infra_worktree_change" +elif [[ -n "${previous_cluster_infra_fingerprint}" && -n "${current_cluster_infra_fingerprint}" && "${current_cluster_infra_fingerprint}" != "${previous_cluster_infra_fingerprint}" ]]; then + requires_cluster_bootstrap=1 + cluster_bootstrap_reason="cluster_infra_fingerprint_changed" +fi + +if [[ "${requires_cluster_bootstrap}" == "1" ]]; then + echo "Cluster infrastructure change detected (${cluster_bootstrap_reason}); escalating to full cluster bootstrap." 
+ if [[ "${cluster_infra_worktree_change}" == "1" ]]; then + echo "Changed infra paths:" + for path in "${changed_files[@]}"; do + if matches_cluster_infra "${path}"; then + echo " - ${path}" + fi + done + fi + if [[ "${explicit_target}" == "1" ]]; then + echo "Note: explicit target '${targets_requested}' overridden to keep deploy behavior deterministic." + fi + exec tasks/scripts/cluster-bootstrap.sh fast +fi if [[ "${explicit_target}" == "0" && "${DEPLOY_FAST_MODE}" == "full" ]]; then build_gateway=1 @@ -597,6 +668,7 @@ echo " helm upgrade: ${needs_helm_upgrade}" echo " cargo profile: ${CARGO_BUILD_PROFILE}" echo " readiness timeout:${DEPLOY_FAST_READINESS_TIMEOUT_SECONDS}s" echo " supervisor reconcile: ${DEPLOY_FAST_SUPERVISOR_RECONCILE}" +echo " requires bootstrap: ${requires_cluster_bootstrap}" if [[ "${explicit_target}" == "0" && "${build_gateway}" == "0" && "${build_supervisor}" == "0" && "${needs_helm_upgrade}" == "0" && "${DEPLOY_FAST_MODE}" != "full" ]]; then echo "No new local changes since last deploy." @@ -816,6 +888,7 @@ run_readiness_gate state_gateway_fingerprint="${current_gateway_fingerprint}" state_supervisor_fingerprint="${current_supervisor_fingerprint}" state_helm_fingerprint="${current_helm_fingerprint}" +state_cluster_infra_fingerprint="${current_cluster_infra_fingerprint}" if [[ "${explicit_target}" == "1" ]]; then state_gateway_fingerprint="${previous_gateway_fingerprint:-}" @@ -840,6 +913,7 @@ container_id=${current_container_id} gateway=${state_gateway_fingerprint} supervisor=${state_supervisor_fingerprint} helm=${state_helm_fingerprint} +cluster_infra=${state_cluster_infra_fingerprint} EOF overall_end=$(date +%s) @@ -877,6 +951,8 @@ cat > "${DEPLOY_REPORT_FILE}" < "${DEPLOY_REPORT_FILE}" <