diff --git a/Cargo.toml b/Cargo.toml index 215862cf..20949608 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -131,3 +131,11 @@ strip = true [profile.dev] # Faster compile times for dev builds debug = 1 + +[profile.local-fast] +# Local-only profile for faster dockerized inner-loop builds. +inherits = "dev" +opt-level = 1 +debug = 0 +codegen-units = 256 +incremental = true diff --git a/architecture/build-containers.md b/architecture/build-containers.md index 705b00d6..774f4296 100644 --- a/architecture/build-containers.md +++ b/architecture/build-containers.md @@ -37,6 +37,15 @@ This pulls `ghcr.io/nvidia/openshell-community/sandboxes/:latest`. `mise run cluster` is the primary development command. It bootstraps a cluster if one doesn't exist, then performs incremental deploys for subsequent runs. +For local (non-CI) Docker builds, OpenShell defaults to the Cargo profile +`local-fast` to reduce rebuild latency. CI keeps `release` builds by default. +Set `OPENSHELL_CARGO_PROFILE=release` locally when you need release-equivalent binaries. + +The Dockerfiles keep the proto/build-script invalidation touch, but they no +longer touch gateway/supervisor main sources. That preserves incremental reuse +for unrelated rebuilds while still forcing protobuf regeneration safety when +needed. + The incremental deploy (`cluster-deploy-fast.sh`) fingerprints local Git changes and only rebuilds components whose files have changed: | Changed files | Rebuild triggered | @@ -45,12 +54,17 @@ The incremental deploy (`cluster-deploy-fast.sh`) fingerprints local Git changes | `crates/openshell-server/*`, `Dockerfile.gateway` | Gateway | | `crates/openshell-sandbox/*`, `crates/openshell-policy/*` | Supervisor | | `deploy/helm/openshell/*` | Helm upgrade | +| `Dockerfile.cluster`, cluster entrypoint/healthcheck, kube manifests, bootstrap scripts | Full cluster bootstrap | When no local changes are detected, the command is a no-op. 
-**Gateway updates** are pushed to a local registry and the StatefulSet is restarted. **Supervisor updates** are copied directly into the running cluster container via `docker cp` — new sandbox pods pick up the updated binary immediately through the hostPath mount, with no image rebuild or cluster restart required. +**Gateway updates** are pushed to a local registry and normally restart the StatefulSet. If the pushed digest already matches the running gateway image digest, fast deploy now skips Helm+rollout to avoid unnecessary restarts. + +**Supervisor updates** are copied directly into the running cluster container via `docker cp`. By default (`DEPLOY_FAST_SUPERVISOR_RECONCILE=rolling-delete`), fast deploy restarts running sandbox pods one-by-one with bounded waits so they deterministically pick up the new supervisor binary. Set `DEPLOY_FAST_SUPERVISOR_RECONCILE=none` to keep current pods untouched until they naturally restart. -Fingerprints are stored in `.cache/cluster-deploy-fast.state`. You can also target specific components explicitly: +All fast deploy paths finish with a bounded readiness gate (`DEPLOY_FAST_READINESS_TIMEOUT_SECONDS`, default `90`) that validates Kubernetes `readyz`, gateway workload readiness, and supervisor binary presence before writing state. + +Fingerprints are stored in `.cache/cluster-deploy-fast.state`. Explicit target deploys update only the reconciled component fingerprints so subsequent auto deploys stay deterministic. 
You can also target specific components explicitly: ```bash mise run cluster -- gateway # rebuild gateway only @@ -58,3 +72,16 @@ mise run cluster -- supervisor # rebuild supervisor only mise run cluster -- chart # helm upgrade only mise run cluster -- all # rebuild everything ``` + +To baseline local compile and image build latency before optimization work: + +```bash +mise run cluster:baseline # cold + warm build timings +mise run cluster:baseline:full # same plus `mise run cluster` deploy timing +mise run cluster:baseline:warm # warm-only build timings +mise run cluster:baseline:warm:full # warm-only + deploy +``` + +Reports are written to `.cache/perf/` as both CSV and markdown. + +Each `mise run cluster` invocation also emits a deploy transaction report to `.cache/deploy-reports/<tx-id>.md`, including selected actions (gateway rebuild, supervisor update, helm upgrade), fingerprints, and per-step durations. diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index 679bc338..22ceff2c 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -143,7 +143,7 @@ flowchart LR The `deploy_gateway_with_logs` variant accepts an `FnMut(String)` callback for progress reporting. The CLI wraps this in a `GatewayDeployLogPanel` for interactive terminals. -**Pre-deploy check** (CLI layer in `gateway_start`): In interactive terminals, `check_existing_deployment` inspects whether a container or volume already exists. If found, the user is prompted to destroy and recreate or reuse the existing gateway. +**Pre-deploy check** (CLI layer in `gateway_start`): `check_existing_deployment` inspects whether a container or volume already exists. In interactive terminals, the user is prompted to destroy and recreate or reuse the existing gateway. In non-interactive mode, the command fails unless `--recreate` or `--reuse-ok` is provided explicitly. 
### 2) Image readiness diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 84a323b5..0548d561 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -742,11 +742,18 @@ enum GatewayCommands { /// Destroy and recreate the gateway from scratch if one already exists. /// - /// Without this flag, an interactive prompt asks whether to recreate; - /// in non-interactive mode the existing gateway is reused silently. + /// Without this flag, an interactive prompt asks whether to recreate. + /// In non-interactive mode, the command fails unless `--reuse-ok` is set. #[arg(long)] recreate: bool, + /// Reuse an existing gateway in non-interactive mode. + /// + /// Use this in automation when you intentionally want idempotent reuse. + /// Conflicts with `--recreate`. + #[arg(long, conflicts_with = "recreate")] + reuse_ok: bool, + /// Listen on plaintext HTTP instead of mTLS. /// /// Use when the gateway sits behind a reverse proxy (e.g., Cloudflare @@ -1445,6 +1452,7 @@ async fn main() -> Result<()> { port, gateway_host, recreate, + reuse_ok, plaintext, disable_gateway_auth, registry_username, @@ -1458,6 +1466,7 @@ async fn main() -> Result<()> { port, gateway_host.as_deref(), recreate, + reuse_ok, plaintext, disable_gateway_auth, registry_username.as_deref(), diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 052a7de2..e55d683f 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1315,6 +1315,7 @@ pub async fn gateway_admin_deploy( port: u16, gateway_host: Option<&str>, recreate: bool, + reuse_ok: bool, disable_tls: bool, disable_gateway_auth: bool, registry_username: Option<&str>, @@ -1334,21 +1335,22 @@ pub async fn gateway_admin_deploy( }); // Check whether a gateway already exists. If so, prompt the user (unless - // --recreate was passed or we're in non-interactive mode). + // --recreate was passed). 
Non-interactive mode now fails by default unless + // --reuse-ok is explicitly set. let mut should_recreate = recreate; if let Some(existing) = openshell_bootstrap::check_existing_deployment(name, remote_opts.as_ref()).await? { + let status = if existing.container_running { + "running" + } else if existing.container_exists { + "stopped" + } else { + "volume only" + }; if !should_recreate { let interactive = std::io::stdin().is_terminal() && std::io::stderr().is_terminal(); if interactive { - let status = if existing.container_running { - "running" - } else if existing.container_exists { - "stopped" - } else { - "volume only" - }; eprintln!(); eprintln!( "{} Gateway '{name}' already exists ({status}).", @@ -1371,10 +1373,17 @@ pub async fn gateway_admin_deploy( eprintln!("Keeping existing gateway."); return Ok(()); } - } else { - // Non-interactive mode: reuse existing gateway silently. - eprintln!("Gateway '{name}' already exists, reusing."); + } else if reuse_ok { + eprintln!("Gateway '{name}' already exists ({status}), reusing (--reuse-ok)."); return Ok(()); + } else { + return Err(miette::miette!( + "Gateway '{name}' already exists ({status}).\n\ + Non-interactive mode requires explicit intent.\n\ + Re-run with one of:\n\ + --reuse-ok # keep existing gateway\n\ + --recreate # destroy and redeploy" + )); } } } diff --git a/crates/openshell-core/build.rs b/crates/openshell-core/build.rs index c06702c5..b1c4c43b 100644 --- a/crates/openshell-core/build.rs +++ b/crates/openshell-core/build.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::env; +use std::path::Path; fn main() -> Result<(), Box> { // --- Git-derived version --- @@ -9,8 +10,17 @@ fn main() -> Result<(), Box> { // builds where .git is absent, this silently does nothing and the binary // falls back to CARGO_PKG_VERSION (which is already sed-patched by the // build pipeline). 
- println!("cargo:rerun-if-changed=../../.git/HEAD"); - println!("cargo:rerun-if-changed=../../.git/refs/tags"); + // In Docker builds we do not copy .git into the context, so registering + // missing rerun paths can force unnecessary build script churn. + if Path::new("../../.git/HEAD").exists() { + println!("cargo:rerun-if-changed=../../.git/HEAD"); + } + if Path::new("../../.git/refs/tags").exists() { + println!("cargo:rerun-if-changed=../../.git/refs/tags"); + } + if Path::new("../../.git/packed-refs").exists() { + println!("cargo:rerun-if-changed=../../.git/packed-refs"); + } if let Some(version) = git_version() { println!("cargo:rustc-env=OPENSHELL_GIT_VERSION={version}"); diff --git a/crates/openshell-sandbox/src/proxy.rs b/crates/openshell-sandbox/src/proxy.rs index 44234f66..1398f675 100644 --- a/crates/openshell-sandbox/src/proxy.rs +++ b/crates/openshell-sandbox/src/proxy.rs @@ -123,7 +123,7 @@ impl ProxyHandle { /// The proxy uses OPA for network decisions with process-identity binding /// via `/proc/net/tcp`. All connections are evaluated through OPA policy. 
#[allow(clippy::too_many_arguments)] - pub async fn start_with_bind_addr( + pub(crate) async fn start_with_bind_addr( policy: &ProxyPolicy, bind_addr: Option, opa_engine: Arc, diff --git a/deploy/docker/Dockerfile.cluster b/deploy/docker/Dockerfile.cluster index 49e29a98..7b2a3755 100644 --- a/deploy/docker/Dockerfile.cluster +++ b/deploy/docker/Dockerfile.cluster @@ -77,6 +77,7 @@ FROM --platform=$BUILDPLATFORM rust:1.88-slim AS supervisor-builder ARG TARGETARCH ARG BUILDARCH ARG OPENSHELL_CARGO_VERSION +ARG OPENSHELL_CARGO_PROFILE=release ARG CARGO_TARGET_CACHE_SCOPE=default ARG SCCACHE_MEMCACHED_ENDPOINT @@ -121,14 +122,23 @@ COPY proto/ proto/ RUN --mount=type=cache,id=cargo-registry-supervisor-${TARGETARCH},sharing=locked,target=/usr/local/cargo/registry \ --mount=type=cache,id=cargo-target-supervisor-${TARGETARCH}-${CARGO_TARGET_CACHE_SCOPE},sharing=locked,target=/build/target \ --mount=type=cache,id=sccache-supervisor-${TARGETARCH},sharing=locked,target=/tmp/sccache \ - . cross-build.sh && cargo_cross_build -p openshell-sandbox 2>/dev/null || true + . cross-build.sh && \ + if [ "${OPENSHELL_CARGO_PROFILE}" = "release" ]; then \ + cargo_cross_build --release -p openshell-sandbox 2>/dev/null || true; \ + elif [ "${OPENSHELL_CARGO_PROFILE}" = "dev" ]; then \ + cargo_cross_build -p openshell-sandbox 2>/dev/null || true; \ + else \ + cargo_cross_build --profile "${OPENSHELL_CARGO_PROFILE}" -p openshell-sandbox 2>/dev/null || true; \ + fi # Copy actual source code COPY crates/ crates/ -# Touch source files to ensure they're rebuilt (not the cached dummy) -RUN touch crates/openshell-sandbox/src/main.rs \ - crates/openshell-core/build.rs \ +# Touch build.rs and proto files to force proto code regeneration when the +# cargo target cache mount retains stale OUT_DIR artifacts from prior builds. +# Do not touch supervisor sources here; that defeats incremental reuse for +# unrelated changes and makes inner-loop builds slower. 
+RUN touch crates/openshell-core/build.rs \ proto/*.proto # Build the supervisor binary @@ -139,9 +149,28 @@ RUN --mount=type=cache,id=cargo-registry-supervisor-${TARGETARCH},sharing=locked if [ -n "${OPENSHELL_CARGO_VERSION:-}" ]; then \ sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${OPENSHELL_CARGO_VERSION}"'"/}' Cargo.toml; \ fi && \ - cargo_cross_build --release -p openshell-sandbox && \ - mkdir -p /build/out && \ - cp "$(cross_output_dir release)/openshell-sandbox" /build/out/ + if [ "${OPENSHELL_CARGO_PROFILE}" = "release" ]; then \ + cargo_cross_build --release -p openshell-sandbox && \ + mkdir -p /build/out && \ + cp "$(cross_output_dir release)/openshell-sandbox" /build/out/; \ + elif [ "${OPENSHELL_CARGO_PROFILE}" = "dev" ]; then \ + cargo_cross_build -p openshell-sandbox && \ + mkdir -p /build/out && \ + cp "$(cross_output_dir debug)/openshell-sandbox" /build/out/; \ + else \ + cargo_cross_build --profile "${OPENSHELL_CARGO_PROFILE}" -p openshell-sandbox && \ + mkdir -p /build/out && \ + cp "$(cross_output_dir "${OPENSHELL_CARGO_PROFILE}")/openshell-sandbox" /build/out/; \ + fi + +# --------------------------------------------------------------------------- +# Stage 1e: Minimal export stage for local supervisor extraction +# --------------------------------------------------------------------------- +# Exporting directly from supervisor-builder with --output type=local copies the +# full build filesystem (including target cache) and is very slow on macOS. +# This scratch stage contains only the final binary. 
+FROM scratch AS supervisor-export +COPY --from=supervisor-builder /build/out/openshell-sandbox /openshell-sandbox # --------------------------------------------------------------------------- # Stage 2: Install NVIDIA container toolkit on Ubuntu diff --git a/deploy/docker/Dockerfile.gateway b/deploy/docker/Dockerfile.gateway index 05d2a46f..e1371666 100644 --- a/deploy/docker/Dockerfile.gateway +++ b/deploy/docker/Dockerfile.gateway @@ -11,6 +11,7 @@ FROM --platform=$BUILDPLATFORM rust:1.88-slim AS builder ARG TARGETARCH ARG BUILDARCH ARG OPENSHELL_CARGO_VERSION +ARG OPENSHELL_CARGO_PROFILE=release ARG CARGO_TARGET_CACHE_SCOPE=default # Install build dependencies @@ -55,16 +56,23 @@ COPY proto/ proto/ RUN --mount=type=cache,id=cargo-registry-gateway-${TARGETARCH},sharing=locked,target=/usr/local/cargo/registry \ --mount=type=cache,id=cargo-target-gateway-${TARGETARCH}-${CARGO_TARGET_CACHE_SCOPE},sharing=locked,target=/build/target \ --mount=type=cache,id=sccache-gateway-${TARGETARCH},sharing=locked,target=/tmp/sccache \ - . cross-build.sh && cargo_cross_build --release -p openshell-server 2>/dev/null || true + . cross-build.sh && \ + if [ "${OPENSHELL_CARGO_PROFILE}" = "release" ]; then \ + cargo_cross_build --release -p openshell-server 2>/dev/null || true; \ + elif [ "${OPENSHELL_CARGO_PROFILE}" = "dev" ]; then \ + cargo_cross_build -p openshell-server 2>/dev/null || true; \ + else \ + cargo_cross_build --profile "${OPENSHELL_CARGO_PROFILE}" -p openshell-server 2>/dev/null || true; \ + fi # Copy actual source code COPY crates/ crates/ -# Touch source files to ensure they're rebuilt (not the cached dummy). # Touch build.rs and proto files to force proto code regeneration when the # cargo target cache mount retains stale OUT_DIR artifacts from prior builds. 
-RUN touch crates/openshell-server/src/main.rs \ - crates/openshell-core/build.rs \ +# Do not touch service sources here; that defeats incremental reuse for +# unrelated changes and makes inner-loop builds slower. +RUN touch crates/openshell-core/build.rs \ proto/*.proto # Build the actual application @@ -75,8 +83,16 @@ RUN --mount=type=cache,id=cargo-registry-gateway-${TARGETARCH},sharing=locked,ta if [ -n "${OPENSHELL_CARGO_VERSION:-}" ]; then \ sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${OPENSHELL_CARGO_VERSION}"'"/}' Cargo.toml; \ fi && \ - cargo_cross_build --release -p openshell-server && \ - cp "$(cross_output_dir release)/openshell-server" /build/openshell-server + if [ "${OPENSHELL_CARGO_PROFILE}" = "release" ]; then \ + cargo_cross_build --release -p openshell-server && \ + cp "$(cross_output_dir release)/openshell-server" /build/openshell-server; \ + elif [ "${OPENSHELL_CARGO_PROFILE}" = "dev" ]; then \ + cargo_cross_build -p openshell-server && \ + cp "$(cross_output_dir debug)/openshell-server" /build/openshell-server; \ + else \ + cargo_cross_build --profile "${OPENSHELL_CARGO_PROFILE}" -p openshell-server && \ + cp "$(cross_output_dir "${OPENSHELL_CARGO_PROFILE}")/openshell-server" /build/openshell-server; \ + fi # Stage 2: Runtime (uses target platform) # NVIDIA hardened Ubuntu base for supply chain consistency. diff --git a/scripts/bin/openshell b/scripts/bin/openshell index 8b8a9c21..5171bd9c 100755 --- a/scripts/bin/openshell +++ b/scripts/bin/openshell @@ -25,8 +25,12 @@ else # Current HEAD commit (detects branch switches, pulls, rebases) current_head=$(git rev-parse HEAD 2>/dev/null || echo "unknown") - # Collect dirty (modified, staged, untracked) files - mapfile -t changed_files < <( + # Collect dirty (modified, staged, untracked) files. + # Use a bash-3-compatible read loop (macOS default bash has no mapfile). 
+ changed_files=() + while IFS= read -r path; do + changed_files+=("$path") + done < <( { git diff --name-only 2>/dev/null git diff --name-only --cached 2>/dev/null @@ -95,7 +99,10 @@ if [[ "$needs_build" == "1" ]]; then cd "$PROJECT_ROOT" new_head=$(git rev-parse HEAD 2>/dev/null || echo "unknown") # Recompute fingerprint of remaining dirty files (build may not change them) - mapfile -t post_files < <( + post_files=() + while IFS= read -r path; do + post_files+=("$path") + done < <( { git diff --name-only 2>/dev/null git diff --name-only --cached 2>/dev/null diff --git a/tasks/cluster.toml b/tasks/cluster.toml index debda04c..7bc516e9 100644 --- a/tasks/cluster.toml +++ b/tasks/cluster.toml @@ -34,3 +34,19 @@ hide = true description = "Tag and push gateway image to pull registry" run = "tasks/scripts/cluster-push-component.sh gateway" hide = true + +["cluster:baseline"] +description = "Capture cold/warm baseline timings for local builds" +run = "tasks/scripts/cluster-baseline.sh --mode both" + +["cluster:baseline:full"] +description = "Capture baseline timings including cluster deploy" +run = "tasks/scripts/cluster-baseline.sh --mode both --with-deploy" + +["cluster:baseline:warm"] +description = "Capture warm-path baseline timings for local builds" +run = "tasks/scripts/cluster-baseline.sh --mode warm" + +["cluster:baseline:warm:full"] +description = "Capture warm-path baseline timings including cluster deploy" +run = "tasks/scripts/cluster-baseline.sh --mode warm --with-deploy" diff --git a/tasks/scripts/cluster-baseline.sh b/tasks/scripts/cluster-baseline.sh new file mode 100755 index 00000000..6c6cf98a --- /dev/null +++ b/tasks/scripts/cluster-baseline.sh @@ -0,0 +1,210 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +set -uo pipefail + +MODE="both" +WITH_DEPLOY=0 +OUTPUT_DIR="${PERF_OUTPUT_DIR:-.cache/perf}" +RUN_ID="$(date +%Y%m%d-%H%M%S)" +FAILURES=0 +CARGO_BUILD_PROFILE="${OPENSHELL_CARGO_PROFILE:-local-fast}" + +usage() { + cat <<'EOF' +Usage: tasks/scripts/cluster-baseline.sh [options] + +Capture local baseline timings for: + - CLI compile (cargo build -p openshell-cli) + - Gateway image build + - Supervisor-only build stage + - Cluster image build + - Optional cluster deploy + +Options: + --mode <cold|warm|both> Which measurement passes to run (default: both) + --with-deploy Include `mise run --skip-deps cluster` + --output-dir <dir> Output directory for CSV/markdown (default: .cache/perf) + -h, --help Show this help text +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --mode) + MODE="${2:-}" + shift 2 + ;; + --with-deploy) + WITH_DEPLOY=1 + shift + ;; + --output-dir) + OUTPUT_DIR="${2:-}" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if [[ "${MODE}" != "cold" && "${MODE}" != "warm" && "${MODE}" != "both" ]]; then + echo "Invalid --mode value: ${MODE}" >&2 + exit 1 +fi + +mkdir -p "${OUTPUT_DIR}" + +CSV_FILE="${OUTPUT_DIR}/cluster-baseline-${RUN_ID}.csv" +SUMMARY_FILE="${OUTPUT_DIR}/cluster-baseline-${RUN_ID}.md" +echo "run,category,step,status,duration_s" > "${CSV_FILE}" + +normalize_arch() { + case "$1" in + x86_64) echo "amd64" ;; + aarch64) echo "arm64" ;; + *) echo "$1" ;; + esac +} + +record_result() { + local run_label=$1 + local category=$2 + local step=$3 + local status=$4 + local duration_s=$5 + + echo "${run_label},${category},${step},${status},${duration_s}" >> "${CSV_FILE}" +} + +run_step() { + local run_label=$1 + local category=$2 + local step=$3 + local command=$4 + shift 4 + + local start_s end_s duration_s status + start_s=$(date +%s) + + echo "" + echo "[${run_label}] ${category}/${step}" + echo " ${command}" + + if env "$@" bash -lc "${command}"; 
then + status="ok" + else + status="fail" + FAILURES=$((FAILURES + 1)) + fi + + end_s=$(date +%s) + duration_s=$((end_s - start_s)) + record_result "${run_label}" "${category}" "${step}" "${status}" "${duration_s}" +} + +run_pass() { + local run_label=$1 + local scope_seed=$2 + + local cli_target_dir gateway_cache_dir cluster_cache_dir supervisor_output_dir + local image_tag docker_arch + local supervisor_version_arg supervisor_profile_arg cargo_version + + cli_target_dir=".cache/perf/target-${scope_seed}" + gateway_cache_dir=".cache/perf/buildkit-gateway-${scope_seed}" + cluster_cache_dir=".cache/perf/buildkit-cluster-${scope_seed}" + supervisor_output_dir="${OUTPUT_DIR}/supervisor-${scope_seed}" + image_tag="perf-${scope_seed}" + docker_arch="$(normalize_arch "$(docker version --format '{{.Server.Arch}}')")" + supervisor_version_arg="" + supervisor_profile_arg=" --build-arg OPENSHELL_CARGO_PROFILE=${CARGO_BUILD_PROFILE}" + cargo_version=$(uv run python tasks/scripts/release.py get-version --cargo 2>/dev/null || true) + if [[ -n "${cargo_version}" ]]; then + supervisor_version_arg=" --build-arg OPENSHELL_CARGO_VERSION=${cargo_version}" + fi + + if [[ "${run_label}" == "cold" ]]; then + rm -rf "${cli_target_dir}" "${gateway_cache_dir}" "${cluster_cache_dir}" "${supervisor_output_dir}" + fi + + run_step "${run_label}" "rust" "cli_debug" \ + "cargo build -p openshell-cli" \ + "CARGO_TARGET_DIR=${cli_target_dir}" + + run_step "${run_label}" "docker" "gateway_image" \ + "tasks/scripts/docker-build-component.sh gateway" \ + "RUST_TOOLCHAIN_SCOPE=${scope_seed}" \ + "DOCKER_BUILD_CACHE_DIR=${gateway_cache_dir}" \ + "OPENSHELL_CARGO_PROFILE=${CARGO_BUILD_PROFILE}" \ + "IMAGE_TAG=${image_tag}" + + run_step "${run_label}" "docker" "supervisor_stage" \ + "docker buildx build --file deploy/docker/Dockerfile.cluster --target supervisor-export --build-arg BUILDARCH=${docker_arch} --build-arg TARGETARCH=${docker_arch} --build-arg 
CARGO_TARGET_CACHE_SCOPE=${scope_seed}${supervisor_profile_arg}${supervisor_version_arg} --output type=local,dest=${supervisor_output_dir} --platform linux/${docker_arch} ." \ + "DOCKER_BUILD_CACHE_DIR=${cluster_cache_dir}" \ + "IMAGE_TAG=${image_tag}" + + run_step "${run_label}" "docker" "cluster_image" \ + "tasks/scripts/docker-build-cluster.sh" \ + "DOCKER_BUILD_CACHE_DIR=${cluster_cache_dir}" \ + "OPENSHELL_CARGO_PROFILE=${CARGO_BUILD_PROFILE}" \ + "IMAGE_TAG=${image_tag}" + + if [[ "${WITH_DEPLOY}" == "1" ]]; then + run_step "${run_label}" "deploy" "cluster_task" \ + "mise run --skip-deps cluster" \ + "OPENSHELL_CARGO_PROFILE=${CARGO_BUILD_PROFILE}" + fi +} + +SCOPE_SEED="baseline-${RUN_ID}" + +case "${MODE}" in + cold) + run_pass "cold" "${SCOPE_SEED}" + ;; + warm) + run_pass "warm" "baseline-warm" + ;; + both) + run_pass "cold" "${SCOPE_SEED}" + run_pass "warm" "${SCOPE_SEED}" + ;; +esac + +{ + echo "# Cluster Baseline Report" + echo "" + echo "- run_id: \`${RUN_ID}\`" + echo "- mode: \`${MODE}\`" + echo "- include_deploy: \`${WITH_DEPLOY}\`" + echo "- cargo_build_profile: \`${CARGO_BUILD_PROFILE}\`" + echo "- csv: \`${CSV_FILE}\`" + echo "" + echo "| run | category | step | status | duration_s |" + echo "|---|---|---|---|---|" + tail -n +2 "${CSV_FILE}" | while IFS=, read -r run_label category step status duration_s; do + echo "| ${run_label} | ${category} | ${step} | ${status} | ${duration_s} |" + done +} > "${SUMMARY_FILE}" + +echo "" +echo "Baseline report written:" +echo " ${SUMMARY_FILE}" +echo " ${CSV_FILE}" + +if [[ "${FAILURES}" -gt 0 ]]; then + echo "Completed with ${FAILURES} failed step(s)." >&2 + exit 1 +fi + +echo "Completed successfully." 
diff --git a/tasks/scripts/cluster-bootstrap.sh b/tasks/scripts/cluster-bootstrap.sh index f354daea..364476cc 100755 --- a/tasks/scripts/cluster-bootstrap.sh +++ b/tasks/scripts/cluster-bootstrap.sh @@ -16,6 +16,16 @@ if [ "${MODE}" != "build" ] && [ "${MODE}" != "fast" ]; then exit 1 fi +DEPLOY_TX_ID=${DEPLOY_TX_ID:-"tx-$(date +%Y%m%d-%H%M%S)-$RANDOM"} +DEPLOY_REPORT_DIR=${DEPLOY_REPORT_DIR:-.cache/deploy-reports} +DEPLOY_REPORT_FILE="${DEPLOY_REPORT_DIR}/${DEPLOY_TX_ID}.md" +overall_start=$(date +%s) +recreated_cluster=0 +pushed_gateway=0 +built_cluster_image=0 + +mkdir -p "${DEPLOY_REPORT_DIR}" + if [ -n "${IMAGE_TAG:-}" ]; then IMAGE_TAG=${IMAGE_TAG} else @@ -122,8 +132,8 @@ is_local_registry_host() { } registry_reachable() { - curl -4 -fsS --max-time 2 "http://127.0.0.1:5000/v2/" >/dev/null 2>&1 || \ - curl -4 -fsS --max-time 2 "http://localhost:5000/v2/" >/dev/null 2>&1 + curl -fsS --max-time 2 "http://127.0.0.1:5000/v2/" >/dev/null 2>&1 || \ + curl -fsS --max-time 2 "http://localhost:5000/v2/" >/dev/null 2>&1 } wait_for_registry_ready() { @@ -172,12 +182,23 @@ ensure_local_registry() { return fi + # Docker Desktop occasionally leaves published ports in a bad state after + # daemon/network churn. Recreate once before failing hard. + echo "Local registry probe failed; recreating registry container and retrying..." >&2 + docker rm -f "${LOCAL_REGISTRY_CONTAINER}" >/dev/null 2>&1 || true + docker run -d --restart=always --name "${LOCAL_REGISTRY_CONTAINER}" -p 5000:5000 registry:2 >/dev/null + + if wait_for_registry_ready 20 1; then + return + fi + if registry_reachable; then return fi echo "Error: local registry is not reachable at ${REGISTRY_HOST}." >&2 echo " Ensure a registry is running on port 5000 (e.g. docker run -d --name openshell-local-registry -p 5000:5000 registry:2)." 
>&2 + echo " Active docker context: $(docker context show 2>/dev/null || echo unknown)" >&2 docker ps -a >&2 || true docker logs "${LOCAL_REGISTRY_CONTAINER}" >&2 || true exit 1 @@ -217,6 +238,7 @@ if [ "${MODE}" = "fast" ]; then if docker inspect "${CONTAINER_NAME}" >/dev/null 2>&1 || docker volume inspect "${VOLUME_NAME}" >/dev/null 2>&1; then echo "Recreating cluster '${CLUSTER_NAME}' from scratch..." openshell gateway destroy --name "${CLUSTER_NAME}" + recreated_cluster=1 fi fi @@ -224,6 +246,7 @@ if [ "${SKIP_IMAGE_PUSH:-}" = "1" ]; then echo "Skipping image push (SKIP_IMAGE_PUSH=1; images already in registry)." elif [ "${MODE}" = "build" ] || [ "${MODE}" = "fast" ]; then tasks/scripts/cluster-push-component.sh gateway + pushed_gateway=1 fi # Build the cluster image so it contains the latest Helm chart, manifests, @@ -231,6 +254,7 @@ fi # always starts with the correct chart version. if [ "${SKIP_CLUSTER_IMAGE_BUILD:-}" != "1" ]; then tasks/scripts/docker-build-cluster.sh + built_cluster_image=1 fi # In fast/build modes, use the locally-built cluster image rather than the @@ -268,5 +292,26 @@ fi DEPLOY_FAST_STATE_FILE=${DEPLOY_FAST_STATE_FILE:-.cache/cluster-deploy-fast.state} rm -f "${DEPLOY_FAST_STATE_FILE}" +overall_end=$(date +%s) +total_duration=$((overall_end - overall_start)) + +cat > "${DEPLOY_REPORT_FILE}" </dev/null 2>&1; then + sha256sum "$1" | awk '{print substr($1, 1, 16)}' + else + shasum -a 256 "$1" | awk '{print substr($1, 1, 16)}' + fi +} + +sha256_16_stdin() { + if command -v sha256sum >/dev/null 2>&1; then + sha256sum | awk '{print substr($1, 1, 16)}' + else + shasum -a 256 | awk '{print substr($1, 1, 16)}' + fi +} + +detect_rust_scope() { + local dockerfile=$1 + local rust_from + rust_from=$(grep -E '^FROM --platform=\$BUILDPLATFORM rust:[^ ]+' "$dockerfile" | head -n1 | sed -E 's/^FROM --platform=\$BUILDPLATFORM rust:([^ ]+).*/\1/' || true) + if [[ -n "${rust_from}" ]]; then + echo "rust-${rust_from}" + return + fi + + if grep -q 
"rustup.rs" "$dockerfile"; then + echo "rustup-stable" + return + fi + + echo "no-rust" +} + +change_detection_duration=0 +builds_duration=0 +supervisor_duration=0 +image_push_duration=0 +helm_upgrade_duration=0 +gateway_rollout_duration=0 +supervisor_reconcile_duration=0 +readiness_duration=0 +total_duration=0 +gateway_deployed_digest="" +gateway_target_digest="" +skip_gateway_reconcile=0 +skip_gateway_reconcile_reason="" +readiness_gate_status="pending" +readiness_failure_reason="" +supervisor_reconcile_performed=0 +supervisor_reconcile_restarted_pods=0 +supervisor_reconcile_waits=0 +supervisor_reconcile_expected_running=0 +supervisor_reconcile_failure_reason="" + +state_gateway_fingerprint="" +state_supervisor_fingerprint="" +state_helm_fingerprint="" +state_cluster_infra_fingerprint="" + +mkdir -p "${DEPLOY_REPORT_DIR}" + +case "${DEPLOY_FAST_SUPERVISOR_RECONCILE}" in + rolling-delete|none) + ;; + *) + echo "Error: DEPLOY_FAST_SUPERVISOR_RECONCILE must be 'rolling-delete' or 'none' (got '${DEPLOY_FAST_SUPERVISOR_RECONCILE}')." + exit 1 + ;; +esac + if ! docker ps -q --filter "name=^${CONTAINER_NAME}$" --filter "health=healthy" | grep -q .; then echo "Error: Cluster container '${CONTAINER_NAME}' is not running or not healthy." echo "Start the cluster first with: mise run cluster" @@ -41,6 +125,222 @@ cluster_exec() { docker exec "${CONTAINER_NAME}" sh -c "KUBECONFIG=/etc/rancher/k3s/k3s.yaml $*" } +gateway_workload_kind() { + if cluster_exec "kubectl get statefulset/openshell -n openshell" >/dev/null 2>&1; then + echo "statefulset" + return + fi + if cluster_exec "kubectl get deployment/openshell -n openshell" >/dev/null 2>&1; then + echo "deployment" + return + fi + echo "" +} + +wait_for_gateway_ready() { + local timeout_s=${1:-90} + local poll_s=${2:-2} + local deadline + local workload_kind + local desired_replicas + local ready_replicas + local reason="unknown" + + deadline=$(( $(date +%s) + timeout_s )) + + while true; do + if ! 
cluster_exec "kubectl get --raw='/readyz'" >/dev/null 2>&1; then + reason="apiserver_not_ready" + elif ! docker exec "${CONTAINER_NAME}" test -x /opt/openshell/bin/openshell-sandbox >/dev/null 2>&1; then + reason="supervisor_binary_missing" + else + workload_kind=$(gateway_workload_kind) + if [[ -z "${workload_kind}" ]]; then + reason="gateway_workload_missing" + else + desired_replicas=$(cluster_exec "kubectl get ${workload_kind}/openshell -n openshell -o jsonpath='{.spec.replicas}'" 2>/dev/null || true) + ready_replicas=$(cluster_exec "kubectl get ${workload_kind}/openshell -n openshell -o jsonpath='{.status.readyReplicas}'" 2>/dev/null || true) + if [[ -z "${desired_replicas}" ]]; then + desired_replicas=1 + fi + if [[ -z "${ready_replicas}" ]]; then + ready_replicas=0 + fi + if [[ "${ready_replicas}" -ge "${desired_replicas}" ]]; then + return 0 + fi + reason="gateway_not_ready:${ready_replicas}/${desired_replicas}" + fi + fi + + if [[ $(date +%s) -ge "${deadline}" ]]; then + readiness_failure_reason="${reason}" + return 1 + fi + + sleep "${poll_s}" + done +} + +list_running_sandbox_pods() { + cluster_exec "kubectl get pods -n openshell -l 'openshell.ai/managed-by=openshell,openshell.ai/sandbox-id' --field-selector=status.phase=Running -o jsonpath='{range .items[*]}{.metadata.name}{\"\\n\"}{end}'" 2>/dev/null || true +} + +count_running_sandbox_pods() { + local markers + markers=$(cluster_exec "kubectl get pods -n openshell -l 'openshell.ai/managed-by=openshell,openshell.ai/sandbox-id' --field-selector=status.phase=Running -o jsonpath='{range .items[*]}x{end}'" 2>/dev/null || true) + if [[ -z "${markers}" ]]; then + echo "0" + return + fi + echo "${#markers}" +} + +wait_for_running_sandboxes_ready() { + local expected_count=$1 + local timeout_s=${2:-90} + local poll_s=${3:-2} + local deadline + local running_count + + if [[ "${expected_count}" -le 0 ]]; then + return 0 + fi + + deadline=$(( $(date +%s) + timeout_s )) + + while true; do + 
running_count=$(count_running_sandbox_pods) + if [[ "${running_count}" -ge "${expected_count}" ]]; then + if cluster_exec "kubectl wait --for=condition=Ready pod -n openshell -l 'openshell.ai/managed-by=openshell,openshell.ai/sandbox-id' --field-selector=status.phase=Running --timeout=1s" >/dev/null 2>&1; then + return 0 + fi + supervisor_reconcile_failure_reason="sandbox_pods_not_ready" + else + supervisor_reconcile_failure_reason="sandbox_pod_count:${running_count}/${expected_count}" + fi + + if [[ $(date +%s) -ge "${deadline}" ]]; then + if [[ -z "${supervisor_reconcile_failure_reason}" ]]; then + supervisor_reconcile_failure_reason="sandbox_reconcile_timeout" + fi + return 1 + fi + + sleep "${poll_s}" + done +} + +reconcile_supervisor_pods() { + local mode=$1 + local timeout_s=$2 + local poll_s=$3 + local reconcile_start + local reconcile_end + local pod_name + local -a sandbox_pods=() + + if [[ "${mode}" == "none" ]]; then + echo "Supervisor reconcile mode is 'none'; skipping running sandbox pod restart." + return 0 + fi + + while IFS= read -r pod_name; do + if [[ -n "${pod_name}" ]]; then + sandbox_pods+=("${pod_name}") + fi + done < <(list_running_sandbox_pods) + + supervisor_reconcile_expected_running=${#sandbox_pods[@]} + if [[ "${supervisor_reconcile_expected_running}" -eq 0 ]]; then + echo "No running sandbox pods found for supervisor reconcile." + return 0 + fi + + supervisor_reconcile_performed=1 + reconcile_start=$(date +%s) + echo "Reconciling ${supervisor_reconcile_expected_running} running sandbox pod(s) after supervisor update..." + + for pod_name in "${sandbox_pods[@]}"; do + echo "Restarting sandbox pod ${pod_name}..." + cluster_exec "kubectl delete pod ${pod_name} -n openshell --wait=true --timeout=${timeout_s}s" >/dev/null + supervisor_reconcile_restarted_pods=$((supervisor_reconcile_restarted_pods + 1)) + + if ! 
wait_for_running_sandboxes_ready "${supervisor_reconcile_expected_running}" "${timeout_s}" "${poll_s}"; then + echo "Error: sandbox pod readiness did not recover after restarting ${pod_name} (${supervisor_reconcile_failure_reason})." + return 1 + fi + supervisor_reconcile_waits=$((supervisor_reconcile_waits + 1)) + done + + reconcile_end=$(date +%s) + log_duration "Supervisor pod reconcile" "${reconcile_start}" "${reconcile_end}" + supervisor_reconcile_duration=$((reconcile_end - reconcile_start)) + return 0 +} + +run_readiness_gate() { + local gate_start + local gate_end + + gate_start=$(date +%s) + echo "Running readiness gate..." + + if ! wait_for_gateway_ready "${DEPLOY_FAST_READINESS_TIMEOUT_SECONDS}" "${DEPLOY_FAST_READINESS_POLL_SECONDS}"; then + gate_end=$(date +%s) + readiness_duration=$((gate_end - gate_start)) + readiness_gate_status="failed" + if [[ -z "${readiness_failure_reason}" ]]; then + readiness_failure_reason="unknown" + fi + echo "Error: readiness gate failed (${readiness_failure_reason})." + cluster_exec "kubectl get pods -n openshell -o wide" || true + return 1 + fi + + gate_end=$(date +%s) + readiness_duration=$((gate_end - gate_start)) + readiness_gate_status="passed" + echo "Readiness gate passed." + return 0 +} + +# Best-effort: find the currently running gateway image digest from pod status. +get_deployed_gateway_digest() { + local image_id + + while IFS= read -r image_id; do + case "${image_id}" in + *"${IMAGE_REPO_BASE}/gateway@"*) + printf '%s\n' "${image_id##*@}" + return 0 + ;; + esac + done < <( + cluster_exec "kubectl get pods -n openshell -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.imageID}{\"\\n\"}{end}{end}'" 2>/dev/null || true + ) + + return 1 +} + +# Best-effort: resolve the pushed gateway tag to a digest from local docker metadata. 
+get_tagged_gateway_digest() { + local repo_digest + + while IFS= read -r repo_digest; do + case "${repo_digest}" in + "${IMAGE_REPO_BASE}/gateway@"*) + printf '%s\n' "${repo_digest##*@}" + return 0 + ;; + esac + done < <( + docker image inspect --format '{{range .RepoDigests}}{{println .}}{{end}}' "${IMAGE_REPO_BASE}/gateway:${IMAGE_TAG}" 2>/dev/null || true + ) + + return 1 +} + # Path inside the container where the chart is copied for helm upgrades. CONTAINER_CHART_DIR=/tmp/openshell-chart @@ -48,13 +348,20 @@ build_gateway=0 build_supervisor=0 needs_helm_upgrade=0 explicit_target=0 +targets_requested="${*:-auto}" +helm_requested_by_gateway_build=0 previous_gateway_fingerprint="" previous_supervisor_fingerprint="" previous_helm_fingerprint="" +previous_cluster_infra_fingerprint="" current_gateway_fingerprint="" current_supervisor_fingerprint="" current_helm_fingerprint="" +current_cluster_infra_fingerprint="" +cluster_infra_worktree_change=0 +requires_cluster_bootstrap=0 +cluster_bootstrap_reason="" if [[ "$#" -gt 0 ]]; then explicit_target=1 @@ -88,7 +395,9 @@ fi declare -a changed_files=() detect_start=$(date +%s) -mapfile -t changed_files < <( +while IFS= read -r path; do + changed_files+=("${path}") +done < <( { git diff --name-only git diff --name-only --cached @@ -97,6 +406,8 @@ mapfile -t changed_files < <( ) detect_end=$(date +%s) log_duration "Change detection" "${detect_start}" "${detect_end}" +change_detection_duration=$((detect_end - detect_start)) +changed_files_count=${#changed_files[@]} # Track the cluster container ID so we can detect when the cluster was # recreated (e.g. via bootstrap). 
A new container means the k3s state is @@ -122,6 +433,9 @@ if [[ -f "${DEPLOY_FAST_STATE_FILE}" ]]; then helm) previous_helm_fingerprint=${value} ;; + cluster_infra) + previous_cluster_infra_fingerprint=${value} + ;; esac done < "${DEPLOY_FAST_STATE_FILE}" @@ -129,6 +443,7 @@ if [[ -f "${DEPLOY_FAST_STATE_FILE}" ]]; then previous_gateway_fingerprint="" previous_supervisor_fingerprint="" previous_helm_fingerprint="" + previous_cluster_infra_fingerprint="" fi # Invalidate gateway and helm fingerprints when the cluster container has @@ -194,6 +509,27 @@ matches_helm() { esac } +matches_cluster_infra() { + local path=$1 + case "${path}" in + deploy/docker/Dockerfile.cluster|deploy/docker/cluster-entrypoint.sh|deploy/docker/cluster-healthcheck.sh) + return 0 + ;; + deploy/kube/manifests/*|deploy/kube/gpu-manifests/*) + return 0 + ;; + tasks/scripts/cluster-bootstrap.sh|tasks/scripts/docker-build-cluster.sh) + return 0 + ;; + crates/openshell-bootstrap/*) + return 0 + ;; + *) + return 1 + ;; + esac +} + compute_fingerprint() { local component=$1 local payload="" @@ -214,6 +550,9 @@ compute_fingerprint() { helm) committed_trees=$(git ls-tree HEAD deploy/helm/openshell/ 2>/dev/null || true) ;; + cluster_infra) + committed_trees=$(git ls-tree HEAD deploy/docker/Dockerfile.cluster deploy/docker/cluster-entrypoint.sh deploy/docker/cluster-healthcheck.sh deploy/kube/manifests/ deploy/kube/gpu-manifests/ tasks/scripts/cluster-bootstrap.sh tasks/scripts/docker-build-cluster.sh crates/openshell-bootstrap/ 2>/dev/null || true) + ;; esac if [[ -n "${committed_trees}" ]]; then payload+="${committed_trees}"$'\n' @@ -237,6 +576,11 @@ compute_fingerprint() { continue fi ;; + cluster_infra) + if ! 
matches_cluster_infra "${path}"; then + continue + fi + ;; esac if [[ -e "${path}" ]]; then @@ -257,6 +601,38 @@ compute_fingerprint() { current_gateway_fingerprint=$(compute_fingerprint gateway) current_supervisor_fingerprint=$(compute_fingerprint supervisor) current_helm_fingerprint=$(compute_fingerprint helm) +current_cluster_infra_fingerprint=$(compute_fingerprint cluster_infra) + +for path in "${changed_files[@]}"; do + if matches_cluster_infra "${path}"; then + cluster_infra_worktree_change=1 + break + fi +done + +if [[ "${cluster_infra_worktree_change}" == "1" ]]; then + requires_cluster_bootstrap=1 + cluster_bootstrap_reason="cluster_infra_worktree_change" +elif [[ -n "${previous_cluster_infra_fingerprint}" && -n "${current_cluster_infra_fingerprint}" && "${current_cluster_infra_fingerprint}" != "${previous_cluster_infra_fingerprint}" ]]; then + requires_cluster_bootstrap=1 + cluster_bootstrap_reason="cluster_infra_fingerprint_changed" +fi + +if [[ "${requires_cluster_bootstrap}" == "1" ]]; then + echo "Cluster infrastructure change detected (${cluster_bootstrap_reason}); escalating to full cluster bootstrap." + if [[ "${cluster_infra_worktree_change}" == "1" ]]; then + echo "Changed infra paths:" + for path in "${changed_files[@]}"; do + if matches_cluster_infra "${path}"; then + echo " - ${path}" + fi + done + fi + if [[ "${explicit_target}" == "1" ]]; then + echo "Note: explicit target '${targets_requested}' overridden to keep deploy behavior deterministic." + fi + exec tasks/scripts/cluster-bootstrap.sh fast +fi if [[ "${explicit_target}" == "0" && "${DEPLOY_FAST_MODE}" == "full" ]]; then build_gateway=1 @@ -280,14 +656,19 @@ fi # Always run helm upgrade when the gateway image is rebuilt so that # the image tag and pull policy are set correctly. 
+echo " readiness timeout: ${DEPLOY_FAST_READINESS_TIMEOUT_SECONDS}s"
SUPERVISOR_BUILD_DIR=$(mktemp -d) trap 'rm -rf "${SUPERVISOR_BUILD_DIR}"' EXIT @@ -332,11 +713,19 @@ if [[ "${build_supervisor}" == "1" ]]; then fi fi + SUPERVISOR_PROFILE_ARGS=(--build-arg "OPENSHELL_CARGO_PROFILE=${CARGO_BUILD_PROFILE}") + SUPERVISOR_RUST_SCOPE=${RUST_TOOLCHAIN_SCOPE:-$(detect_rust_scope "deploy/docker/Dockerfile.cluster")} + SUPERVISOR_LOCK_HASH=$(sha256_16 Cargo.lock) + SUPERVISOR_SCOPE_INPUT="v2|cluster|base|${SUPERVISOR_LOCK_HASH}|${SUPERVISOR_RUST_SCOPE}|${CARGO_BUILD_PROFILE}" + SUPERVISOR_CACHE_SCOPE=$(printf '%s' "${SUPERVISOR_SCOPE_INPUT}" | sha256_16_stdin) + docker buildx build \ --file deploy/docker/Dockerfile.cluster \ - --target supervisor-builder \ + --target supervisor-export \ --build-arg "BUILDARCH=$(docker version --format '{{.Server.Arch}}')" \ --build-arg "TARGETARCH=${CLUSTER_ARCH}" \ + --build-arg "CARGO_TARGET_CACHE_SCOPE=${SUPERVISOR_CACHE_SCOPE}" \ + ${SUPERVISOR_PROFILE_ARGS[@]+"${SUPERVISOR_PROFILE_ARGS[@]}"} \ ${SUPERVISOR_VERSION_ARGS[@]+"${SUPERVISOR_VERSION_ARGS[@]}"} \ --output "type=local,dest=${SUPERVISOR_BUILD_DIR}" \ --platform "linux/${CLUSTER_ARCH}" \ @@ -344,17 +733,24 @@ if [[ "${build_supervisor}" == "1" ]]; then # Copy the built binary into the running k3s container docker exec "${CONTAINER_NAME}" mkdir -p /opt/openshell/bin - docker cp "${SUPERVISOR_BUILD_DIR}/build/out/openshell-sandbox" \ + docker cp "${SUPERVISOR_BUILD_DIR}/openshell-sandbox" \ "${CONTAINER_NAME}:/opt/openshell/bin/openshell-sandbox" docker exec "${CONTAINER_NAME}" chmod 755 /opt/openshell/bin/openshell-sandbox + reconcile_supervisor_pods \ + "${DEPLOY_FAST_SUPERVISOR_RECONCILE}" \ + "${DEPLOY_FAST_SUPERVISOR_RECONCILE_TIMEOUT_SECONDS}" \ + "${DEPLOY_FAST_READINESS_POLL_SECONDS}" + built_components+=("supervisor") supervisor_end=$(date +%s) log_duration "Supervisor build + deploy" "${supervisor_start}" "${supervisor_end}" + supervisor_duration=$((supervisor_end - supervisor_start)) fi build_end=$(date +%s) log_duration "Builds" 
+# If the image digest we just pushed already matches the digest the running
+# gateway pod reports, skip the gateway-only reconcile. This avoids needless
+# helm + rollout churn when a rebuild reproduces an identical image.
helm_wait_args="" + helm_timeout_args="--timeout ${DEPLOY_FAST_READINESS_TIMEOUT_SECONDS}s" if [[ "${DEPLOY_FAST_HELM_WAIT}" == "1" ]]; then helm_wait_args="--wait" fi @@ -425,45 +842,156 @@ if [[ "${needs_helm_upgrade}" == "1" ]]; then --set server.tls.clientTlsSecretName=openshell-client-tls \ --set server.sshHandshakeSecret=${SSH_HANDSHAKE_SECRET} \ ${HOST_GATEWAY_ARGS} \ - ${helm_wait_args}" + ${helm_wait_args} \ + ${helm_timeout_args}" helm_end=$(date +%s) log_duration "Helm upgrade" "${helm_start}" "${helm_end}" + helm_upgrade_duration=$((helm_end - helm_start)) fi -if [[ "${build_gateway}" == "1" ]]; then +if [[ "${build_gateway}" == "1" && "${skip_gateway_reconcile}" == "0" ]]; then rollout_start=$(date +%s) echo "Restarting gateway to pick up updated image..." if cluster_exec "kubectl get statefulset/openshell -n openshell" >/dev/null 2>&1; then cluster_exec "kubectl rollout restart statefulset/openshell -n openshell" - cluster_exec "kubectl rollout status statefulset/openshell -n openshell" + cluster_exec "kubectl rollout status statefulset/openshell -n openshell --timeout=${DEPLOY_FAST_READINESS_TIMEOUT_SECONDS}s" elif cluster_exec "kubectl get deployment/openshell -n openshell" >/dev/null 2>&1; then cluster_exec "kubectl rollout restart deployment/openshell -n openshell" - cluster_exec "kubectl rollout status deployment/openshell -n openshell" + cluster_exec "kubectl rollout status deployment/openshell -n openshell --timeout=${DEPLOY_FAST_READINESS_TIMEOUT_SECONDS}s" else echo "Warning: no openshell workload found to roll out in namespace 'openshell'." fi rollout_end=$(date +%s) log_duration "Gateway rollout" "${rollout_start}" "${rollout_end}" + gateway_rollout_duration=$((rollout_end - rollout_start)) +elif [[ "${build_gateway}" == "1" ]]; then + echo "Skipping gateway rollout (digest unchanged)." fi if [[ "${build_supervisor}" == "1" ]]; then echo "Supervisor binary updated on cluster node." 
- echo "Existing sandbox pods will use the new binary on next restart." - echo "New sandbox pods will use the updated binary immediately (hostPath mount)." + if [[ "${DEPLOY_FAST_SUPERVISOR_RECONCILE}" == "none" ]]; then + echo "Running sandbox pods keep their current supervisor until they restart." + elif [[ "${supervisor_reconcile_performed}" == "1" ]]; then + echo "Reconciled ${supervisor_reconcile_restarted_pods} running sandbox pod(s) to pick up the new supervisor." + else + echo "No running sandbox pods required reconcile." + fi + echo "New sandbox pods use the updated binary immediately (hostPath mount)." fi -if [[ "${explicit_target}" == "0" ]]; then - mkdir -p "$(dirname "${DEPLOY_FAST_STATE_FILE}")" - cat > "${DEPLOY_FAST_STATE_FILE}" < "${DEPLOY_FAST_STATE_FILE}" < "${DEPLOY_REPORT_FILE}" </dev/null 2>&1; then + sha256sum "$1" | awk '{print substr($1, 1, 16)}' + else + shasum -a 256 "$1" | awk '{print substr($1, 1, 16)}' + fi +} + +sha256_16_stdin() { + if command -v sha256sum >/dev/null 2>&1; then + sha256sum | awk '{print substr($1, 1, 16)}' + else + shasum -a 256 | awk '{print substr($1, 1, 16)}' + fi +} + +detect_rust_scope() { + local dockerfile="$1" + local rust_from + rust_from=$(grep -E '^FROM --platform=\$BUILDPLATFORM rust:[^ ]+' "$dockerfile" | head -n1 | sed -E 's/^FROM --platform=\$BUILDPLATFORM rust:([^ ]+).*/\1/' || true) + if [[ -n "${rust_from}" ]]; then + echo "rust-${rust_from}" + return + fi + + if grep -q "rustup.rs" "$dockerfile"; then + echo "rustup-stable" + return + fi + + echo "no-rust" +} + IMAGE_TAG=${IMAGE_TAG:-dev} IMAGE_NAME="openshell/cluster" if [[ -n "${IMAGE_REGISTRY:-}" ]]; then @@ -65,6 +98,16 @@ elif [[ "${DOCKER_PLATFORM:-}" == *","* ]]; then OUTPUT_FLAG="--push" fi +CARGO_BUILD_PROFILE="${OPENSHELL_CARGO_PROFILE:-}" +if [[ -z "${CARGO_BUILD_PROFILE}" ]]; then + if [[ -n "${CI:-}" ]]; then + CARGO_BUILD_PROFILE="release" + else + CARGO_BUILD_PROFILE="local-fast" + fi +fi +PROFILE_ARGS=(--build-arg 
"OPENSHELL_CARGO_PROFILE=${CARGO_BUILD_PROFILE}") + # Compute cargo version from git tags (same scheme as docker-build-component.sh). VERSION_ARGS=() if [[ -n "${OPENSHELL_CARGO_VERSION:-}" ]]; then @@ -76,11 +119,18 @@ else fi fi +LOCK_HASH=$(sha256_16 Cargo.lock) +RUST_SCOPE=${RUST_TOOLCHAIN_SCOPE:-$(detect_rust_scope "deploy/docker/Dockerfile.cluster")} +CACHE_SCOPE_INPUT="v2|cluster|base|${LOCK_HASH}|${RUST_SCOPE}|${CARGO_BUILD_PROFILE}" +CARGO_TARGET_CACHE_SCOPE=$(printf '%s' "${CACHE_SCOPE_INPUT}" | sha256_16_stdin) + docker buildx build \ ${BUILDER_ARGS[@]+"${BUILDER_ARGS[@]}"} \ ${DOCKER_PLATFORM:+--platform ${DOCKER_PLATFORM}} \ ${CACHE_ARGS[@]+"${CACHE_ARGS[@]}"} \ + ${PROFILE_ARGS[@]+"${PROFILE_ARGS[@]}"} \ ${VERSION_ARGS[@]+"${VERSION_ARGS[@]}"} \ + --build-arg "CARGO_TARGET_CACHE_SCOPE=${CARGO_TARGET_CACHE_SCOPE}" \ -f deploy/docker/Dockerfile.cluster \ -t ${IMAGE_NAME}:${IMAGE_TAG} \ ${K3S_VERSION:+--build-arg K3S_VERSION=${K3S_VERSION}} \ diff --git a/tasks/scripts/docker-build-component.sh b/tasks/scripts/docker-build-component.sh index f20d7295..daad0977 100755 --- a/tasks/scripts/docker-build-component.sh +++ b/tasks/scripts/docker-build-component.sh @@ -133,6 +133,16 @@ if [[ -n "${SCCACHE_MEMCACHED_ENDPOINT:-}" ]]; then SCCACHE_ARGS=(--build-arg "SCCACHE_MEMCACHED_ENDPOINT=${SCCACHE_MEMCACHED_ENDPOINT}") fi +CARGO_BUILD_PROFILE="${OPENSHELL_CARGO_PROFILE:-}" +if [[ -z "${CARGO_BUILD_PROFILE}" ]]; then + if [[ -n "${CI:-}" ]]; then + CARGO_BUILD_PROFILE="release" + else + CARGO_BUILD_PROFILE="local-fast" + fi +fi +PROFILE_ARGS=(--build-arg "OPENSHELL_CARGO_PROFILE=${CARGO_BUILD_PROFILE}") + VERSION_ARGS=() if [[ -n "${OPENSHELL_CARGO_VERSION:-}" ]]; then VERSION_ARGS=(--build-arg "OPENSHELL_CARGO_VERSION=${OPENSHELL_CARGO_VERSION}") @@ -143,7 +153,7 @@ fi LOCK_HASH=$(sha256_16 Cargo.lock) RUST_SCOPE=${RUST_TOOLCHAIN_SCOPE:-$(detect_rust_scope "${DOCKERFILE}")} -CACHE_SCOPE_INPUT="v1|${COMPONENT}|${VARIANT:-base}|${LOCK_HASH}|${RUST_SCOPE}" 
+CACHE_SCOPE_INPUT="v2|${COMPONENT}|${VARIANT:-base}|${LOCK_HASH}|${RUST_SCOPE}|${CARGO_BUILD_PROFILE}" CARGO_TARGET_CACHE_SCOPE=$(printf '%s' "${CACHE_SCOPE_INPUT}" | sha256_16_stdin) docker buildx build \ @@ -151,6 +161,7 @@ docker buildx build \ ${DOCKER_PLATFORM:+--platform ${DOCKER_PLATFORM}} \ ${CACHE_ARGS[@]+"${CACHE_ARGS[@]}"} \ ${SCCACHE_ARGS[@]+"${SCCACHE_ARGS[@]}"} \ + ${PROFILE_ARGS[@]+"${PROFILE_ARGS[@]}"} \ ${VERSION_ARGS[@]+"${VERSION_ARGS[@]}"} \ --build-arg "CARGO_TARGET_CACHE_SCOPE=${CARGO_TARGET_CACHE_SCOPE}" \ -f "${DOCKERFILE}" \