From 5bae955b35bd6cade48392fcd3b323e73cbb4238 Mon Sep 17 00:00:00 2001 From: Hongxin Liang Date: Mon, 23 Mar 2026 22:59:16 +0100 Subject: [PATCH 1/4] ci: add debug step to capture flyte-sandbox-bundled container logs Temporary debug step to investigate why the k3s-based sandbox container exits with code 1 on GitHub Actions Ubuntu runners. Signed-off-by: Hongxin Liang Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/build.yaml | 39 ++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 22be593f..0ec3361b 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -36,6 +36,45 @@ jobs: username: ${{ secrets.FLYTE_BOT_USERNAME }} password: ${{ secrets.FLYTE_BOT_PAT }} + - name: Debug flyte-sandbox-bundled container + if: ${{ github.ref != 'refs/heads/master' }} + run: | + set -x + echo "=== Docker info ===" + docker info + echo "=== Pulling image ===" + docker pull ghcr.io/flyteorg/flyte-sandbox-bundled:sha-f3ab1b7480bad4072f7ecb695660fdf47032a6c4 + echo "=== Starting container ===" + docker run --privileged --tmpfs /run --tmpfs /var/run \ + --name flyte-sandbox-debug -d \ + ghcr.io/flyteorg/flyte-sandbox-bundled:sha-f3ab1b7480bad4072f7ecb695660fdf47032a6c4 + echo "=== Waiting 120s for container ===" + for i in $(seq 1 12); do + sleep 10 + STATUS=$(docker inspect flyte-sandbox-debug --format '{{.State.Status}}' 2>/dev/null || echo "gone") + echo "[$((i*10))s] Container status: $STATUS" + if [ "$STATUS" = "exited" ] || [ "$STATUS" = "gone" ]; then + echo "=== Container exited early ===" + docker inspect flyte-sandbox-debug 2>/dev/null || true + break + fi + done + echo "=== Container logs (stdout) ===" + docker logs flyte-sandbox-debug 2>/dev/null | tail -100 || true + echo "=== Container logs (stderr) ===" + docker logs flyte-sandbox-debug 2>&1 >/dev/null | tail -100 || true + echo "=== Entrypoint log ===" + docker exec flyte-sandbox-debug cat /var/log/k3d-entrypoints*.log 2>/dev/null || echo "no entrypoint log" + echo "=== k3s process ===" + docker exec flyte-sandbox-debug ps aux 2>/dev/null || echo "cannot exec" + echo "=== kubectl get pods ===" + docker exec flyte-sandbox-debug kubectl get pods -A 2>/dev/null || echo "kubectl failed" + echo "=== Healthcheck ===" + docker exec flyte-sandbox-debug wget -qO- http://localhost:30080/healthcheck 2>/dev/null || echo "healthcheck failed" + echo "=== Cleanup ===" + docker stop flyte-sandbox-debug 2>/dev/null || true + docker rm flyte-sandbox-debug 2>/dev/null || true + - name: Verify with Maven if: ${{ github.ref != 'refs/heads/master' }} run: mvn --batch-mode verify -Pci From 7b1732e6ce4276aa2e6b35908388baa8208a7090 Mon Sep 17 00:00:00 2001 From: Hongxin Liang Date: Mon, 23 Mar 2026 23:02:12 +0100 Subject: [PATCH 2/4] ci: test with --cgroupns=host to fix k3s cgroup v2 crash Container exits in 275ms on CI because CgroupnsMode is "private" by default on Docker 28 / Ubuntu 24.04 with cgroup v2. The cgroupv2 entrypoint script fails to write to /sys/fs/cgroup/cgroup.subtree_control in a private cgroup namespace. Signed-off-by: Hongxin Liang Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/build.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0ec3361b..987c936a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -45,7 +45,8 @@ jobs: echo "=== Pulling image ===" docker pull ghcr.io/flyteorg/flyte-sandbox-bundled:sha-f3ab1b7480bad4072f7ecb695660fdf47032a6c4 echo "=== Starting container ===" - docker run --privileged --tmpfs /run --tmpfs /var/run \ + echo "=== Test 1: with --cgroupns=host ===" + docker run --privileged --cgroupns=host --tmpfs /run --tmpfs /var/run \ --name flyte-sandbox-debug -d \ ghcr.io/flyteorg/flyte-sandbox-bundled:sha-f3ab1b7480bad4072f7ecb695660fdf47032a6c4 echo "=== Waiting 120s for container ===" From 4273b1f96a441cc001bfee59599a031e1421d13a Mon Sep 17 00:00:00 2001 From: Hongxin Liang Date: Mon, 23 Mar 2026 23:04:46 +0100 Subject: [PATCH 3/4] ci: capture entrypoint logs and cgroup state on CI Run the cgroupv2 entrypoint script manually with sh -x to see exactly where it fails. Also copy /var/log from the dead container and inspect the cgroup filesystem. Signed-off-by: Hongxin Liang Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/build.yaml | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 987c936a..f9721b7d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -60,18 +60,15 @@ jobs: break fi done - echo "=== Container logs (stdout) ===" - docker logs flyte-sandbox-debug 2>/dev/null | tail -100 || true - echo "=== Container logs (stderr) ===" - docker logs flyte-sandbox-debug 2>&1 >/dev/null | tail -100 || true - echo "=== Entrypoint log ===" - docker exec flyte-sandbox-debug cat /var/log/k3d-entrypoints*.log 2>/dev/null || echo "no entrypoint log" - echo "=== k3s process ===" - docker exec flyte-sandbox-debug ps aux 2>/dev/null || echo "cannot exec" - echo "=== kubectl get pods ===" - docker exec flyte-sandbox-debug kubectl get pods -A 2>/dev/null || echo "kubectl failed" - echo "=== Healthcheck ===" - docker exec flyte-sandbox-debug wget -qO- http://localhost:30080/healthcheck 2>/dev/null || echo "healthcheck failed" + echo "=== Container logs ===" + docker logs flyte-sandbox-debug 2>&1 | tail -200 || true + echo "=== Entrypoint log (via cp) ===" + docker cp flyte-sandbox-debug:/var/log/ /tmp/flyte-logs 2>/dev/null && find /tmp/flyte-logs -type f -exec echo "--- {} ---" \; -exec cat {} \; || echo "no logs to copy" + echo "=== /sys/fs/cgroup inside container ===" + docker run --rm --privileged --cgroupns=host --tmpfs /run --tmpfs /var/run \ + --entrypoint sh \ + ghcr.io/flyteorg/flyte-sandbox-bundled:sha-f3ab1b7480bad4072f7ecb695660fdf47032a6c4 \ + -c "ls -la /sys/fs/cgroup/ && cat /sys/fs/cgroup/cgroup.controllers 2>/dev/null && echo '---' && cat /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null; echo '=== running entrypoint manually ==='; sh -x /bin/k3d-entrypoint-cgroupv2.sh 2>&1; echo 'exit code:' \$?" || true echo "=== Cleanup ===" docker stop flyte-sandbox-debug 2>/dev/null || true docker rm flyte-sandbox-debug 2>/dev/null || true From 79fdac6ed99d248920c6c409f7afbc46425f18ee Mon Sep 17 00:00:00 2001 From: Hongxin Liang Date: Mon, 23 Mar 2026 23:07:08 +0100 Subject: [PATCH 4/4] ci: add --add-host host.docker.internal:host-gateway Bootstrap fails with "lookup host.docker.internal: no such host" on Linux CI. Docker Desktop on Mac adds this automatically, but Linux Docker needs --add-host explicitly. Signed-off-by: Hongxin Liang Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/build.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index f9721b7d..d17db329 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -45,8 +45,9 @@ jobs: echo "=== Pulling image ===" docker pull ghcr.io/flyteorg/flyte-sandbox-bundled:sha-f3ab1b7480bad4072f7ecb695660fdf47032a6c4 echo "=== Starting container ===" - echo "=== Test 1: with --cgroupns=host ===" + echo "=== Test: with --add-host ===" docker run --privileged --cgroupns=host --tmpfs /run --tmpfs /var/run \ + --add-host "host.docker.internal:host-gateway" \ --name flyte-sandbox-debug -d \ ghcr.io/flyteorg/flyte-sandbox-bundled:sha-f3ab1b7480bad4072f7ecb695660fdf47032a6c4 echo "=== Waiting 120s for container ==="