# NOTE(review): the lines below are GitHub web-UI page chrome that was captured along with
# the workflow file; they are not YAML and are preserved here as comments only.
#   Skip to content
#   Merge Queue Checks for refs/heads/main #152
#   Merge Queue Checks for refs/heads/main
#   Merge Queue Checks for refs/heads/main #152
#   Workflow file for this run
name: Merge Queue Checks
run-name: Merge Queue Checks for ${{ github.ref }}
# This workflow is called from 'general.yml' (so that it can depend on artifacts from other jobs)
# It is *not* invoked directly via a 'merge_group' event, so checking for 'github.event_name == 'merge_group' will not work
on:
  workflow_dispatch:
  workflow_call:
  schedule:
    - cron: "0 0 * * *" # Runs at 00:00 UTC every day

# When triggered from the merge queue (via the workflow call in 'general.yml'), cancel any existing workflow runs for the same PR branch
# Otherwise, use the unique run id for the concurrency group, to prevent anything from getting cancelled
# NOTE(review): per GitHub's reusable-workflow docs, the 'github' context of a called workflow is
# that of the *caller*, so 'github.event_name' is never the string 'workflow_call' — this condition
# appears to always fall through to 'github.run_id'. Confirm the intended trigger name (likely
# 'merge_group') before relying on per-ref cancellation.
concurrency:
  group: ${{ github.event_name == 'workflow_call' && format('{0}-{1}', github.workflow, github.ref) || github.run_id }}
  cancel-in-progress: true

# Provider credentials and service endpoints shared by every job in this workflow.
env:
  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
  AWS_REGION: "us-east-1"
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
  AZURE_AI_FOUNDRY_API_KEY: ${{ secrets.AZURE_AI_FOUNDRY_API_KEY }}
  AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }}
  AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
  AZURE_OPENAI_DEPLOYMENT_ID: ${{ secrets.AZURE_OPENAI_DEPLOYMENT_ID }}
  DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
  DATABASE_URL: postgresql://postgres:postgres@localhost:5432/tensorzero-e2e-tests
  FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
  FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
  FORCE_COLOR: 1
  GCP_STORAGE_ACCESS_KEY_ID: ${{ secrets.GCP_STORAGE_ACCESS_KEY_ID }}
  GCP_STORAGE_SECRET_ACCESS_KEY: ${{ secrets.GCP_STORAGE_SECRET_ACCESS_KEY }}
  GCP_VERTEX_CREDENTIALS_PATH: ${{ github.workspace }}/gcp_jwt_key.json
  GOOGLE_AI_STUDIO_API_KEY: ${{ secrets.GOOGLE_AI_STUDIO_API_KEY }}
  GOOGLE_APPLICATION_CREDENTIALS: ${{ github.workspace }}/gcp_jwt_key.json
  GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
  HYPERBOLIC_API_KEY: ${{ secrets.HYPERBOLIC_API_KEY }}
  MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
  MODAL_KEY: ${{ secrets.MODAL_KEY }}
  MODAL_SECRET: ${{ secrets.MODAL_SECRET }}
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
  R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
  R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
  SGLANG_API_KEY: ${{ secrets.SGLANG_API_KEY }}
  TENSORZERO_POSTGRES_URL: postgresql://postgres:postgres@localhost:5432/tensorzero-e2e-tests
  TGI_API_KEY: ${{ secrets.TGI_API_KEY }}
  TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
  VLLM_API_BASE: ${{ secrets.VLLM_API_BASE }}
  VLLM_API_KEY: ${{ secrets.VLLM_API_KEY }}
  VLLM_MODEL_NAME: "microsoft/Phi-3.5-mini-instruct"
  VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
  XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
  OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: https://localhost:4316
  SQLX_OFFLINE: 1
  TENSORZERO_E2E_PROXY: http://localhost:3003
jobs:
  # Reusable batch-inference test suite; inherits all workflow secrets.
  batch-tests:
    permissions:
      contents: read
    if: github.repository == 'tensorzero/tensorzero'
    uses: ./.github/workflows/batch-test.yml
    secrets: inherit
live-tests:
name: "live-tests (batch_writes: ${{ matrix.batch_writes }})"
runs-on: namespace-profile-tensorzero-16x32
if: github.repository == 'tensorzero/tensorzero'
permissions:
# Permission to checkout the repository
contents: read
# Permission to fetch GitHub OIDC token authentication
id-token: write
timeout-minutes: 45
strategy:
matrix:
batch_writes: [true, false]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
- name: Install gdb
run: sudo apt-get update && sudo apt-get install -y gdb
- name: Warm up Modal instances
run: |
curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--vllm-inference-vllm-inference.modal.run/docs > vllm_modal_logs.txt &
curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--sglang-0-4-10-inference-sglang-inference.modal.run/ > sglang_modal_logs.txt &
# TODO: Re-enable once we can switch to a T4 GPU
# curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--vllm-gpt-oss-20b-serve.modal.run/ > vllm_gpt_oss_modal_logs.txt &
- name: Cleanup disk space
run: ./ci/free-disk-space.sh
- name: Update Rust
run: |
for attempt in 1 2 3; do
if rustup update stable && rustup default stable; then
break
fi
if [ $attempt -eq 3 ]; then
echo "Failed to update Rust after 3 attempts"
exit 1
fi
sleep $((10 * attempt))
done
shell: bash
- name: Configure Namespace-powered Buildx
uses: namespacelabs/nscloud-setup-buildx-action@91c2e6537780e3b092cb8476406be99a8f91bd5e
with:
wait-for-builder: true
- name: Install Rust toolchain
run: |
for attempt in 1 2 3; do
if rustup toolchain install stable && rustup default stable; then
break
fi
if [ $attempt -eq 3 ]; then
echo "Failed to install Rust toolchain after 3 attempts"
exit 1
fi
sleep $((10 * attempt))
done
shell: bash
# Start testing workload identity federation credentials once the SDK adds support: https://github.com/googleapis/google-cloud-rust/issues/1342
# - uses: 'google-github-actions/auth@v2'
# with:
# project_id: 'tensozero-public'
# workload_identity_provider: 'projects/454541351720/locations/global/workloadIdentityPools/github/providers/tensorzero'
- name: Print Rust version
run: rustc --version
- name: Install uv
run: curl -LsSf --retry 2 --retry-delay 10 --retry-max-time 60 https://astral.sh/uv/0.6.17/install.sh | sh
- name: Download ClickHouse fixtures
run: uv run ./ui/fixtures/download-fixtures.py
- name: Install pnpm
run: |
for attempt in 1 2 3; do
if npm install -g pnpm@latest; then
break
fi
if [ $attempt -eq 3 ]; then
echo "Failed to install pnpm after 3 attempts"
exit 1
fi
sleep $((10 * attempt))
done
shell: bash
- name: Configure Namespace cache for Python (uv), and pnpm
uses: namespacelabs/nscloud-cache-action@2f50e7d0f70475e6f59a55ba0f05eec9108e77cc
with:
cache: |
pnpm
uv
- name: Install JS dependencies
run: pnpm install --frozen-lockfile
- name: Login to DockerHub
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Install cargo-nextest
uses: taiki-e/install-action@d12e869b89167df346dd0ff65da342d1fb1202fb
with:
tool: cargo-nextest
- name: Write GCP JWT key to file
env:
GCP_JWT_KEY: ${{ secrets.GCP_JWT_KEY }}
run: echo "$GCP_JWT_KEY" > $GITHUB_WORKSPACE/gcp_jwt_key.json
- name: Set up TENSORZERO_CLICKHOUSE_URL for E2E tests
run: |
echo "TENSORZERO_CLICKHOUSE_URL=http://chuser:chpassword@localhost:8123/tensorzero_e2e_tests" >> $GITHUB_ENV
echo "TENSORZERO_CLICKHOUSE_BATCH_WRITES=${{ matrix.batch_writes }}" >> $GITHUB_ENV
- name: Configure batch writes in tensorzero.toml
if: matrix.batch_writes == true
run: |
echo "[gateway.observability.batch_writes]" >> tensorzero-core/tests/e2e/tensorzero.toml
echo "enabled = true" >> tensorzero-core/tests/e2e/tensorzero.toml
echo "flush_interval_ms = 80" >> tensorzero-core/tests/e2e/tensorzero.toml
echo "__force_allow_embedded_batch_writes = true" >> tensorzero-core/tests/e2e/tensorzero.toml
- name: Download provider-proxy cache
# When running as a cron job, don't use the provider-proxy cache.
# The cron job is used to gather information about provider flakiness.
if: github.event_name != 'schedule'
run: |
AWS_ACCESS_KEY_ID=$R2_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$R2_SECRET_ACCESS_KEY ./ci/download-provider-proxy-cache.sh
- name: Launch dependency services for E2E tests
run: |
docker compose -f tensorzero-core/tests/e2e/docker-compose.yml up --build -d --wait
- name: Print ClickHouse container logs
if: always()
run: |
docker compose -f tensorzero-core/tests/e2e/docker-compose.yml logs -t
- name: Launch the provider-proxy cache for E2E tests
run: |
./ci/run-provider-proxy.sh ci
# TODO - get rid of this when the merge queue has a freshly-build gateway image available
- name: Manually run the latest postgres migrations
run: cargo run-e2e --run-postgres-migrations
- name: Launch the gateway for E2E tests
timeout-minutes: 2
run: |
cargo run-e2e > e2e_logs.txt 2>&1 &
while ! curl -s -f http://localhost:3000/health >/dev/null 2>&1; do
echo "Waiting for gateway to be healthy..."
sleep 1
done
echo "GATEWAY_PID=$!" >> $GITHUB_ENV
# The 'CARGO_NEXTEST_FLAKY_TESTS' variable allows us to mark tests as flaky without merging a PR (if a provider happens to break or goes down)
# We run the tests without the flaky tests, and require them to pass
- name: Run all tests (including E2E tests)
# When running from a cron job (the 'schedule' event), use '--no-fail-fast' so that we get full coverage of flaky providers.
run: |
cargo test-e2e ${{ vars.CARGO_NEXTEST_EXTRA_ARGS }} -E "not (${{ vars.CARGO_NEXTEST_FLAKY_TESTS }})" ${{ github.event_name == 'schedule' && '--no-fail-fast' || '' }}
# TODO(https://github.com/tensorzero/tensorzero/issues/3989) - move this back to the end of the job
# For now, we only check for deprecation warnings after running the Rust e2e tests
- name: Check e2e logs for deprecation warnings (gateway e2e tests only)
run: |
! grep -i "Deprecation Warning" e2e_logs.txt
# As a separate step, we run just the flaky tests, and allow them to fail.
# This lets us see if any flaky tests have started succeeding (by looking at the job output),
# so that we can decide to mark them as non-flaky.
# - name: Run flaky E2E tests
# run: |
# cargo test-e2e --no-fail-fast ${{ vars.CARGO_NEXTEST_EXTRA_ARGS }} -E "${{ vars.CARGO_NEXTEST_FLAKY_TESTS }}"
# continue-on-error: true
- name: Install Python for python async client tests
run: uv python install 3.9
- name: "Python: PyO3 Client: pytest"
working-directory: clients/python
run: |
# Start the test in background and capture its PID
bash ./test.sh --verbose &
TEST_PID=$!
echo "Started test.sh with PID: $TEST_PID"
# Wait for 5 minutes (300 seconds)
for i in {1..300}; do
if ! kill -0 $TEST_PID 2>/dev/null; then
echo "Test completed normally"
wait $TEST_PID
exit $?
fi
sleep 1
done
echo "Test has been running for 5 minutes, capturing backtraces..."
# Get all processes related to our test
echo "=== Process tree ==="
ps -ef | grep -E "(test\.sh|pytest|python)" | grep -v grep || true
echo "=== Capturing backtraces with gdb ==="
# Find all python processes that might be related to our test
PYTHON_PIDS=$(pgrep -f "tensorzero.*python" || true)
if [ -n "$PYTHON_PIDS" ]; then
for pid in $PYTHON_PIDS; do
echo "--- Backtrace for Python process $pid ---"
gdb -p $pid --batch \
-ex "set pagination off" \
-ex "thread apply all bt" \
-ex "info threads" \
-ex "detach" \
-ex "quit" 2>&1 || true
echo ""
done
else
echo "No Python processes found"
fi
exit 1
- name: "Node.js: OpenAI Client: test"
working-directory: clients/openai-node
run: |
pnpm run test
- name: Install Go
uses: actions/setup-go@29694d72cd5e7ef3b09496b39f28a942af47737e
with:
go-version: "1.24"
- name: "Go: OpenAI Client: test"
working-directory: clients/openai-go/tests
run: go test -v
- name: "Python: Recipes: pytest"
working-directory: recipes
run: |
uv run pytest
- name: Terminate the gateway and wait for it to exit
if: always()
run: |
echo "Killing gateway with pid $GATEWAY_PID"
kill $GATEWAY_PID
# Wait for at most 30 seconds for the gateway to exit
for i in {1..30}; do
if ! kill -0 $GATEWAY_PID 2>/dev/null; then
echo "Gateway exited"
break
fi
sleep 1
done
if kill -0 $GATEWAY_PID 2>/dev/null; then
echo "Gateway did not exit after 30 seconds!"
exit 1
fi
- name: Print e2e logs
if: always()
run: cat e2e_logs.txt
- name: Print provider-proxy logs
if: always()
run: cat provider_proxy_logs.txt
- name: Print vLLM modal logs
if: always()
run: cat vllm_modal_logs.txt
- name: Print SGLang modal logs
if: always()
run: cat sglang_modal_logs.txt
- name: Print vLLM GPT-OSS modal logs
if: always()
continue-on-error: true
run: cat vllm_gpt_oss_modal_logs.txt
- name: Upload provider-proxy cache
# Only upload the cache when we're running from a 'good' run
# (running from the merge queue via 'workflow_call' from general.yml, or a cron job)
# This prevents manual workflow runs from modifying the cache
if: github.event_name == 'workflow_call' || github.event_name == 'schedule'
run: |
AWS_ACCESS_KEY_ID=$R2_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$R2_SECRET_ACCESS_KEY ./ci/upload-provider-proxy-cache.sh
check-production-docker-container:
permissions:
contents: read
runs-on: ubuntu-latest
if: github.repository == 'tensorzero/tensorzero'
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
- name: Build Docker container for production deployment tests
run: docker build -t tensorzero/gateway -f gateway/Dockerfile .
- name: Launch ClickHouse container for E2E tests
run: |
docker compose -f tensorzero-core/tests/e2e/docker-compose.yml up clickhouse -d --wait
- name: Set up .env file for production deployment tests
run: |
echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" > examples/production-deployment/.env
echo "TENSORZERO_CLICKHOUSE_URL=http://chuser:chpassword@host.docker.internal:8123/tensorzero" >> examples/production-deployment/.env
echo "TENSORZERO_DISABLE_PSEUDONYMOUS_USAGE_ANALYTICS=1" >> examples/production-deployment/.env
- name: Run docker compose for production deployment tests
run: docker compose -f examples/production-deployment/docker-compose.yml up -d --wait
- name: Run inference for production deployment tests
run: examples/production-deployment/run.sh
- name: Print Docker compose logs
if: always()
run: |
docker compose -f examples/production-deployment/docker-compose.yml logs -t
- name: Take down docker compose for production deployment tests
run: |
docker compose -f examples/production-deployment/docker-compose.yml down
docker compose -f tensorzero-core/tests/e2e/docker-compose.yml down
# Test that the ui e2e tests still pass after we regenerate the model inference cache
ui-tests-e2e-regen-model-inference-cache:
permissions:
contents: read
actions: write
if: github.repository == 'tensorzero/tensorzero'
uses: ./.github/workflows/ui-tests-e2e-model-inference-cache.yml
with:
regen_cache: true
is_merge_group: true
secrets:
S3_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
# See 'ci/README.md' at the repository root for more details.
check-all-tests-passed:
permissions: {}
if: always() && github.repository == 'tensorzero/tensorzero'
needs:
[
check-production-docker-container,
ui-tests-e2e-regen-model-inference-cache,
live-tests,
batch-tests,
]
runs-on: ubuntu-latest
steps:
# When running in the merge queue, jobs should never be skipped.
# In a scheduled run, some jobs may be intentionally skipped, as we only care about regenerating the model inference cache.
- if: ${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') || (github.event_name == 'workflow_call' && contains(needs.*.result, 'skipped')) }}
run: exit 1