# Merge Queue Checks for refs/heads/main (run #152)
name: Merge Queue Checks
run-name: Merge Queue Checks for ${{ github.ref }}

# This workflow is called from 'general.yml' (so that it can depend on artifacts from other jobs).
# NOTE(review): per GitHub's reusable-workflow docs, the 'github' context inside a called workflow
# is that of the *caller*, so 'github.event_name' holds the caller's triggering event (e.g.
# 'merge_group'); it is never the string 'workflow_call'. The comparison below was previously
# 'github.event_name == 'workflow_call'', which can never be true, so the concurrency group always
# fell through to 'github.run_id' and nothing was ever cancelled. Comparing against 'merge_group'
# restores the documented intent — confirm against general.yml's triggers.
on:
  workflow_dispatch:
  workflow_call:
  schedule:
    - cron: "0 0 * * *" # Runs at 00:00 UTC every day

# When triggered from the merge queue (via the workflow call in 'general.yml'), cancel any existing
# workflow runs for the same PR branch.
# Otherwise, use the unique run id for the concurrency group, to prevent anything from getting cancelled.
concurrency:
  group: ${{ github.event_name == 'merge_group' && format('{0}-{1}', github.workflow, github.ref) || github.run_id }}
  cancel-in-progress: true
# Credentials and configuration shared by every job in this workflow. All provider secrets are
# exposed as environment variables so the test harnesses can read them directly.
# (Expression spacing normalized to '${{ secrets.X }}' throughout for consistency — no behavior change.)
env:
  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
  AWS_REGION: "us-east-1"
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
  AZURE_AI_FOUNDRY_API_KEY: ${{ secrets.AZURE_AI_FOUNDRY_API_KEY }}
  AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }}
  AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
  AZURE_OPENAI_DEPLOYMENT_ID: ${{ secrets.AZURE_OPENAI_DEPLOYMENT_ID }}
  DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
  # Local Postgres started by the E2E docker-compose file (same URL as TENSORZERO_POSTGRES_URL below).
  DATABASE_URL: postgresql://postgres:postgres@localhost:5432/tensorzero-e2e-tests
  FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
  FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
  FORCE_COLOR: 1
  GCP_STORAGE_ACCESS_KEY_ID: ${{ secrets.GCP_STORAGE_ACCESS_KEY_ID }}
  GCP_STORAGE_SECRET_ACCESS_KEY: ${{ secrets.GCP_STORAGE_SECRET_ACCESS_KEY }}
  # Written to disk by the "Write GCP JWT key to file" step in the live-tests job.
  GCP_VERTEX_CREDENTIALS_PATH: ${{ github.workspace }}/gcp_jwt_key.json
  GOOGLE_AI_STUDIO_API_KEY: ${{ secrets.GOOGLE_AI_STUDIO_API_KEY }}
  GOOGLE_APPLICATION_CREDENTIALS: ${{ github.workspace }}/gcp_jwt_key.json
  GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
  HYPERBOLIC_API_KEY: ${{ secrets.HYPERBOLIC_API_KEY }}
  MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
  MODAL_KEY: ${{ secrets.MODAL_KEY }}
  MODAL_SECRET: ${{ secrets.MODAL_SECRET }}
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
  R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
  R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
  SGLANG_API_KEY: ${{ secrets.SGLANG_API_KEY }}
  TENSORZERO_POSTGRES_URL: postgresql://postgres:postgres@localhost:5432/tensorzero-e2e-tests
  TGI_API_KEY: ${{ secrets.TGI_API_KEY }}
  TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
  VLLM_API_BASE: ${{ secrets.VLLM_API_BASE }}
  VLLM_API_KEY: ${{ secrets.VLLM_API_KEY }}
  VLLM_MODEL_NAME: "microsoft/Phi-3.5-mini-instruct"
  VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
  XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
  OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: https://localhost:4316
  # Use the checked-in sqlx query metadata instead of a live database at compile time.
  SQLX_OFFLINE: 1
  TENSORZERO_E2E_PROXY: http://localhost:3003
jobs:
  # Delegates to the reusable batch-test workflow; inherits all secrets defined on this workflow.
  batch-tests:
    permissions:
      contents: read
    # Skip on forks.
    if: github.repository == 'tensorzero/tensorzero'
    uses: ./.github/workflows/batch-test.yml
    secrets: inherit
  # Full live test suite (Rust E2E + client tests) against real model providers,
  # run once per batch_writes matrix leg.
  live-tests:
    name: "live-tests (batch_writes: ${{ matrix.batch_writes }})"
    runs-on: namespace-profile-tensorzero-16x32
    if: github.repository == 'tensorzero/tensorzero'
    permissions:
      # Permission to checkout the repository
      contents: read
      # Permission to fetch GitHub OIDC token authentication
      id-token: write
    timeout-minutes: 45
    strategy:
      matrix:
        # Run the full suite once with ClickHouse batch writes enabled and once without.
        batch_writes: [true, false]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      # gdb is used later to capture backtraces if the Python client test suite hangs.
      - name: Install gdb
        run: sudo apt-get update && sudo apt-get install -y gdb
      # Hit the Modal endpoints early (backgrounded) so the serverless instances are warm
      # by the time the E2E tests need them; responses are kept as log artifacts.
      - name: Warm up Modal instances
        run: |
          curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--vllm-inference-vllm-inference.modal.run/docs > vllm_modal_logs.txt &
          curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--sglang-0-4-10-inference-sglang-inference.modal.run/ > sglang_modal_logs.txt &
          # TODO: Re-enable once we can switch to a T4 GPU
          # curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--vllm-gpt-oss-20b-serve.modal.run/ > vllm_gpt_oss_modal_logs.txt &
      - name: Cleanup disk space
        run: ./ci/free-disk-space.sh
      # rustup network access can be flaky; retry up to 3 times with linear backoff (10s, 20s).
      - name: Update Rust
        run: |
          for attempt in 1 2 3; do
            if rustup update stable && rustup default stable; then
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "Failed to update Rust after 3 attempts"
              exit 1
            fi
            sleep $((10 * attempt))
          done
        shell: bash
      - name: Configure Namespace-powered Buildx
        uses: namespacelabs/nscloud-setup-buildx-action@91c2e6537780e3b092cb8476406be99a8f91bd5e
        with:
          wait-for-builder: true
      # NOTE(review): this largely repeats the "Update Rust" step above ('rustup update stable'
      # already installs/updates the stable toolchain) — presumably kept as a belt-and-braces
      # second retry; confirm before removing.
      - name: Install Rust toolchain
        run: |
          for attempt in 1 2 3; do
            if rustup toolchain install stable && rustup default stable; then
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "Failed to install Rust toolchain after 3 attempts"
              exit 1
            fi
            sleep $((10 * attempt))
          done
        shell: bash
      # Start testing workload identity federation credentials once the SDK adds support: https://github.com/googleapis/google-cloud-rust/issues/1342
      # - uses: 'google-github-actions/auth@v2'
      #   with:
      #     project_id: 'tensozero-public'
      #     workload_identity_provider: 'projects/454541351720/locations/global/workloadIdentityPools/github/providers/tensorzero'
      - name: Print Rust version
        run: rustc --version
      # uv installer is pinned to 0.6.17; curl retries cover transient network failures.
      - name: Install uv
        run: curl -LsSf --retry 2 --retry-delay 10 --retry-max-time 60 https://astral.sh/uv/0.6.17/install.sh | sh
      - name: Download ClickHouse fixtures
        run: uv run ./ui/fixtures/download-fixtures.py
      # npm registry access can be flaky; retry with the same linear backoff as the rustup steps.
      - name: Install pnpm
        run: |
          for attempt in 1 2 3; do
            if npm install -g pnpm@latest; then
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "Failed to install pnpm after 3 attempts"
              exit 1
            fi
            sleep $((10 * attempt))
          done
        shell: bash
      - name: Configure Namespace cache for Python (uv), and pnpm
        uses: namespacelabs/nscloud-cache-action@2f50e7d0f70475e6f59a55ba0f05eec9108e77cc
        with:
          cache: |
            pnpm
            uv
      - name: Install JS dependencies
        run: pnpm install --frozen-lockfile
      - name: Login to DockerHub
        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Install cargo-nextest
        uses: taiki-e/install-action@d12e869b89167df346dd0ff65da342d1fb1202fb
        with:
          tool: cargo-nextest
      # Materialize the GCP service-account key at the path GCP_VERTEX_CREDENTIALS_PATH /
      # GOOGLE_APPLICATION_CREDENTIALS (defined in the workflow-level env) point to.
      - name: Write GCP JWT key to file
        env:
          GCP_JWT_KEY: ${{ secrets.GCP_JWT_KEY }}
        run: echo "$GCP_JWT_KEY" > $GITHUB_WORKSPACE/gcp_jwt_key.json
      - name: Set up TENSORZERO_CLICKHOUSE_URL for E2E tests
        run: |
          echo "TENSORZERO_CLICKHOUSE_URL=http://chuser:chpassword@localhost:8123/tensorzero_e2e_tests" >> $GITHUB_ENV
          echo "TENSORZERO_CLICKHOUSE_BATCH_WRITES=${{ matrix.batch_writes }}" >> $GITHUB_ENV
      # For the batch_writes=true matrix leg only: append the batch-write settings to the
      # E2E gateway config before it is launched.
      - name: Configure batch writes in tensorzero.toml
        if: matrix.batch_writes == true
        run: |
          echo "[gateway.observability.batch_writes]" >> tensorzero-core/tests/e2e/tensorzero.toml
          echo "enabled = true" >> tensorzero-core/tests/e2e/tensorzero.toml
          echo "flush_interval_ms = 80" >> tensorzero-core/tests/e2e/tensorzero.toml
          echo "__force_allow_embedded_batch_writes = true" >> tensorzero-core/tests/e2e/tensorzero.toml
      - name: Download provider-proxy cache
        # When running as a cron job, don't use the provider-proxy cache.
        # The cron job is used to gather information about provider flakiness.
        if: github.event_name != 'schedule'
        run: |
          AWS_ACCESS_KEY_ID=$R2_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$R2_SECRET_ACCESS_KEY ./ci/download-provider-proxy-cache.sh
| - name: Launch dependency services for E2E tests | |
| run: | | |
| docker compose -f tensorzero-core/tests/e2e/docker-compose.yml up --build -d --wait | |
| - name: Print ClickHouse container logs | |
| if: always() | |
| run: | | |
| docker compose -f tensorzero-core/tests/e2e/docker-compose.yml logs -t | |
| - name: Launch the provider-proxy cache for E2E tests | |
| run: | | |
| ./ci/run-provider-proxy.sh ci | |
| # TODO - get rid of this when the merge queue has a freshly-build gateway image available | |
| - name: Manually run the latest postgres migrations | |
| run: cargo run-e2e --run-postgres-migrations | |
| - name: Launch the gateway for E2E tests | |
| timeout-minutes: 2 | |
| run: | | |
| cargo run-e2e > e2e_logs.txt 2>&1 & | |
| while ! curl -s -f http://localhost:3000/health >/dev/null 2>&1; do | |
| echo "Waiting for gateway to be healthy..." | |
| sleep 1 | |
| done | |
| echo "GATEWAY_PID=$!" >> $GITHUB_ENV | |
      # The 'CARGO_NEXTEST_FLAKY_TESTS' variable allows us to mark tests as flaky without merging a PR (if a provider happens to break or goes down)
      # We run the tests without the flaky tests, and require them to pass
      - name: Run all tests (including E2E tests)
        # When running from a cron job (the 'schedule' event), use '--no-fail-fast' so that we get full coverage of flaky providers.
        run: |
          cargo test-e2e ${{ vars.CARGO_NEXTEST_EXTRA_ARGS }} -E "not (${{ vars.CARGO_NEXTEST_FLAKY_TESTS }})" ${{ github.event_name == 'schedule' && '--no-fail-fast' || '' }}
      # TODO(https://github.com/tensorzero/tensorzero/issues/3989) - move this back to the end of the job
      # For now, we only check for deprecation warnings after running the Rust e2e tests
      - name: Check e2e logs for deprecation warnings (gateway e2e tests only)
        run: |
          # The step fails iff grep finds a match ('!' inverts grep's exit status).
          ! grep -i "Deprecation Warning" e2e_logs.txt
      # As a separate step, we run just the flaky tests, and allow them to fail.
      # This lets us see if any flaky tests have started succeeding (by looking at the job output),
      # so that we can decide to mark them as non-flaky.
      # - name: Run flaky E2E tests
      #   run: |
      #     cargo test-e2e --no-fail-fast ${{ vars.CARGO_NEXTEST_EXTRA_ARGS }} -E "${{ vars.CARGO_NEXTEST_FLAKY_TESTS }}"
      #   continue-on-error: true
      - name: Install Python for python async client tests
        run: uv python install 3.9
      # Run the Python client tests under a 5-minute watchdog: if the suite is still running
      # after 300s, dump gdb backtraces of every suspect Python process and fail the step.
      - name: "Python: PyO3 Client: pytest"
        working-directory: clients/python
        run: |
          # Start the test in background and capture its PID
          bash ./test.sh --verbose &
          TEST_PID=$!
          echo "Started test.sh with PID: $TEST_PID"
          # Wait for 5 minutes (300 seconds)
          for i in {1..300}; do
            if ! kill -0 $TEST_PID 2>/dev/null; then
              echo "Test completed normally"
              # Propagate the suite's real exit status.
              wait $TEST_PID
              exit $?
            fi
            sleep 1
          done
          echo "Test has been running for 5 minutes, capturing backtraces..."
          # Get all processes related to our test
          echo "=== Process tree ==="
          ps -ef | grep -E "(test\.sh|pytest|python)" | grep -v grep || true
          echo "=== Capturing backtraces with gdb ==="
          # Find all python processes that might be related to our test
          PYTHON_PIDS=$(pgrep -f "tensorzero.*python" || true)
          if [ -n "$PYTHON_PIDS" ]; then
            for pid in $PYTHON_PIDS; do
              echo "--- Backtrace for Python process $pid ---"
              gdb -p $pid --batch \
                -ex "set pagination off" \
                -ex "thread apply all bt" \
                -ex "info threads" \
                -ex "detach" \
                -ex "quit" 2>&1 || true
              echo ""
            done
          else
            echo "No Python processes found"
          fi
          # Watchdog tripped: fail the step.
          exit 1
| - name: "Node.js: OpenAI Client: test" | |
| working-directory: clients/openai-node | |
| run: | | |
| pnpm run test | |
| - name: Install Go | |
| uses: actions/setup-go@29694d72cd5e7ef3b09496b39f28a942af47737e | |
| with: | |
| go-version: "1.24" | |
| - name: "Go: OpenAI Client: test" | |
| working-directory: clients/openai-go/tests | |
| run: go test -v | |
| - name: "Python: Recipes: pytest" | |
| working-directory: recipes | |
| run: | | |
| uv run pytest | |
| - name: Terminate the gateway and wait for it to exit | |
| if: always() | |
| run: | | |
| echo "Killing gateway with pid $GATEWAY_PID" | |
| kill $GATEWAY_PID | |
| # Wait for at most 30 seconds for the gateway to exit | |
| for i in {1..30}; do | |
| if ! kill -0 $GATEWAY_PID 2>/dev/null; then | |
| echo "Gateway exited" | |
| break | |
| fi | |
| sleep 1 | |
| done | |
| if kill -0 $GATEWAY_PID 2>/dev/null; then | |
| echo "Gateway did not exit after 30 seconds!" | |
| exit 1 | |
| fi | |
| - name: Print e2e logs | |
| if: always() | |
| run: cat e2e_logs.txt | |
| - name: Print provider-proxy logs | |
| if: always() | |
| run: cat provider_proxy_logs.txt | |
| - name: Print vLLM modal logs | |
| if: always() | |
| run: cat vllm_modal_logs.txt | |
| - name: Print SGLang modal logs | |
| if: always() | |
| run: cat sglang_modal_logs.txt | |
| - name: Print vLLM GPT-OSS modal logs | |
| if: always() | |
| continue-on-error: true | |
| run: cat vllm_gpt_oss_modal_logs.txt | |
| - name: Upload provider-proxy cache | |
| # Only upload the cache when we're running from a 'good' run | |
| # (running from the merge queue via 'workflow_call' from general.yml, or a cron job) | |
| # This prevents manual workflow runs from modifying the cache | |
| if: github.event_name == 'workflow_call' || github.event_name == 'schedule' | |
| run: | | |
| AWS_ACCESS_KEY_ID=$R2_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$R2_SECRET_ACCESS_KEY ./ci/upload-provider-proxy-cache.sh | |
  # Builds the production gateway image and smoke-tests the documented production
  # deployment (examples/production-deployment) against a live ClickHouse container.
  check-production-docker-container:
    permissions:
      contents: read
    runs-on: ubuntu-latest
    if: github.repository == 'tensorzero/tensorzero'
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      - name: Build Docker container for production deployment tests
        run: docker build -t tensorzero/gateway -f gateway/Dockerfile .
      - name: Launch ClickHouse container for E2E tests
        run: |
          docker compose -f tensorzero-core/tests/e2e/docker-compose.yml up clickhouse -d --wait
      # The deployment's compose file reads its configuration from this .env file;
      # ClickHouse is reached via host.docker.internal from inside the gateway container.
      - name: Set up .env file for production deployment tests
        run: |
          echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" > examples/production-deployment/.env
          echo "TENSORZERO_CLICKHOUSE_URL=http://chuser:chpassword@host.docker.internal:8123/tensorzero" >> examples/production-deployment/.env
          echo "TENSORZERO_DISABLE_PSEUDONYMOUS_USAGE_ANALYTICS=1" >> examples/production-deployment/.env
      - name: Run docker compose for production deployment tests
        run: docker compose -f examples/production-deployment/docker-compose.yml up -d --wait
      - name: Run inference for production deployment tests
        run: examples/production-deployment/run.sh
      - name: Print Docker compose logs
        if: always()
        run: |
          docker compose -f examples/production-deployment/docker-compose.yml logs -t
      - name: Take down docker compose for production deployment tests
        run: |
          docker compose -f examples/production-deployment/docker-compose.yml down
          docker compose -f tensorzero-core/tests/e2e/docker-compose.yml down
  # Test that the ui e2e tests still pass after we regenerate the model inference cache
  ui-tests-e2e-regen-model-inference-cache:
    permissions:
      contents: read
      actions: write
    if: github.repository == 'tensorzero/tensorzero'
    uses: ./.github/workflows/ui-tests-e2e-model-inference-cache.yml
    with:
      regen_cache: true
      is_merge_group: true
    secrets:
      # The reusable workflow expects generic S3 credential names; map our AWS secrets onto them.
      S3_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
      FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
| # See 'ci/README.md' at the repository root for more details. | |
| check-all-tests-passed: | |
| permissions: {} | |
| if: always() && github.repository == 'tensorzero/tensorzero' | |
| needs: | |
| [ | |
| check-production-docker-container, | |
| ui-tests-e2e-regen-model-inference-cache, | |
| live-tests, | |
| batch-tests, | |
| ] | |
| runs-on: ubuntu-latest | |
| steps: | |
| # When running in the merge queue, jobs should never be skipped. | |
| # In a scheduled run, some jobs may be intentionally skipped, as we only care about regenerating the model inference cache. | |
| - if: ${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') || (github.event_name == 'workflow_call' && contains(needs.*.result, 'skipped')) }} | |
| run: exit 1 |