Merge Queue Checks for refs/heads/main #152

Workflow file for this run

.github/workflows/merge-queue.yml at 6f56daf

	name: Merge Queue Checks
	run-name: Merge Queue Checks for ${{ github.ref }}

	# This workflow is called from 'general.yml' (so that it can depend on artifacts from other jobs)
	# It is not invoked directly via a 'merge_group' event, so checking for 'github.event_name == 'merge_group' will not work
	on:
	workflow_dispatch:
	workflow_call:
	schedule:
	- cron: "0 0 * * *" # Runs at 00:00 UTC every day

	# When triggered from the merge queue (via the workflow call in 'general.yml'), cancel any existing workflow runs for the same PR branch
	# Otherwise, use the unique run id for the concurrency group, to prevent anything from getting cancelled
	concurrency:
	group: ${{ github.event_name == 'workflow_call' && format('{0}-{1}', github.workflow, github.ref) \|\| github.run_id }}
	cancel-in-progress: true

	env:
	ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
	AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
	AWS_REGION: "us-east-1"
	AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	AZURE_AI_FOUNDRY_API_KEY: ${{ secrets.AZURE_AI_FOUNDRY_API_KEY }}
	AZURE_OPENAI_API_BASE: ${{secrets.AZURE_OPENAI_API_BASE }}
	AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
	AZURE_OPENAI_DEPLOYMENT_ID: ${{secrets.AZURE_OPENAI_DEPLOYMENT_ID }}
	DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
	DATABASE_URL: postgresql://postgres:postgres@localhost:5432/tensorzero-e2e-tests
	FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
	FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
	FORCE_COLOR: 1
	GCP_STORAGE_ACCESS_KEY_ID: ${{ secrets.GCP_STORAGE_ACCESS_KEY_ID }}
	GCP_STORAGE_SECRET_ACCESS_KEY: ${{ secrets.GCP_STORAGE_SECRET_ACCESS_KEY }}
	GCP_VERTEX_CREDENTIALS_PATH: ${{ github.workspace }}/gcp_jwt_key.json
	GOOGLE_AI_STUDIO_API_KEY: ${{ secrets.GOOGLE_AI_STUDIO_API_KEY }}
	GOOGLE_APPLICATION_CREDENTIALS: ${{ github.workspace }}/gcp_jwt_key.json
	GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
	HYPERBOLIC_API_KEY: ${{secrets.HYPERBOLIC_API_KEY}}
	MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
	MODAL_KEY: ${{ secrets.MODAL_KEY }}
	MODAL_SECRET: ${{ secrets.MODAL_SECRET }}
	OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
	OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
	R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
	R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
	SGLANG_API_KEY: ${{ secrets.SGLANG_API_KEY }}
	TENSORZERO_POSTGRES_URL: postgresql://postgres:postgres@localhost:5432/tensorzero-e2e-tests
	TGI_API_KEY: ${{ secrets.TGI_API_KEY }}
	TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
	VLLM_API_BASE: ${{ secrets.VLLM_API_BASE }}
	VLLM_API_KEY: ${{ secrets.VLLM_API_KEY }}
	VLLM_MODEL_NAME: "microsoft/Phi-3.5-mini-instruct"
	VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
	XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
	OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: https://localhost:4316
	SQLX_OFFLINE: 1
	TENSORZERO_E2E_PROXY: http://localhost:3003

	jobs:
	batch-tests:
	permissions:
	contents: read
	if: github.repository == 'tensorzero/tensorzero'
	uses: ./.github/workflows/batch-test.yml
	secrets: inherit

	live-tests:
	name: "live-tests (batch_writes: ${{ matrix.batch_writes }})"
	runs-on: namespace-profile-tensorzero-16x32
	if: github.repository == 'tensorzero/tensorzero'
	permissions:
	# Permission to checkout the repository
	contents: read
	# Permission to fetch GitHub OIDC token authentication
	id-token: write
	timeout-minutes: 45
	strategy:
	matrix:
	batch_writes: [true, false]

	steps:
	- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

	- name: Install gdb
	run: sudo apt-get update && sudo apt-get install -y gdb

	- name: Warm up Modal instances
	run: \|
	curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--vllm-inference-vllm-inference.modal.run/docs > vllm_modal_logs.txt &
	curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--sglang-0-4-10-inference-sglang-inference.modal.run/ > sglang_modal_logs.txt &
	# TODO: Re-enable once we can switch to a T4 GPU
	# curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--vllm-gpt-oss-20b-serve.modal.run/ > vllm_gpt_oss_modal_logs.txt &

	- name: Cleanup disk space
	run: ./ci/free-disk-space.sh

	- name: Update Rust
	run: \|
	for attempt in 1 2 3; do
	if rustup update stable && rustup default stable; then
	break
	fi
	if [ $attempt -eq 3 ]; then
	echo "Failed to update Rust after 3 attempts"
	exit 1
	fi
	sleep $((10 * attempt))
	done
	shell: bash

	- name: Configure Namespace-powered Buildx
	uses: namespacelabs/nscloud-setup-buildx-action@91c2e6537780e3b092cb8476406be99a8f91bd5e
	with:
	wait-for-builder: true

	- name: Install Rust toolchain
	run: \|
	for attempt in 1 2 3; do
	if rustup toolchain install stable && rustup default stable; then
	break
	fi
	if [ $attempt -eq 3 ]; then
	echo "Failed to install Rust toolchain after 3 attempts"
	exit 1
	fi
	sleep $((10 * attempt))
	done
	shell: bash
	# Start testing workload identity federation credentials once the SDK adds support: https://github.com/googleapis/google-cloud-rust/issues/1342

	# - uses: 'google-github-actions/auth@v2'
	# with:
	# project_id: 'tensozero-public'
	# workload_identity_provider: 'projects/454541351720/locations/global/workloadIdentityPools/github/providers/tensorzero'

	- name: Print Rust version
	run: rustc --version

	- name: Install uv
	run: curl -LsSf --retry 2 --retry-delay 10 --retry-max-time 60 https://astral.sh/uv/0.6.17/install.sh \| sh

	- name: Download ClickHouse fixtures
	run: uv run ./ui/fixtures/download-fixtures.py

	- name: Install pnpm
	run: \|
	for attempt in 1 2 3; do
	if npm install -g pnpm@latest; then
	break
	fi
	if [ $attempt -eq 3 ]; then
	echo "Failed to install pnpm after 3 attempts"
	exit 1
	fi
	sleep $((10 * attempt))
	done
	shell: bash

	- name: Configure Namespace cache for Python (uv), and pnpm
	uses: namespacelabs/nscloud-cache-action@2f50e7d0f70475e6f59a55ba0f05eec9108e77cc
	with:
	cache: \|
	pnpm
	uv

	- name: Install JS dependencies
	run: pnpm install --frozen-lockfile

	- name: Login to DockerHub
	uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
	with:
	username: ${{ secrets.DOCKERHUB_USERNAME }}
	password: ${{ secrets.DOCKERHUB_TOKEN }}

	- name: Install cargo-nextest
	uses: taiki-e/install-action@d12e869b89167df346dd0ff65da342d1fb1202fb
	with:
	tool: cargo-nextest

	- name: Write GCP JWT key to file
	env:
	GCP_JWT_KEY: ${{ secrets.GCP_JWT_KEY }}
	run: echo "$GCP_JWT_KEY" > $GITHUB_WORKSPACE/gcp_jwt_key.json

	- name: Set up TENSORZERO_CLICKHOUSE_URL for E2E tests
	run: \|
	echo "TENSORZERO_CLICKHOUSE_URL=http://chuser:chpassword@localhost:8123/tensorzero_e2e_tests" >> $GITHUB_ENV
	echo "TENSORZERO_CLICKHOUSE_BATCH_WRITES=${{ matrix.batch_writes }}" >> $GITHUB_ENV

	- name: Configure batch writes in tensorzero.toml
	if: matrix.batch_writes == true
	run: \|
	echo "[gateway.observability.batch_writes]" >> tensorzero-core/tests/e2e/tensorzero.toml
	echo "enabled = true" >> tensorzero-core/tests/e2e/tensorzero.toml
	echo "flush_interval_ms = 80" >> tensorzero-core/tests/e2e/tensorzero.toml
	echo "__force_allow_embedded_batch_writes = true" >> tensorzero-core/tests/e2e/tensorzero.toml

	- name: Download provider-proxy cache
	# When running as a cron job, don't use the provider-proxy cache.
	# The cron job is used to gather information about provider flakiness.
	if: github.event_name != 'schedule'
	run: \|
	AWS_ACCESS_KEY_ID=$R2_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$R2_SECRET_ACCESS_KEY ./ci/download-provider-proxy-cache.sh

	- name: Launch dependency services for E2E tests
	run: \|
	docker compose -f tensorzero-core/tests/e2e/docker-compose.yml up --build -d --wait

	- name: Print ClickHouse container logs
	if: always()
	run: \|
	docker compose -f tensorzero-core/tests/e2e/docker-compose.yml logs -t

	- name: Launch the provider-proxy cache for E2E tests
	run: \|
	./ci/run-provider-proxy.sh ci

	# TODO - get rid of this when the merge queue has a freshly-build gateway image available
	- name: Manually run the latest postgres migrations
	run: cargo run-e2e --run-postgres-migrations

	- name: Launch the gateway for E2E tests
	timeout-minutes: 2
	run: \|
	cargo run-e2e > e2e_logs.txt 2>&1 &
	while ! curl -s -f http://localhost:3000/health >/dev/null 2>&1; do
	echo "Waiting for gateway to be healthy..."
	sleep 1
	done
	echo "GATEWAY_PID=$!" >> $GITHUB_ENV

	# The 'CARGO_NEXTEST_FLAKY_TESTS' variable allows us to mark tests as flaky without merging a PR (if a provider happens to break or goes down)
	# We run the tests without the flaky tests, and require them to pass
	- name: Run all tests (including E2E tests)
	# When running from a cron job (the 'schedule' event), use '--no-fail-fast' so that we get full coverage of flaky providers.
	run: \|
	cargo test-e2e ${{ vars.CARGO_NEXTEST_EXTRA_ARGS }} -E "not (${{ vars.CARGO_NEXTEST_FLAKY_TESTS }})" ${{ github.event_name == 'schedule' && '--no-fail-fast' \|\| '' }}

	# TODO(https://github.com/tensorzero/tensorzero/issues/3989) - move this back to the end of the job
	# For now, we only check for deprecation warnings after running the Rust e2e tests
	- name: Check e2e logs for deprecation warnings (gateway e2e tests only)
	run: \|
	! grep -i "Deprecation Warning" e2e_logs.txt

	# As a separate step, we run just the flaky tests, and allow them to fail.
	# This lets us see if any flaky tests have started succeeding (by looking at the job output),
	# so that we can decide to mark them as non-flaky.
	# - name: Run flaky E2E tests
	# run: \|
	# cargo test-e2e --no-fail-fast ${{ vars.CARGO_NEXTEST_EXTRA_ARGS }} -E "${{ vars.CARGO_NEXTEST_FLAKY_TESTS }}"
	# continue-on-error: true
	- name: Install Python for python async client tests
	run: uv python install 3.9

	- name: "Python: PyO3 Client: pytest"
	working-directory: clients/python
	run: \|
	# Start the test in background and capture its PID
	bash ./test.sh --verbose &
	TEST_PID=$!
	echo "Started test.sh with PID: $TEST_PID"

	# Wait for 5 minutes (300 seconds)
	for i in {1..300}; do
	if ! kill -0 $TEST_PID 2>/dev/null; then
	echo "Test completed normally"
	wait $TEST_PID
	exit $?
	fi
	sleep 1
	done

	echo "Test has been running for 5 minutes, capturing backtraces..."

	# Get all processes related to our test
	echo "=== Process tree ==="
	ps -ef \| grep -E "(test\.sh\|pytest\|python)" \| grep -v grep \|\| true

	echo "=== Capturing backtraces with gdb ==="
	# Find all python processes that might be related to our test
	PYTHON_PIDS=$(pgrep -f "tensorzero.*python" \|\| true)
	if [ -n "$PYTHON_PIDS" ]; then
	for pid in $PYTHON_PIDS; do
	echo "--- Backtrace for Python process $pid ---"
	gdb -p $pid --batch \
	-ex "set pagination off" \
	-ex "thread apply all bt" \
	-ex "info threads" \
	-ex "detach" \
	-ex "quit" 2>&1 \|\| true
	echo ""
	done
	else
	echo "No Python processes found"
	fi
	exit 1

	- name: "Node.js: OpenAI Client: test"
	working-directory: clients/openai-node
	run: \|
	pnpm run test

	- name: Install Go
	uses: actions/setup-go@29694d72cd5e7ef3b09496b39f28a942af47737e
	with:
	go-version: "1.24"

	- name: "Go: OpenAI Client: test"
	working-directory: clients/openai-go/tests
	run: go test -v

	- name: "Python: Recipes: pytest"
	working-directory: recipes
	run: \|
	uv run pytest

	- name: Terminate the gateway and wait for it to exit
	if: always()
	run: \|
	echo "Killing gateway with pid $GATEWAY_PID"
	kill $GATEWAY_PID
	# Wait for at most 30 seconds for the gateway to exit
	for i in {1..30}; do
	if ! kill -0 $GATEWAY_PID 2>/dev/null; then
	echo "Gateway exited"
	break
	fi
	sleep 1
	done
	if kill -0 $GATEWAY_PID 2>/dev/null; then
	echo "Gateway did not exit after 30 seconds!"
	exit 1
	fi

	- name: Print e2e logs
	if: always()
	run: cat e2e_logs.txt

	- name: Print provider-proxy logs
	if: always()
	run: cat provider_proxy_logs.txt

	- name: Print vLLM modal logs
	if: always()
	run: cat vllm_modal_logs.txt

	- name: Print SGLang modal logs
	if: always()
	run: cat sglang_modal_logs.txt

	- name: Print vLLM GPT-OSS modal logs
	if: always()
	continue-on-error: true
	run: cat vllm_gpt_oss_modal_logs.txt

	- name: Upload provider-proxy cache
	# Only upload the cache when we're running from a 'good' run
	# (running from the merge queue via 'workflow_call' from general.yml, or a cron job)
	# This prevents manual workflow runs from modifying the cache
	if: github.event_name == 'workflow_call' \|\| github.event_name == 'schedule'
	run: \|
	AWS_ACCESS_KEY_ID=$R2_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY=$R2_SECRET_ACCESS_KEY ./ci/upload-provider-proxy-cache.sh

	check-production-docker-container:
	permissions:
	contents: read
	runs-on: ubuntu-latest
	if: github.repository == 'tensorzero/tensorzero'
	steps:
	- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

	- name: Build Docker container for production deployment tests
	run: docker build -t tensorzero/gateway -f gateway/Dockerfile .

	- name: Launch ClickHouse container for E2E tests
	run: \|
	docker compose -f tensorzero-core/tests/e2e/docker-compose.yml up clickhouse -d --wait

	- name: Set up .env file for production deployment tests
	run: \|
	echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" > examples/production-deployment/.env
	echo "TENSORZERO_CLICKHOUSE_URL=http://chuser:chpassword@host.docker.internal:8123/tensorzero" >> examples/production-deployment/.env
	echo "TENSORZERO_DISABLE_PSEUDONYMOUS_USAGE_ANALYTICS=1" >> examples/production-deployment/.env

	- name: Run docker compose for production deployment tests
	run: docker compose -f examples/production-deployment/docker-compose.yml up -d --wait

	- name: Run inference for production deployment tests
	run: examples/production-deployment/run.sh

	- name: Print Docker compose logs
	if: always()
	run: \|
	docker compose -f examples/production-deployment/docker-compose.yml logs -t

	- name: Take down docker compose for production deployment tests
	run: \|
	docker compose -f examples/production-deployment/docker-compose.yml down
	docker compose -f tensorzero-core/tests/e2e/docker-compose.yml down

	# Test that the ui e2e tests still pass after we regenerate the model inference cache
	ui-tests-e2e-regen-model-inference-cache:
	permissions:
	contents: read
	actions: write
	if: github.repository == 'tensorzero/tensorzero'
	uses: ./.github/workflows/ui-tests-e2e-model-inference-cache.yml
	with:
	regen_cache: true
	is_merge_group: true
	secrets:
	S3_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
	S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
	FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
	FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
	ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}

	# See 'ci/README.md' at the repository root for more details.
	check-all-tests-passed:
	permissions: {}
	if: always() && github.repository == 'tensorzero/tensorzero'
	needs:
	[
	check-production-docker-container,
	ui-tests-e2e-regen-model-inference-cache,
	live-tests,
	batch-tests,
	]
	runs-on: ubuntu-latest
	steps:
	# When running in the merge queue, jobs should never be skipped.
	# In a scheduled run, some jobs may be intentionally skipped, as we only care about regenerating the model inference cache.
	- if: ${{ contains(needs..result, 'failure') \|\| contains(needs..result, 'cancelled') \|\| (github.event_name == 'workflow_call' && contains(needs.*.result, 'skipped')) }}
	run: exit 1

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Merge Queue Checks for refs/heads/main #152

Workflow file

Merge Queue Checks for refs/heads/main #152

Uh oh!

Workflow file for this run