diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml
index 5cdc2aaf41..74a62009eb 100644
--- a/.github/workflows/build_and_push_docker_image.yml
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -38,10 +38,6 @@ on:
         required: false
         type: string
         default: 'pre-training'
-      version_name:
-        required: false
-        type: string
-        default: ''
       include_test_assets:
         required: false
         type: boolean
@@ -163,27 +159,19 @@ jobs:
         run: |
           SOURCE_IMAGE="gcr.io/${{ vars.PROJECT_NAME }}/${INPUTS_IMAGE_NAME}"
           TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"
+
+          # Add date tag
+          IMAGE_DATE="$(date +%Y-%m-%d)"
+          gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:$IMAGE_DATE" --quiet
 
-          if [[ $INPUTS_VERSION_NAME ]]; then
-            echo "Tagging docker images corresponding to PyPI release..."
-            gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${INPUTS_VERSION_NAME}" --quiet
-          else
-            echo "Tagging docker images corresponding to nightly release..."
-
-            # Add date tag
-            IMAGE_DATE="$(date +%Y-%m-%d)"
-            gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:$IMAGE_DATE" --quiet
-
-            # Convert date to YYYYMMDD format
-            clean_date=$(echo "$IMAGE_DATE" | sed 's/[-:]//g' | cut -c1-8)
+          # Convert date to YYYYMMDD format
+          clean_date=$(echo "$IMAGE_DATE" | sed 's/[-:]//g' | cut -c1-8)
 
-            # Add MaxText tag
-            MAXTEXT_SHA=$(git rev-parse --short HEAD)
-            gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:maxtext_${MAXTEXT_SHA}_${clean_date}" --quiet
-          fi
+          # Add MaxText tag
+          MAXTEXT_SHA=$(git rev-parse --short HEAD)
+          gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:maxtext_${MAXTEXT_SHA}_${clean_date}" --quiet
         env:
           INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
-          INPUTS_VERSION_NAME: ${{ inputs.version_name }}
 
   promote_image:
     needs: [pre_build_check, build_and_push]
diff --git a/.github/workflows/build_and_test_release_candidate.yml b/.github/workflows/build_and_test_release_candidate.yml
new file mode 100644
index 0000000000..f29a8f51ef
--- /dev/null
+++ b/.github/workflows/build_and_test_release_candidate.yml
@@ -0,0 +1,84 @@
+# Copyright 2023-2026 Google LLC
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# https://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This workflow will build MaxText release candidate and trigger CI tests and E2E airflow tests.
+
+name: Build and Test MaxText Release Candidate
+
+# Triggers when a new "release" is published in the GitHub UI
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: read
+  issues: write
+  id-token: write
+
+jobs:
+  #release_approval:
+  #  name: Approve Release
+  #  runs-on: ubuntu-latest
+  #  # "release" environment is configured in MaxText repository settings.
+  #  # This environment requires manual approval before proceeding to the next job.
+  #  environment: release
+  #  steps:
+  #    - name: Acknowledge Approval
+  #      run: echo "Release approved, proceeding to build and test MaxText package."
+
+  build_and_test_maxtext_package:
+    name: Build and Test MaxText Package
+    #needs: [release_approval]
+    uses: ./.github/workflows/build_and_test_maxtext.yml
+    secrets: inherit
+
+  build_release_candidate_images:
+    name: Build ${{ matrix.image_name }}
+    needs: [build_and_test_maxtext_package]
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - device: tpu
+            build_mode: stable
+            image_name: maxtext_jax_stable
+            workflow: pre-training
+            dockerfile: maxtext_tpu_dependencies.Dockerfile
+          - device: gpu
+            build_mode: stable
+            image_name: maxtext_gpu_jax_stable
+            workflow: pre-training
+            dockerfile: maxtext_gpu_dependencies.Dockerfile
+          - device: tpu
+            build_mode: stable
+            image_name: maxtext_post_training_stable
+            workflow: post-training
+            dockerfile: maxtext_tpu_dependencies.Dockerfile
+    uses: ./.github/workflows/build_and_push_docker_image.yml
+    with:
+      image_name: ${{ matrix.image_name }}
+      device: ${{ matrix.device }}
+      build_mode: ${{ matrix.build_mode }}
+      workflow: ${{ matrix.workflow }}
+      dockerfile: ${{ matrix.dockerfile }}
+      maxtext_sha: ${{ github.sha }}
+    secrets: inherit
+
+  run_e2e_tests:
+    name: Run E2E Pre-Training and Post-Training Tests
+    needs: [build_release_candidate_images]
+    uses: ./.github/workflows/run_e2e_tests.yml
+    with:
+      mode: stable
+    secrets: inherit
diff --git 
a/.github/workflows/pypi_release.yml b/.github/workflows/pypi_release.yml
index 0eba10c310..f841983202 100644
--- a/.github/workflows/pypi_release.yml
+++ b/.github/workflows/pypi_release.yml
@@ -12,54 +12,88 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# This workflow will build, test and automatically release MaxText package to PyPI using Trusted Publishing (OIDC).
+# This workflow publishes MaxText to PyPI once the parent Airflow E2E DAG reports success.
+# The parent DAG fans out to all child E2E DAGs, waits for them, then fires a single callback.
 
 name: Publish MaxText to PyPI
 
-# Triggers when a new "release" is published in the GitHub UI
+# Triggered by Airflow via POST /repos/{owner}/{repo}/dispatches when the E2E DAG completes.
+# Airflow passes the result in client_payload: { state, dag_id, dag_run_id, sha, github_run_id }.
 on:
-  release:
-    types: [published]
+  repository_dispatch:
+    types: [airflow-dag-complete]
 
 permissions:
   contents: read
-  issues: write
-  id-token: write
+  actions: read # required to download the wheel artifact from the release build run
+  id-token: write # required for PyPI Trusted Publishing (OIDC)
 
 jobs:
+  handle_result:
+    name: Handle Airflow DAG Result - ${{ github.event.client_payload.dag_id }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Report DAG result
+        # client_payload is caller-controlled: expose it as env vars instead of
+        # expanding it inside the script, so it cannot inject shell code.
+        env:
+          STATE: ${{ github.event.client_payload.state }}
+          DAG_ID: ${{ github.event.client_payload.dag_id }}
+          DAG_RUN_ID: ${{ github.event.client_payload.dag_run_id }}
+          SHA: ${{ github.event.client_payload.sha }}
+        run: |
+          echo "================================"
+          echo "DAG ID:      ${DAG_ID}"
+          echo "DAG Run ID:  ${DAG_RUN_ID}"
+          echo "Commit SHA:  ${SHA}"
+          echo "State:       ${STATE}"
+          echo "================================"
+
+          case "$STATE" in
+            success) echo "E2E tests PASSED." ;;
+            failed|upstream_failed) echo "E2E tests FAILED."; exit 1 ;;
+            *) echo "DAG ended with unexpected state: ${STATE}"; exit 1 ;;
+          esac
+
   release_approval:
     name: Approve Release
     runs-on: ubuntu-latest
+    needs: handle_result
+    if: github.event.client_payload.state == 'success'
     # "release" environment is configured in MaxText repository settings.
     # This environment requires manual approval before proceeding to the next job.
     environment: release
     steps:
     - name: Acknowledge Approval
-      run: echo "Release approved, proceeding to build, test and publish MaxText package."
+      run: echo "Release approved, proceeding to publishing MaxText package."
 
-  build_and_test_maxtext_package:
-    name: Build and Test MaxText Package
-    needs: [release_approval]
-    uses: ./.github/workflows/build_and_test_maxtext.yml
-    secrets: inherit
-
   publish_maxtext_package_to_pypi:
     name: Publish MaxText package to PyPI
-    needs: [build_and_test_maxtext_package]
+    needs: [handle_result, release_approval]
+    if: github.event.client_payload.state == 'success'
     runs-on: ubuntu-latest
     environment: release
     steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-    - name: Download MaxText wheel
-      uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
-      with:
-        name: maxtext-wheel
-        path: dist/
-    - name: Publish MaxText wheel to PyPI
-      # Official action for PyPI Trusted Publishing
-      uses: pypa/gh-action-pypi-publish@release/v1
-      with:
-        packages-dir: dist/
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+      - name: Download MaxText wheel from release build
+        env:
+          GH_TOKEN: ${{ github.token }}
+          ORIGINAL_RUN_ID: ${{ github.event.client_payload.github_run_id }}
+        run: |
+          ARTIFACT_ID=$(gh api "/repos/${{ github.repository }}/actions/runs/${ORIGINAL_RUN_ID}/artifacts" \
+            --jq '.artifacts[] | select(.name == "maxtext-wheel") | .id')
+          if [ -z "${ARTIFACT_ID}" ]; then
+            echo "Error: could not find maxtext-wheel artifact in run ${ORIGINAL_RUN_ID}"
+            exit 1
+          fi
+          mkdir -p dist
+          gh api "/repos/${{ github.repository }}/actions/artifacts/${ARTIFACT_ID}/zip" > dist/maxtext-wheel.zip
+          unzip dist/maxtext-wheel.zip -d dist/
+          rm dist/maxtext-wheel.zip
+      - name: Publish MaxText wheel to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          packages-dir: dist/
 
   get_latest_maxtext_pypi_version:
     name: Get latest MaxText PyPI version
@@ -68,60 +102,42 @@ jobs:
     outputs:
       latest_pypi_version: ${{ steps.get_version.outputs.version }}
     steps:
-    - name: Install jq
-      run: sudo apt-get update && sudo apt-get install -y jq
     - name: Fetch latest version of maxtext from PyPI
       id: get_version
       run: |
-        # Fetch JSON from PyPI for 'maxtext'
-        echo "Fetching latest version from https://pypi.org/pypi/maxtext/json"
         pypi_json=$(curl -s https://pypi.org/pypi/maxtext/json)
-
-        # Extract the version from the "info" section using jq
-        latest_version=$(echo "$pypi_json" | jq -r ".info.version")
-
-        if [ -z "$latest_version" ] || [ "$latest_version" == "null" ]; then
+        latest_version=$(echo "$pypi_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['info']['version'])")
+        if [ -z "$latest_version" ] || [ "$latest_version" = "null" ]; then
           echo "Error: Could not parse latest version from PyPI JSON."
           exit 1
         fi
-
-        echo "Successfully fetched latest MaxText version on PyPI: $latest_version"
-        # Set the output variable for other jobs to consume
+        echo "Latest MaxText version on PyPI: $latest_version"
         echo "version=$latest_version" >> "$GITHUB_OUTPUT"
 
-  # This job builds and pushes MaxText stable Docker images for both TPU and GPU devices.
-  # It runs only after a new release is published to PyPI.
-  # Creates docker image for MaxText commit corresponding to the release.
-  upload_maxtext_docker_images:
-    name: ${{ matrix.image_name }}
-    needs: [get_latest_maxtext_pypi_version]
+  promote_release_images:
+    name: Promote Release Images - ${{ matrix.image_name }}
+    needs: [publish_maxtext_package_to_pypi, get_latest_maxtext_pypi_version]
+    runs-on: linux-x86-n2-16-buildkit
+    container: google/cloud-sdk:524.0.0
     strategy:
       fail-fast: false
       matrix:
-        include:
-          - device: tpu
-            build_mode: stable
-            image_name: maxtext_jax_stable
-            workflow: pre-training
-            dockerfile: maxtext_tpu_dependencies.Dockerfile
-          - device: gpu
-            build_mode: stable
-            image_name: maxtext_gpu_jax_stable
-            workflow: pre-training
-            dockerfile: maxtext_gpu_dependencies.Dockerfile
-          - device: tpu
-            build_mode: stable
-            image_name: maxtext_post_training_stable
-            workflow: post-training
-            dockerfile: maxtext_tpu_dependencies.Dockerfile
-    uses: ./.github/workflows/build_and_push_docker_image.yml
-    with:
-      image_name: ${{ matrix.image_name }}
-      device: ${{ matrix.device }}
-      build_mode: ${{ matrix.build_mode }}
-      workflow: ${{ matrix.workflow }}
-      dockerfile: ${{ matrix.dockerfile }}
-      maxtext_sha: ${{ github.sha }}
-      version_name: ${{ needs.get_latest_maxtext_pypi_version.outputs.latest_pypi_version }}
-    secrets:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        # NOTE(review): maxtext_gpu_jax_stable is built by the release-candidate
+        # workflow but is not promoted here - confirm this omission is intended.
+        image_name:
+          - maxtext_jax_stable
+          - maxtext_post_training_stable
+    steps:
+      - name: Configure Docker
+        run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
+      - name: Add tags to Docker image
+        shell: bash
+        env:
+          ORIGINAL_RUN_ID: ${{ github.event.client_payload.github_run_id }}
+          RELEASE_VERSION: ${{ needs.get_latest_maxtext_pypi_version.outputs.latest_pypi_version }}
+        run: |
+          SOURCE_IMAGE="gcr.io/${{ vars.PROJECT_NAME }}/${{ matrix.image_name }}"
+          gcloud container images add-tag \
+            "${SOURCE_IMAGE}:${ORIGINAL_RUN_ID}" \
+            "${SOURCE_IMAGE}:${RELEASE_VERSION}" \
+            --quiet
diff --git a/.github/workflows/run_e2e_tests.yml b/.github/workflows/run_e2e_tests.yml
new file mode 100644
index 0000000000..ee0535e7dd
--- /dev/null
+++ 
b/.github/workflows/run_e2e_tests.yml
@@ -0,0 +1,74 @@
+name: MaxText E2E Airflow Tests
+
+on:
+  workflow_call:
+    inputs:
+      mode:
+        description: 'Build mode to test: stable or nightly'
+        required: false
+        type: string
+        default: 'stable'
+
+permissions:
+  contents: read
+
+jobs:
+  e2e_airflow_tests:
+    name: E2E Airflow Tests
+    runs-on: linux-x86-n2-16-buildkit
+    container: google/cloud-sdk:524.0.0
+    steps:
+      - name: Get Airflow URI
+        id: info
+        run: |
+          AIRFLOW_URI=$(gcloud composer environments describe ml-automation-solutions \
+            --location us-central1 \
+            --project cloud-ml-auto-solutions \
+            --format "value(config.airflowUri)")
+          echo "airflow_uri=${AIRFLOW_URI}" >> "$GITHUB_OUTPUT"
+
+      - name: Trigger DAG
+        id: trigger
+        # Expose workflow contexts and the callback secret as env vars so the
+        # script never has them expanded directly into shell source.
+        env:
+          AIRFLOW_URI: ${{ steps.info.outputs.airflow_uri }}
+          MAXTEXT_SHA: ${{ github.sha }}
+          RUN_ID: ${{ github.run_id }}
+          REPO: ${{ github.repository }}
+          CALLBACK_TOKEN: ${{ secrets.AIRFLOW_CALLBACK_TOKEN }}
+          MODE: ${{ inputs.mode }}
+        run: |
+          IAP_TOKEN=$(gcloud auth print-access-token)
+          DAG_ID="maxtext_sft"
+
+          RESPONSE=$(curl -s -w "\n%{http_code}" -X POST \
+            "${AIRFLOW_URI}/api/v1/dags/${DAG_ID}/dagRuns" \
+            -H "Authorization: Bearer ${IAP_TOKEN}" \
+            -H "Content-Type: application/json" \
+            -d "{
+              \"conf\": {
+                \"maxtext_sha\": \"${MAXTEXT_SHA}\",
+                \"github_run_id\": \"${RUN_ID}\",
+                \"github_repo\": \"${REPO}\",
+                \"github_callback_token\": \"${CALLBACK_TOKEN}\",
+                \"mode\": \"${MODE}\"
+              }
+            }")
+
+          HTTP_STATUS=$(echo "$RESPONSE" | tail -1)
+          BODY=$(echo "$RESPONSE" | sed '$d')
+          echo "HTTP status: ${HTTP_STATUS}"
+          echo "Response body: ${BODY}"
+
+          if [ "${HTTP_STATUS}" -lt 200 ] || [ "${HTTP_STATUS}" -ge 300 ]; then
+            echo "Error: Airflow API returned HTTP ${HTTP_STATUS}"
+            exit 1
+          fi
+
+          DAG_RUN_ID=$(echo "$BODY" | python3 -c "import sys,json; print(json.load(sys.stdin).get('dag_run_id',''))")
+          if [ -z "${DAG_RUN_ID}" ] || [ "${DAG_RUN_ID}" = "null" ]; then
+            echo "Error: could not parse dag_run_id from response"
+            exit 1
+          fi
+          echo "dag_run_id=${DAG_RUN_ID}" >> "$GITHUB_OUTPUT"