From 784f1d1246b2c41543c30faca1a28ce88626603c Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Sat, 23 May 2026 19:11:06 +0200
Subject: [PATCH 1/2] ci: Alert Slack on failure across CI workflows

---
 .../workflows/_notify_slack_on_failure.yaml   | 111 ++++++++++++++++++
 .github/workflows/manual_release_beta.yaml    |  18 +++
 .github/workflows/manual_release_docs.yaml    |  22 ++++
 .github/workflows/manual_release_stable.yaml  |  21 ++++
 .github/workflows/manual_version_docs.yaml    |  21 ++++
 .github/workflows/on_issue.yaml               |  14 +++
 .github/workflows/on_master.yaml              |  20 ++++
 .github/workflows/on_schedule_tests.yaml      |  74 ++----------
 8 files changed, 235 insertions(+), 66 deletions(-)
 create mode 100644 .github/workflows/_notify_slack_on_failure.yaml

diff --git a/.github/workflows/_notify_slack_on_failure.yaml b/.github/workflows/_notify_slack_on_failure.yaml
new file mode 100644
index 0000000000..eee37de4bf
--- /dev/null
+++ b/.github/workflows/_notify_slack_on_failure.yaml
@@ -0,0 +1,111 @@
+name: Notify Slack on failure
+
+on:
+  # Runs when invoked by another workflow.
+  workflow_call:
+    inputs:
+      heading:
+        description: 'Slack message heading (e.g. ":red_circle: Master CI failed").'
+        required: true
+        type: string
+    secrets:
+      SLACK_WEBHOOK_URL:
+        description: 'Incoming webhook URL for the team alerting channel.'
+        required: true
+
+permissions:
+  contents: read
+  actions: read
+
+jobs:
+  notify:
+    name: Notify Slack
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Build Slack payload
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO: ${{ github.repository }}
+          RUN_ID: ${{ github.run_id }}
+          RUN_ATTEMPT: ${{ github.run_attempt }}
+          WORKFLOW_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          HEADING: ${{ inputs.heading }}
+        run: |
+          set -uo pipefail
+
+          gh_out=$(mktemp)
+          gh_err=$(mktemp)
+          trap 'rm -f "${gh_out}" "${gh_err}"' EXIT
+
+          # Retry the API call to tolerate transient 5xx from GitHub.
+          # `--paginate` handles runs with more than 100 jobs (e.g., large matrix in scheduled e2e tests).
+          max_attempts=5
+          fetched=0
+          for attempt in $(seq 1 "${max_attempts}"); do
+            if gh api --paginate \
+              "repos/${REPO}/actions/runs/${RUN_ID}/attempts/${RUN_ATTEMPT}/jobs?per_page=100" \
+              > "${gh_out}" 2> "${gh_err}"; then
+              fetched=1
+              break
+            fi
+            echo "Attempt ${attempt}/${max_attempts} failed:" >&2
+            cat "${gh_err}" >&2
+            if [[ "${attempt}" -lt "${max_attempts}" ]]; then
+              sleep "$((attempt * 5))"
+            fi
+          done
+
+          if [[ "${fetched}" -eq 0 ]]; then
+            echo "Failed to fetch job list after ${max_attempts} attempts; sending notification without it." >&2
+            failed_jobs="(unable to fetch job list — see workflow run)"
+          else
+            # `failure()` and `cancelled()` upstream can be triggered by any of these conclusions, so include them all.
+            # Escape Slack control chars (& < >) in job names; cap at max_jobs bullets and 2900 chars (Slack limit: 3000 per section text).
+            max_jobs=20
+            if ! failed_jobs=$(jq -s -r \
+              --argjson max "${max_jobs}" \
+              '[.[].jobs[] | select(.conclusion == "failure" or .conclusion == "timed_out" or .conclusion == "cancelled") | "• \(.name | gsub("&"; "&amp;") | gsub("<"; "&lt;") | gsub(">"; "&gt;")) (\(.conclusion))"]
+              | (if length == 0 then
+                  "(no failed/timed_out/cancelled jobs in this run — see workflow run)"
+                elif length > $max then
+                  (.[0:$max] + ["_... and \(length - $max) more_"]) | join("\n")
+                else
+                  join("\n")
+                end) | .[0:2900]' "${gh_out}"); then
+              echo "Failed to parse job list JSON; sending notification without it." >&2
+              failed_jobs="(unable to parse job list — see workflow run)"
+            fi
+          fi
+          jq -n \
+            --arg repo "${REPO}" \
+            --arg url "${WORKFLOW_URL}" \
+            --arg heading "${HEADING}" \
+            --arg failed "${failed_jobs}" \
+            '{
+              text: "\($heading) in \($repo)",
+              blocks: [
+                {
+                  type: "header",
+                  text: { type: "plain_text", text: $heading, emoji: true }
+                },
+                {
+                  type: "section",
+                  fields: [
+                    { type: "mrkdwn", text: "*Repository:*\n\($repo)" },
+                    { type: "mrkdwn", text: "*Workflow run:*\n<\($url)|View on GitHub>" }
+                  ]
+                },
+                {
+                  type: "section",
+                  text: { type: "mrkdwn", text: "*Failed jobs:*\n\($failed)" }
+                }
+              ]
+            }' > slack-payload.json
+
+      - name: Send Slack notification
+        uses: slackapi/slack-github-action@v3.0.2
+        with:
+          webhook: ${{ secrets.SLACK_WEBHOOK_URL }}
+          webhook-type: incoming-webhook
+          payload-file-path: slack-payload.json
diff --git a/.github/workflows/manual_release_beta.yaml b/.github/workflows/manual_release_beta.yaml
index 34430c6552..64542e22ed 100644
--- a/.github/workflows/manual_release_beta.yaml
+++ b/.github/workflows/manual_release_beta.yaml
@@ -91,4 +91,22 @@ jobs:
       pages: write
       id-token: write
     uses: ./.github/workflows/manual_release_docs.yaml
+    with:
+      # This workflow has its own notify_on_failure; suppress the nested one to avoid duplicate alerts.
+      suppress_failure_notification: true
     secrets: inherit
+
+  # Send a Slack notification to the team alerting channel when there is a failure.
+  # Also alerts on cancellation (e.g., env-gated `pypi_publish` rejected by a reviewer or timed out).
+  notify_on_failure:
+    name: Notify Slack on failure
+    needs: [release_prepare, changelog_update, pypi_publish, doc_release_post_publish]
+    if: failure() || cancelled()
+    permissions:
+      contents: read
+      actions: read # Required for `gh api ...actions/runs/.../jobs` inside the reusable workflow.
+    uses: ./.github/workflows/_notify_slack_on_failure.yaml
+    with:
+      heading: ':red_circle: Beta release failed'
+    secrets:
+      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
diff --git a/.github/workflows/manual_release_docs.yaml b/.github/workflows/manual_release_docs.yaml
index d649c5cc3f..0c49423138 100644
--- a/.github/workflows/manual_release_docs.yaml
+++ b/.github/workflows/manual_release_docs.yaml
@@ -6,6 +6,12 @@ on:
 
   # Runs when invoked by another workflow.
   workflow_call:
+    inputs:
+      suppress_failure_notification:
+        description: 'When true, skips the Slack alert on failure (the caller will send its own).'
+        required: false
+        type: boolean
+        default: false
 
 permissions:
   contents: read
@@ -79,3 +85,19 @@ jobs:
           echo "✅ CloudFront cache invalidation workflow triggered successfully"
         env:
           GITHUB_TOKEN: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }}
+
+  # Send a Slack notification to the team alerting channel when there is a failure.
+  # Also alerts on cancellation (e.g., the `github-pages` deployment was rejected by a reviewer).
+  # Skipped when the caller passes `suppress_failure_notification: true` because it sends its own alert (avoids duplicates).
+  notify_on_failure:
+    name: Notify Slack on failure
+    needs: release_docs
+    if: (failure() || cancelled()) && !inputs.suppress_failure_notification
+    permissions:
+      contents: read
+      actions: read # Required for `gh api ...actions/runs/.../jobs` inside the reusable workflow. NOTE(review): jobs that call this workflow presumably must grant `actions: read` too (nested permissions cannot be elevated) — confirm callers do.
+    uses: ./.github/workflows/_notify_slack_on_failure.yaml
+    with:
+      heading: ':red_circle: Docs release failed'
+    secrets:
+      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
diff --git a/.github/workflows/manual_release_stable.yaml b/.github/workflows/manual_release_stable.yaml
index fed5338d93..81816d2804 100644
--- a/.github/workflows/manual_release_stable.yaml
+++ b/.github/workflows/manual_release_stable.yaml
@@ -134,6 +134,8 @@ jobs:
       # Pass the bumped version explicitly — the job's checkout uses the dispatch ref (pre-bump),
       # so `uv version --short` from pyproject.toml would return the old version.
       version_number: ${{ needs.release_prepare.outputs.version_number }}
+      # This workflow has its own notify_on_failure; suppress the nested one to avoid duplicate alerts.
+      suppress_failure_notification: true
     secrets: inherit
 
   doc_release:
@@ -144,4 +146,23 @@ jobs:
       pages: write
       id-token: write
     uses: ./.github/workflows/manual_release_docs.yaml
+    with:
+      # This workflow has its own notify_on_failure; suppress the nested one to avoid duplicate alerts.
+      suppress_failure_notification: true
     secrets: inherit
+
+  # Send a Slack notification to the team alerting channel when any job listed in `needs` fails.
+  # Also alerts on cancellation (e.g., env-gated `pypi_publish` / `github-pages` rejected by a reviewer
+  # or the approval timed out).
+  notify_on_failure:
+    name: Notify Slack on failure
+    needs: [code_checks, release_prepare, changelog_update, github_release, pypi_publish, version_docs, doc_release]
+    if: failure() || cancelled()
+    permissions:
+      contents: read
+      actions: read # Required for `gh api ...actions/runs/.../jobs` inside the reusable workflow.
+    uses: ./.github/workflows/_notify_slack_on_failure.yaml
+    with:
+      heading: ':red_circle: Stable release failed'
+    secrets:
+      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
diff --git a/.github/workflows/manual_version_docs.yaml b/.github/workflows/manual_version_docs.yaml
index 06b28362f6..9bd51dce44 100644
--- a/.github/workflows/manual_version_docs.yaml
+++ b/.github/workflows/manual_version_docs.yaml
@@ -18,6 +18,11 @@ on:
         required: false
         type: string
         default: ""
+      suppress_failure_notification:
+        description: 'When true, skips the Slack alert on failure (the caller will send its own).'
+        required: false
+        type: boolean
+        default: false
 
 concurrency:
   group: version-docs
@@ -119,3 +124,19 @@ jobs:
           add: 'website/versioned_docs website/versioned_sidebars website/versions.json'
           pull: '--rebase --autostash'
           github-token: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }}
+
+  # Send a Slack notification to the team alerting channel when there is a failure.
+  # Also alerts on cancellation (e.g., concurrent run in the `version-docs` group cancels this one).
+  # Skipped when the caller passes `suppress_failure_notification: true` because it sends its own alert (avoids duplicates).
+  notify_on_failure:
+    name: Notify Slack on failure
+    needs: version_docs
+    if: (failure() || cancelled()) && !inputs.suppress_failure_notification
+    permissions:
+      contents: read
+      actions: read # Required for `gh api ...actions/runs/.../jobs` inside the reusable workflow. NOTE(review): jobs that call this workflow presumably must grant `actions: read` too (nested permissions cannot be elevated) — confirm callers do.
+    uses: ./.github/workflows/_notify_slack_on_failure.yaml
+    with:
+      heading: ':red_circle: Version docs failed'
+    secrets:
+      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
diff --git a/.github/workflows/on_issue.yaml b/.github/workflows/on_issue.yaml
index db6a304751..a2d5ed0461 100644
--- a/.github/workflows/on_issue.yaml
+++ b/.github/workflows/on_issue.yaml
@@ -27,3 +27,17 @@ jobs:
               repo: context.repo.repo,
               labels: ["t-tooling"]
             })
+
+  # Send a Slack notification to the team alerting channel when there is a failure.
+  notify_on_failure:
+    name: Notify Slack on failure
+    needs: label_issues
+    if: failure()
+    permissions:
+      contents: read
+      actions: read # Required for `gh api ...actions/runs/.../jobs` inside the reusable workflow.
+    uses: ./.github/workflows/_notify_slack_on_failure.yaml
+    with:
+      heading: ':red_circle: Issue labeling failed'
+    secrets:
+      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
diff --git a/.github/workflows/on_master.yaml b/.github/workflows/on_master.yaml
index 0bd60fa0ec..1cd9b51246 100644
--- a/.github/workflows/on_master.yaml
+++ b/.github/workflows/on_master.yaml
@@ -25,6 +25,9 @@ jobs:
       pages: write
       id-token: write
     uses: ./.github/workflows/manual_release_docs.yaml
+    with:
+      # This workflow has its own notify_on_failure; suppress the nested one to avoid duplicate alerts.
+      suppress_failure_notification: true
     secrets: inherit
 
   code_checks:
@@ -59,3 +62,20 @@ jobs:
         uses: apify/actions/execute-workflow@v1.1.2
         with:
           workflow: manual_release_beta.yaml
+
+  # Send a Slack notification to the team alerting channel when a job in `needs` fails (cancellation alone does not alert).
+  # Note: `beta_release` is a fire-and-forget dispatcher (it only kicks off `manual_release_beta.yaml`
+  # via execute-workflow). Failures inside the dispatched beta run are caught by that workflow's own
+  # `notify_on_failure`, not by this one.
+  notify_on_failure:
+    name: Notify Slack on failure
+    needs: [doc_checks, doc_release, code_checks, tests, beta_release]
+    if: failure()
+    permissions:
+      contents: read
+      actions: read # Required for `gh api ...actions/runs/.../jobs` inside the reusable workflow.
+    uses: ./.github/workflows/_notify_slack_on_failure.yaml
+    with:
+      heading: ':red_circle: Master CI failed'
+    secrets:
+      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
diff --git a/.github/workflows/on_schedule_tests.yaml b/.github/workflows/on_schedule_tests.yaml
index e16333b28b..1a231ec07c 100644
--- a/.github/workflows/on_schedule_tests.yaml
+++ b/.github/workflows/on_schedule_tests.yaml
@@ -71,74 +71,16 @@ jobs:
       APIFY_TEST_USER_API_TOKEN: ${{ secrets.APIFY_TEST_USER_API_TOKEN }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
 
-  # Send a Slack notification to the team alerting channel when scheduled e2e tests fail.
-  # Skipped on workflow_dispatch (manual runs) so that ad-hoc triggers don't spam the channel.
+  # Send a Slack notification to the team alerting channel when there is a failure (manual `workflow_dispatch` runs now alert too).
   notify_on_failure:
     name: Notify Slack on failure
     needs: end_to_end_tests
-    if: failure() && github.event_name == 'schedule'
-    runs-on: ubuntu-latest
+    if: failure()
     permissions:
       contents: read
-      actions: read
-
-    steps:
-      - name: Build Slack payload
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          REPO: ${{ github.repository }}
-          RUN_ID: ${{ github.run_id }}
-          RUN_ATTEMPT: ${{ github.run_attempt }}
-          WORKFLOW_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-          HEADING: ':red_circle: Scheduled e2e tests failed'
-        run: |
-          # Retry the API call to tolerate transient 5xx from GitHub.
-          max_attempts=5
-          fetched=0
-          for attempt in $(seq 1 "${max_attempts}"); do
-            if failed_jobs=$(gh api \
-              "repos/${REPO}/actions/runs/${RUN_ID}/attempts/${RUN_ATTEMPT}/jobs?per_page=100" \
-              --jq '[.jobs[] | select(.conclusion == "failure") | "• \(.name)"] | join("\n")'); then
-              fetched=1
-              break
-            fi
-            if [[ "${attempt}" -lt "${max_attempts}" ]]; then
-              sleep "$((attempt * 5))"
-            fi
-          done
-          if [[ "${fetched}" -eq 0 ]]; then
-            echo "Failed to fetch job list after ${max_attempts} attempts; sending notification without it." >&2
-            failed_jobs="(unable to fetch job list — see workflow run)"
-          fi
-          jq -n \
-            --arg repo "${REPO}" \
-            --arg url "${WORKFLOW_URL}" \
-            --arg heading "${HEADING}" \
-            --arg failed "${failed_jobs}" \
-            '{
-              text: "\($heading) in \($repo)",
-              blocks: [
-                {
-                  type: "header",
-                  text: { type: "plain_text", text: $heading, emoji: true }
-                },
-                {
-                  type: "section",
-                  fields: [
-                    { type: "mrkdwn", text: "*Repository:*\n\($repo)" },
-                    { type: "mrkdwn", text: "*Workflow run:*\n<\($url)|View on GitHub>" }
-                  ]
-                },
-                {
-                  type: "section",
-                  text: { type: "mrkdwn", text: "*Failed jobs:*\n\($failed)" }
-                }
-              ]
-            }' > slack-payload.json
-
-      - name: Send Slack notification
-        uses: slackapi/slack-github-action@v3.0.2
-        with:
-          webhook: ${{ secrets.SLACK_WEBHOOK_URL }}
-          webhook-type: incoming-webhook
-          payload-file-path: slack-payload.json
+      actions: read # Required for `gh api ...actions/runs/.../jobs` inside the reusable workflow.
+    uses: ./.github/workflows/_notify_slack_on_failure.yaml
+    with:
+      heading: ':red_circle: Scheduled e2e tests failed'
+    secrets:
+      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

From 54d98d030fccec5c49fddc6a5508fcf200a802e8 Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Tue, 26 May 2026 09:44:39 +0200
Subject: [PATCH 2/2] ci: Remove the Slack failure notification from the on_issue workflow

---
 .github/workflows/on_issue.yaml | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/.github/workflows/on_issue.yaml b/.github/workflows/on_issue.yaml
index a2d5ed0461..db6a304751 100644
--- a/.github/workflows/on_issue.yaml
+++ b/.github/workflows/on_issue.yaml
@@ -27,17 +27,3 @@ jobs:
               repo: context.repo.repo,
               labels: ["t-tooling"]
             })
-
-  # Send a Slack notification to the team alerting channel when there is a failure.
-  notify_on_failure:
-    name: Notify Slack on failure
-    needs: label_issues
-    if: failure()
-    permissions:
-      contents: read
-      actions: read # Required for `gh api ...actions/runs/.../jobs` inside the reusable workflow.
-    uses: ./.github/workflows/_notify_slack_on_failure.yaml
-    with:
-      heading: ':red_circle: Issue labeling failed'
-    secrets:
-      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}