From a772fc2cd81fb520baf0ec43df7a9df6291daedd Mon Sep 17 00:00:00 2001 From: edersonbrilhante Date: Thu, 12 Feb 2026 15:43:55 +0100 Subject: [PATCH] fix: retry SQS messages when GitHub API fails after EC2 instance creation --- .../control-plane/src/lambda.test.ts | 39 ++++++++- .../src/scale-runners/ScaleError.test.ts | 86 ++++++++++++++++++- .../src/scale-runners/ScaleError.ts | 31 +++++++ .../src/scale-runners/scale-up.test.ts | 64 ++++++++++++++ .../src/scale-runners/scale-up.ts | 22 ++++- 5 files changed, 236 insertions(+), 6 deletions(-) diff --git a/lambdas/functions/control-plane/src/lambda.test.ts b/lambdas/functions/control-plane/src/lambda.test.ts index 2c9a98e420..146f86b6b2 100644 --- a/lambdas/functions/control-plane/src/lambda.test.ts +++ b/lambdas/functions/control-plane/src/lambda.test.ts @@ -3,7 +3,7 @@ import { Context, SQSEvent, SQSRecord } from 'aws-lambda'; import { addMiddleware, adjustPool, scaleDownHandler, scaleUpHandler, ssmHousekeeper, jobRetryCheck } from './lambda'; import { adjust } from './pool/pool'; -import ScaleError from './scale-runners/ScaleError'; +import ScaleError, { GHHttpError } from './scale-runners/ScaleError'; import { scaleDown } from './scale-runners/scale-down'; import { ActionRequestMessage, scaleUp } from './scale-runners/scale-up'; import { cleanSSMTokens } from './scale-runners/ssm-housekeeper'; @@ -229,6 +229,43 @@ describe('Test scale up lambda wrapper.', () => { batchItemFailures: [{ itemIdentifier: 'message-0' }, { itemIdentifier: 'message-1' }], }); }); + + it('Should return all messages as batch item failures when scaleUp throws GHHttpError', async () => { + const records = createMultipleRecords(3); + const multiRecordEvent: SQSEvent = { Records: records }; + + const error = new GHHttpError('Validation Failed', 422); + vi.mocked(scaleUp).mockRejectedValue(error); + + const result = await scaleUpHandler(multiRecordEvent, context); + expect(result).toEqual({ + batchItemFailures: [ + { itemIdentifier: 'message-0' }, + { itemIdentifier: 'message-1' }, + { itemIdentifier: 'message-2' }, + ], + }); + }); + + it('Should return single message as batch item failure when scaleUp throws GHHttpError', async () => { + const error = new GHHttpError('Bad credentials', 401); + vi.mocked(scaleUp).mockRejectedValue(error); + + const result = await scaleUpHandler(sqsEvent, context); + expect(result).toEqual({ + batchItemFailures: [{ itemIdentifier: sqsRecord.messageId }], + }); + }); + + it('Should not return batch item failures for generic errors (not ScaleError or GHHttpError)', async () => { + const records = createMultipleRecords(2); + const multiRecordEvent: SQSEvent = { Records: records }; + + vi.mocked(scaleUp).mockRejectedValue(new Error('Some unexpected error')); + + const result = await scaleUpHandler(multiRecordEvent, context); + expect(result).toEqual({ batchItemFailures: [] }); + }); }); }); diff --git a/lambdas/functions/control-plane/src/scale-runners/ScaleError.test.ts b/lambdas/functions/control-plane/src/scale-runners/ScaleError.test.ts index 0a7478c12f..72fa66a134 100644 --- a/lambdas/functions/control-plane/src/scale-runners/ScaleError.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/ScaleError.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; import type { ActionRequestMessageSQS } from './scale-up'; -import ScaleError from './ScaleError'; +import ScaleError, { GHHttpError } from './ScaleError'; describe('ScaleError', () => { describe('detailedMessage', () => { @@ -74,3 +74,87 @@ describe('ScaleError', () => { }); }); }); + +describe('GHHttpError', () => { + describe('constructor', () => { + it('should set name, message, and status', () => { + const error = new GHHttpError('Validation Failed', 422); + + expect(error.name).toBe('GHHttpError'); + expect(error.message).toBe('Validation Failed'); + expect(error.status).toBe(422); + }); + + it('should be an instance of ScaleError', () => { + const error = new GHHttpError('Not Found', 404); + + expect(error).toBeInstanceOf(ScaleError); + expect(error).toBeInstanceOf(GHHttpError); + expect(error).toBeInstanceOf(Error); + }); + }); + + describe('detailedMessage', () => { + it.each([ + { message: 'Validation Failed', status: 422, expected: 'GitHub API HTTP error (status 422): Validation Failed' }, + { message: 'Bad credentials', status: 401, expected: 'GitHub API HTTP error (status 401): Bad credentials' }, + { message: 'Not Found', status: 404, expected: 'GitHub API HTTP error (status 404): Not Found' }, + { + message: 'Internal Server Error', + status: 500, + expected: 'GitHub API HTTP error (status 500): Internal Server Error', + }, + ])('should format message for status $status', ({ message, status, expected }) => { + const error = new GHHttpError(message, status); + + expect(error.detailedMessage).toBe(expected); + }); + }); + + describe('toBatchItemFailures', () => { + const mockMessages: ActionRequestMessageSQS[] = [ + { messageId: 'msg-1', id: 1, eventType: 'workflow_job' }, + { messageId: 'msg-2', id: 2, eventType: 'workflow_job' }, + { messageId: 'msg-3', id: 3, eventType: 'workflow_job' }, + ]; + + it('should retry ALL messages regardless of status', () => { + const error = new GHHttpError('Validation Failed', 422); + const failures = error.toBatchItemFailures(mockMessages); + + expect(failures).toEqual([{ itemIdentifier: 'msg-1' }, { itemIdentifier: 'msg-2' }, { itemIdentifier: 'msg-3' }]); + }); + + it('should retry the single message when only one is provided', () => { + const error = new GHHttpError('Bad credentials', 401); + const failures = error.toBatchItemFailures([mockMessages[0]]); + + expect(failures).toEqual([{ itemIdentifier: 'msg-1' }]); + }); + + it('should return empty array for empty messages', () => { + const error = new GHHttpError('Server Error', 500); + const failures = error.toBatchItemFailures([]); + + expect(failures).toEqual([]); + }); + + it('should retry all messages unlike ScaleError which retries only failedInstanceCount', () => { + const messages: ActionRequestMessageSQS[] = [ + { messageId: 'msg-1', id: 1, eventType: 'workflow_job' }, + { messageId: 'msg-2', id: 2, eventType: 'workflow_job' }, + { messageId: 'msg-3', id: 3, eventType: 'workflow_job' }, + { messageId: 'msg-4', id: 4, eventType: 'workflow_job' }, + { messageId: 'msg-5', id: 5, eventType: 'workflow_job' }, + ]; + + // ScaleError with failedInstanceCount=1 retries only 1 message + const scaleError = new ScaleError(1); + expect(scaleError.toBatchItemFailures(messages)).toHaveLength(1); + + // GHHttpError retries ALL messages + const ghError = new GHHttpError('error', 422); + expect(ghError.toBatchItemFailures(messages)).toHaveLength(5); + }); + }); +}); diff --git a/lambdas/functions/control-plane/src/scale-runners/ScaleError.ts b/lambdas/functions/control-plane/src/scale-runners/ScaleError.ts index 9c1f474d17..8975a16b1f 100644 --- a/lambdas/functions/control-plane/src/scale-runners/ScaleError.ts +++ b/lambdas/functions/control-plane/src/scale-runners/ScaleError.ts @@ -26,4 +26,35 @@ class ScaleError extends Error { } } +/** + * Custom error for GitHub HTTP API failures during runner config creation. + * Extends ScaleError so it is caught by the same handler in the Lambda entry point. + * + * Unlike a plain ScaleError (which retries only `failedInstanceCount` messages), + * a GHHttpError retries ALL messages because the GitHub API failure may affect + * every instance that was just launched. + */ +export class GHHttpError extends ScaleError { + public readonly status: number; + + constructor(message: string, status: number) { + super(); + this.message = message; + this.name = 'GHHttpError'; + this.status = status; + } + + public override get detailedMessage(): string { + return `GitHub API HTTP error (status ${this.status}): ${this.message}`; + } + + /** + * Override: retry ALL messages because the GitHub API error affects the + * entire batch of instances that were already created. + */ + public override toBatchItemFailures(messages: ActionRequestMessageSQS[]): SQSBatchItemFailure[] { + return messages.map(({ messageId }) => ({ itemIdentifier: messageId })); + } +} + export default ScaleError; diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts index 8a10b82ca4..9c1a0b9ce8 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts @@ -9,6 +9,7 @@ import * as ghAuth from '../github/auth'; import { createRunner, listEC2Runners } from './../aws/runners'; import { RunnerInputParameters } from './../aws/runners.d'; import * as scaleUpModule from './scale-up'; +import { GHHttpError } from './ScaleError'; import { getParameter } from '@aws-github-runner/aws-ssm-util'; import { publishRetryMessage } from './job-retry'; import { describe, it, expect, beforeEach, vi } from 'vitest'; @@ -1852,6 +1853,69 @@ describe('Retry mechanism tests', () => { }); }); +describe('GHHttpError', () => { + it('should have correct name, message, and status properties', () => { + const error = new GHHttpError('test error', 422); + expect(error).toBeInstanceOf(Error); + expect(error).toBeInstanceOf(GHHttpError); + expect(error.name).toBe('GHHttpError'); + expect(error.message).toBe('test error'); + expect(error.status).toBe(422); + }); + + describe('createRunners throws GHHttpError on GitHub API HTTP failure', () => { + beforeEach(() => { + process.env.GHES_URL = 'https://github.enterprise.something'; + process.env.ENABLE_ORGANIZATION_RUNNERS = 'true'; + process.env.ENABLE_EPHEMERAL_RUNNERS = 'true'; + process.env.ENABLE_JIT_CONFIG = 'true'; + process.env.RUNNER_NAME_PREFIX = 'unit-test-'; + process.env.RUNNER_GROUP_NAME = 'Default'; + process.env.SSM_CONFIG_PATH = '/github-action-runners/default/runners/config'; + process.env.SSM_TOKEN_PATH = '/github-action-runners/default/runners/config'; + process.env.RUNNER_LABELS = 'label1,label2'; + process.env.RUNNERS_MAXIMUM_COUNT = '3'; + }); + + it('wraps JIT config GitHub HTTP error as GHHttpError', async () => { + const httpError = new Error('Validation Failed') as Error & { status: number }; + httpError.name = 'HttpError'; + httpError.status = 422; + mockOctokit.actions.generateRunnerJitconfigForOrg.mockRejectedValue(httpError); + + await expect(scaleUpModule.scaleUp(TEST_DATA)).rejects.toThrow(GHHttpError); + await expect(scaleUpModule.scaleUp(TEST_DATA)).rejects.toMatchObject({ + status: 422, + message: expect.stringContaining('Validation Failed'), + }); + }); + + it('wraps registration token GitHub HTTP error as GHHttpError', async () => { + process.env.ENABLE_EPHEMERAL_RUNNERS = 'false'; + process.env.ENABLE_JIT_CONFIG = 'false'; + + const httpError = new Error('Bad credentials') as Error & { status: number }; + httpError.name = 'HttpError'; + httpError.status = 401; + mockOctokit.actions.createRegistrationTokenForOrg.mockRejectedValue(httpError); + + await expect(scaleUpModule.scaleUp(TEST_DATA)).rejects.toThrow(GHHttpError); + await expect(scaleUpModule.scaleUp(TEST_DATA)).rejects.toMatchObject({ + status: 401, + message: expect.stringContaining('Bad credentials'), + }); + }); + + it('does not wrap non-HTTP errors as GHHttpError', async () => { + const genericError = new Error('Some SSM error'); + mockOctokit.actions.generateRunnerJitconfigForOrg.mockRejectedValue(genericError); + + await expect(scaleUpModule.scaleUp(TEST_DATA)).rejects.toThrow('Some SSM error'); + await expect(scaleUpModule.scaleUp(TEST_DATA)).rejects.not.toBeInstanceOf(GHHttpError); + }); + }); +}); + function defaultOctokitMockImpl() { mockOctokit.actions.getJobForWorkflowRun.mockImplementation(() => ({ data: { diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 2a4c2c1c58..5a1c09d468 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -8,6 +8,7 @@ import { createRunner, listEC2Runners, tag } from './../aws/runners'; import { RunnerInputParameters } from './../aws/runners.d'; import { metricGitHubAppRateLimit } from '../github/rate-limit'; import { publishRetryMessage } from './job-retry'; +import { GHHttpError } from './ScaleError'; const logger = createChildLogger('scale-up'); @@ -538,10 +539,23 @@ async function createStartRunnerConfig( instances: string[], ghClient: Octokit, ) { - if (githubRunnerConfig.enableJitConfig && githubRunnerConfig.ephemeral) { - await createJitConfig(githubRunnerConfig, instances, ghClient); - } else { - await createRegistrationTokenConfig(githubRunnerConfig, instances, ghClient); + try { + if (githubRunnerConfig.enableJitConfig && githubRunnerConfig.ephemeral) { + await createJitConfig(githubRunnerConfig, instances, ghClient); + } else { + await createRegistrationTokenConfig(githubRunnerConfig, instances, ghClient); + } + } catch (error) { + if (error instanceof Error && error.name === 'HttpError') { + const status = (error as Error & { status: number }).status; + logger.error('GitHub API HTTP error during runner config creation', { + status, + message: error.message, + instances, + }); + throw new GHHttpError(`Failed to create runner start config: ${error.message}`, status); + } + throw error; } }