From 264ee5d96a9dc216bc7052ad3d4b4564a5943ac2 Mon Sep 17 00:00:00 2001 From: Brend Smits Date: Wed, 7 Jan 2026 12:56:04 +0100 Subject: [PATCH 1/4] test: add test to simulate JIT Config creation 503 from GitHub API --- .../src/scale-runners/scale-up.test.ts | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts index 315bb8ab0b..531949b2b9 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts @@ -336,6 +336,80 @@ describe('scaleUp with GHES', () => { ], }); }); + + it('should create JIT config for all remaining instances even when GitHub API fails for one instance', async () => { + process.env.RUNNERS_MAXIMUM_COUNT = '5'; + mockCreateRunner.mockImplementation(async () => { + return ['i-instance-1', 'i-instance-2', 'i-instance-3']; + }); + mockListRunners.mockImplementation(async () => { + return []; + }); + + let callCount = 0; + mockOctokit.actions.generateRunnerJitconfigForOrg.mockImplementation(({ name }) => { + callCount++; + if (name === 'unit-test-i-instance-2') { + // Simulate a 503 Service Unavailable error from GitHub + const error = new Error('Service Unavailable') as any; + error.status = 503; + error.response = { + status: 503, + data: { message: 'Service temporarily unavailable' }, + }; + throw error; + } + return { + data: { + runner: { id: 9876543210 + callCount }, + encoded_jit_config: `TEST_JIT_CONFIG_${name}`, + }, + headers: {}, + }; + }); + + await scaleUpModule.scaleUp(TEST_DATA); + + expect(mockOctokit.actions.generateRunnerJitconfigForOrg).toHaveBeenCalledWith({ + org: TEST_DATA_SINGLE.repositoryOwner, + name: 'unit-test-i-instance-1', + runner_group_id: 1, + labels: ['label1', 'label2'], + }); + + expect(mockOctokit.actions.generateRunnerJitconfigForOrg).toHaveBeenCalledWith({ + org: TEST_DATA_SINGLE.repositoryOwner, + name: 'unit-test-i-instance-2', + runner_group_id: 1, + labels: ['label1', 'label2'], + }); + + expect(mockOctokit.actions.generateRunnerJitconfigForOrg).toHaveBeenCalledWith({ + org: TEST_DATA_SINGLE.repositoryOwner, + name: 'unit-test-i-instance-3', + runner_group_id: 1, + labels: ['label1', 'label2'], + }); + + expect(mockSSMClient).toHaveReceivedCommandWith(PutParameterCommand, { + Name: '/github-action-runners/default/runners/config/i-instance-1', + Value: 'TEST_JIT_CONFIG_unit-test-i-instance-1', + Type: 'SecureString', + Tags: [{ Key: 'InstanceId', Value: 'i-instance-1' }], + }); + + expect(mockSSMClient).toHaveReceivedCommandWith(PutParameterCommand, { + Name: '/github-action-runners/default/runners/config/i-instance-3', + Value: 'TEST_JIT_CONFIG_unit-test-i-instance-3', + Type: 'SecureString', + Tags: [{ Key: 'InstanceId', Value: 'i-instance-3' }], + }); + + expect(mockSSMClient).not.toHaveReceivedCommandWith(PutParameterCommand, { + Name: '/github-action-runners/default/runners/config/i-instance-2', + }); + }); + it.each(RUNNER_TYPES)( 'calls create start runner config of 40' + ' instances (ssm rate limit condition) to test time delay ', async (type: RunnerType) => { From 7e33d64991c408405382c3d98051c8c1b744fd27 Mon Sep 17 00:00:00 2001 From: Brend Smits Date: Wed, 7 Jan 2026 13:09:29 +0100 Subject: [PATCH 2/4] fix: improve error handling in createJitConfig for instance creation This ensures that even if there's a failed jit config creation for one of the instances, it proceeds with the other ones and does not just skip the entire batch. It will report the failed instances at the end. --- .../src/scale-runners/scale-up.ts | 93 +++++++++++-------- 1 file changed, 55 insertions(+), 38 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index e271b901df..91945aee0c 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -534,48 +534,65 @@ async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, ins const runnerGroupId = await getRunnerGroupId(githubRunnerConfig, ghClient); const { isDelay, delay } = addDelay(instances); const runnerLabels = githubRunnerConfig.runnerLabels.split(','); + const failedInstances: string[] = []; logger.debug(`Runner group id: ${runnerGroupId}`); logger.debug(`Runner labels: ${runnerLabels}`); for (const instance of instances) { - // generate jit config for runner registration - const ephemeralRunnerConfig: EphemeralRunnerConfig = { - runnerName: `${githubRunnerConfig.runnerNamePrefix}${instance}`, - runnerGroupId: runnerGroupId, - runnerLabels: runnerLabels, - }; - logger.debug(`Runner name: ${ephemeralRunnerConfig.runnerName}`); - const runnerConfig = - githubRunnerConfig.runnerType === 'Org' - ? await ghClient.actions.generateRunnerJitconfigForOrg({ - org: githubRunnerConfig.runnerOwner, - name: ephemeralRunnerConfig.runnerName, - runner_group_id: ephemeralRunnerConfig.runnerGroupId, - labels: ephemeralRunnerConfig.runnerLabels, - }) - : await ghClient.actions.generateRunnerJitconfigForRepo({ - owner: githubRunnerConfig.runnerOwner.split('/')[0], - repo: githubRunnerConfig.runnerOwner.split('/')[1], - name: ephemeralRunnerConfig.runnerName, - runner_group_id: ephemeralRunnerConfig.runnerGroupId, - labels: ephemeralRunnerConfig.runnerLabels, - }); - - metricGitHubAppRateLimit(runnerConfig.headers); - - // tag the EC2 instance with the Github runner id - await tagRunnerId(instance, runnerConfig.data.runner.id.toString()); - - // store jit config in ssm parameter store - logger.debug('Runner JIT config for ephemeral runner generated.', { - instance: instance, - }); - await putParameter(`${githubRunnerConfig.ssmTokenPath}/${instance}`, runnerConfig.data.encoded_jit_config, true, { - tags: [{ Key: 'InstanceId', Value: instance }], - }); - if (isDelay) { - // Delay to prevent AWS ssm rate limits by being within the max throughput limit - await delay(25); + try { + // generate jit config for runner registration + const ephemeralRunnerConfig: EphemeralRunnerConfig = { + runnerName: `${githubRunnerConfig.runnerNamePrefix}${instance}`, + runnerGroupId: runnerGroupId, + runnerLabels: runnerLabels, + }; + logger.debug(`Runner name: ${ephemeralRunnerConfig.runnerName}`); + const runnerConfig = + githubRunnerConfig.runnerType === 'Org' + ? await ghClient.actions.generateRunnerJitconfigForOrg({ + org: githubRunnerConfig.runnerOwner, + name: ephemeralRunnerConfig.runnerName, + runner_group_id: ephemeralRunnerConfig.runnerGroupId, + labels: ephemeralRunnerConfig.runnerLabels, + }) + : await ghClient.actions.generateRunnerJitconfigForRepo({ + owner: githubRunnerConfig.runnerOwner.split('/')[0], + repo: githubRunnerConfig.runnerOwner.split('/')[1], + name: ephemeralRunnerConfig.runnerName, + runner_group_id: ephemeralRunnerConfig.runnerGroupId, + labels: ephemeralRunnerConfig.runnerLabels, + }); + + metricGitHubAppRateLimit(runnerConfig.headers); + + // tag the EC2 instance with the Github runner id + await tagRunnerId(instance, runnerConfig.data.runner.id.toString()); + + // store jit config in ssm parameter store + logger.debug('Runner JIT config for ephemeral runner generated.', { + instance: instance, + }); + await putParameter(`${githubRunnerConfig.ssmTokenPath}/${instance}`, runnerConfig.data.encoded_jit_config, true, { + tags: [{ Key: 'InstanceId', Value: instance }], + }); + if (isDelay) { + // Delay to prevent AWS ssm rate limits by being within the max throughput limit + await delay(25); + } + } catch (error) { + failedInstances.push(instance); + logger.warn('Failed to create JIT config for instance, continuing with remaining instances', { + instance: instance, + error: error instanceof Error ? error.message : String(error), + }); } } + + if (failedInstances.length > 0) { + logger.error('Failed to create JIT config for some instances', { + failedInstances: failedInstances, + totalInstances: instances.length, + successfulInstances: instances.length - failedInstances.length, + }); + } } From 9f37a0499e757f1fc3e723a81cb62b839d305cc6 Mon Sep 17 00:00:00 2001 From: Brend Smits Date: Wed, 7 Jan 2026 13:18:50 +0100 Subject: [PATCH 3/4] feat: add retry logic to Octokit client and add ScaleUp tests --- lambdas/functions/control-plane/package.json | 1 + .../control-plane/src/github/auth.ts | 14 ++- .../src/scale-runners/scale-up.test.ts | 100 +++++++++++++++++- lambdas/yarn.lock | 14 +++ 4 files changed, 124 insertions(+), 5 deletions(-) diff --git a/lambdas/functions/control-plane/package.json b/lambdas/functions/control-plane/package.json index c17eeec7f9..e348a0e79e 100644 --- a/lambdas/functions/control-plane/package.json +++ b/lambdas/functions/control-plane/package.json @@ -38,6 +38,7 @@ "@middy/core": "^6.4.5", "@octokit/auth-app": "8.1.2", "@octokit/core": "7.0.6", + "@octokit/plugin-retry": "8.0.3", "@octokit/plugin-throttling": "11.0.3", "@octokit/rest": "22.0.1", "cron-parser": "^5.4.0" diff --git a/lambdas/functions/control-plane/src/github/auth.ts b/lambdas/functions/control-plane/src/github/auth.ts index d529cc81e8..8d6d8bfbf4 100644 --- a/lambdas/functions/control-plane/src/github/auth.ts +++ b/lambdas/functions/control-plane/src/github/auth.ts @@ -18,6 +18,7 @@ type StrategyOptions = { }; import { request } from '@octokit/request'; import { Octokit } from '@octokit/rest'; +import { retry } from '@octokit/plugin-retry'; import { throttling } from '@octokit/plugin-throttling'; import { createChildLogger } from '@aws-github-runner/aws-powertools-util'; import { getParameter } from '@aws-github-runner/aws-ssm-util'; @@ -26,7 +27,7 @@ import { EndpointDefaults } from '@octokit/types'; const logger = createChildLogger('gh-auth'); export async function createOctokitClient(token: string, ghesApiUrl = ''): Promise { - const CustomOctokit = Octokit.plugin(throttling); + const CustomOctokit = Octokit.plugin(retry, throttling); const ocktokitOptions: OctokitOptions = { auth: token, }; @@ -38,6 +39,17 @@ export async function createOctokitClient(token: string, ghesApiUrl = ''): Promi return new CustomOctokit({ ...ocktokitOptions, userAgent: process.env.USER_AGENT || 'github-aws-runners', + retry: { + onRetry: (retryCount: number, error: Error, request: { method: string; url: string }) => { + logger.warn('GitHub API request retry attempt', { + retryCount, + method: request.method, + url: request.url, + error: error.message, + status: (error as Error & { status?: number }).status, + }); + }, + }, throttle: { onRateLimit: (retryAfter: number, options: Required) => { logger.warn( diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts index 531949b2b9..4730eb404b 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts @@ -346,12 +346,13 @@ describe('scaleUp with GHES', () => { return []; }); - let callCount = 0; mockOctokit.actions.generateRunnerJitconfigForOrg.mockImplementation(({ name }) => { - callCount++; if (name === 'unit-test-i-instance-2') { // Simulate a 503 Service Unavailable error from GitHub - const error = new Error('Service Unavailable') as any; + const error = new Error('Service Unavailable') as Error & { + status: number; + response: { status: number; data: { message: string } }; + }; error.status = 503; error.response = { status: 503, @@ -361,7 +362,7 @@ describe('scaleUp with GHES', () => { } return { data: { - runner: { id: 9876543210 + callCount }, + runner: { id: 9876543210 }, encoded_jit_config: `TEST_JIT_CONFIG_${name}`, }, headers: {}, @@ -410,6 +411,97 @@ describe('scaleUp with GHES', () => { }); }); + it('should handle retryable errors with error handling logic', async () => { + process.env.RUNNERS_MAXIMUM_COUNT = '5'; + mockCreateRunner.mockImplementation(async () => { + return ['i-instance-1', 'i-instance-2']; + }); + mockListRunners.mockImplementation(async () => { + return []; + }); + + mockOctokit.actions.generateRunnerJitconfigForOrg.mockImplementation(({ name }) => { + if (name === 'unit-test-i-instance-1') { + const error = new Error('Internal Server Error') as Error & { + status: number; + response: { status: number; data: { message: string } }; + }; + error.status = 500; + error.response = { + status: 500, + data: { message: 'Internal server error' }, + }; + throw error; + } + return { + data: { + runner: { id: 9876543210 }, + encoded_jit_config: `TEST_JIT_CONFIG_${name}`, + }, + headers: {}, + }; + }); + + await scaleUpModule.scaleUp(TEST_DATA); + + expect(mockSSMClient).toHaveReceivedCommandWith(PutParameterCommand, { + Name: '/github-action-runners/default/runners/config/i-instance-2', + Value: 'TEST_JIT_CONFIG_unit-test-i-instance-2', + Type: 'SecureString', + Tags: [{ Key: 'InstanceId', Value: 'i-instance-2' }], + }); + + expect(mockSSMClient).not.toHaveReceivedCommandWith(PutParameterCommand, { + Name: '/github-action-runners/default/runners/config/i-instance-1', + }); + }); + + it('should handle non-retryable 4xx errors gracefully', async () => { + process.env.RUNNERS_MAXIMUM_COUNT = '5'; + mockCreateRunner.mockImplementation(async () => { + return ['i-instance-1', 'i-instance-2']; + }); + mockListRunners.mockImplementation(async () => { + return []; + }); + + mockOctokit.actions.generateRunnerJitconfigForOrg.mockImplementation(({ name }) => { + if (name === 'unit-test-i-instance-1') { + // 404 is not retryable - will fail immediately + const error = new Error('Not Found') as Error & { + status: number; + response: { status: number; data: { message: string } }; + }; + error.status = 404; + error.response = { + status: 404, + data: { message: 'Resource not found' }, + }; + throw error; + } + return { + data: { + runner: { id: 9876543210 }, + encoded_jit_config: `TEST_JIT_CONFIG_${name}`, + }, + headers: {}, + }; + }); + + await scaleUpModule.scaleUp(TEST_DATA); + + expect(mockSSMClient).toHaveReceivedCommandWith(PutParameterCommand, { + Name: '/github-action-runners/default/runners/config/i-instance-2', + Value: 'TEST_JIT_CONFIG_unit-test-i-instance-2', + Type: 'SecureString', + Tags: [{ Key: 'InstanceId', Value: 'i-instance-2' }], + }); + + expect(mockSSMClient).not.toHaveReceivedCommandWith(PutParameterCommand, { + Name: '/github-action-runners/default/runners/config/i-instance-1', + }); + }); + it.each(RUNNER_TYPES)( 'calls create start runner config of 40' + ' instances (ssm rate limit condition) to test time delay ', async (type: RunnerType) => { diff --git a/lambdas/yarn.lock b/lambdas/yarn.lock index af188cd0a8..8c195998a6 100644 --- a/lambdas/yarn.lock +++ b/lambdas/yarn.lock @@ -155,6 +155,7 @@ __metadata: "@middy/core": "npm:^6.4.5" "@octokit/auth-app": "npm:8.1.2" "@octokit/core": "npm:7.0.6" + "@octokit/plugin-retry": "npm:8.0.3" "@octokit/plugin-throttling": "npm:11.0.3" "@octokit/rest": "npm:22.0.1" "@octokit/types": "npm:^16.0.0" @@ -3685,6 +3686,19 @@ __metadata: languageName: node linkType: hard +"@octokit/plugin-retry@npm:8.0.3": + version: 8.0.3 + resolution: "@octokit/plugin-retry@npm:8.0.3" + dependencies: + "@octokit/request-error": "npm:^7.0.2" + "@octokit/types": "npm:^16.0.0" + bottleneck: "npm:^2.15.3" + peerDependencies: + "@octokit/core": ">=7" + checksum: 10c0/24d35d85f750f9e3e52f63b8ddd8fc8aa7bdd946c77b9ea4d6894d026c5c2c69109e8de3880a9970c906f624eb777c7d0c0a2072e6d41dadc7b36cce104b978c + languageName: node + linkType: hard + "@octokit/plugin-throttling@npm:11.0.3": version: 11.0.3 resolution: "@octokit/plugin-throttling@npm:11.0.3" From 097ccc6f81ad426378c4eb58d4855cc28748055a Mon Sep 17 00:00:00 2001 From: Brend Smits Date: Thu, 8 Jan 2026 16:54:46 +0100 Subject: [PATCH 4/4] fix: add missing instance termination for failures Instances that failed to start up because of incorrect configuration never got terminated. This is now updated and failed instances get terminated right away. Previously we relied on a scale-down to do this. --- .../src/scale-runners/scale-up.ts | 61 ++++++++++++++++--- 1 file changed, 54 insertions(+), 7 deletions(-) diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 91945aee0c..83b1e50c5a 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -4,7 +4,7 @@ import { getParameter, putParameter } from '@aws-github-runner/aws-ssm-util'; import yn from 'yn'; import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth'; -import { createRunner, listEC2Runners, tag } from './../aws/runners'; +import { createRunner, listEC2Runners, tag, terminateRunner } from './../aws/runners'; import { RunnerInputParameters } from './../aws/runners.d'; import { metricGitHubAppRateLimit } from '../github/rate-limit'; @@ -222,7 +222,29 @@ export async function createRunners( ...ec2RunnerConfig, }); if (instances.length !== 0) { - await createStartRunnerConfig(githubRunnerConfig, instances, ghClient); + const failedInstances = await createStartRunnerConfig(githubRunnerConfig, instances, ghClient); + + // Terminate instances that failed to get configured to avoid waste + if (failedInstances.length > 0) { + logger.warn('Terminating instances that failed to get configured', { + failedInstances, + failedCount: failedInstances.length, + }); + + for (const instanceId of failedInstances) { + try { + await terminateRunner(instanceId); + } catch (error) { + logger.error('Failed to terminate instance', { + instanceId, + error: error instanceof Error ? error.message : String(error), + }); + } + } + + // Remove failed instances from the returned list + return instances.filter((id) => !failedInstances.includes(id)); + } } return instances; @@ -475,15 +497,21 @@ export function getGitHubEnterpriseApiUrl() { return { ghesApiUrl, ghesBaseUrl }; } +/** + * Creates the start configuration for runner instances by either generating JIT configs + * or registration tokens. + * + * @returns Array of instance IDs that failed to get configured + */ async function createStartRunnerConfig( githubRunnerConfig: CreateGitHubRunnerConfig, instances: string[], ghClient: Octokit, -) { +): Promise { if (githubRunnerConfig.enableJitConfig && githubRunnerConfig.ephemeral) { - await createJitConfig(githubRunnerConfig, instances, ghClient); + return await createJitConfig(githubRunnerConfig, instances, ghClient); } else { - await createRegistrationTokenConfig(githubRunnerConfig, instances, ghClient); + return await createRegistrationTokenConfig(githubRunnerConfig, instances, ghClient); } } @@ -498,11 +526,16 @@ function addDelay(instances: string[]) { return { isDelay, delay }; } +/** + * Creates registration token configuration for non-ephemeral runners. + * + * @returns Empty array (this configuration method does not have failure cases) + */ async function createRegistrationTokenConfig( githubRunnerConfig: CreateGitHubRunnerConfig, instances: string[], ghClient: Octokit, -) { +): Promise { const { isDelay, delay } = addDelay(instances); const token = await getGithubRunnerRegistrationToken(githubRunnerConfig, ghClient); const runnerServiceConfig = generateRunnerServiceConfig(githubRunnerConfig, token); @@ -520,6 +553,8 @@ async function createRegistrationTokenConfig( await delay(25); } } + + return []; } async function tagRunnerId(instanceId: string, runnerId: string): Promise { @@ -530,7 +565,17 @@ async function tagRunnerId(instanceId: string, runnerId: string): Promise } } -async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, instances: string[], ghClient: Octokit) { +/** + * Creates JIT (Just-In-Time) configuration for ephemeral runners. + * Continues processing remaining instances even if some fail. + * + * @returns Array of instance IDs that failed to get JIT configuration + */ +async function createJitConfig( + githubRunnerConfig: CreateGitHubRunnerConfig, + instances: string[], + ghClient: Octokit, +): Promise { const runnerGroupId = await getRunnerGroupId(githubRunnerConfig, ghClient); const { isDelay, delay } = addDelay(instances); const runnerLabels = githubRunnerConfig.runnerLabels.split(','); @@ -595,4 +640,6 @@ async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, ins successfulInstances: instances.length - failedInstances.length, }); } + + return failedInstances; }