diff --git a/lambdas/functions/control-plane/package.json b/lambdas/functions/control-plane/package.json index c17eeec7f9..e348a0e79e 100644 --- a/lambdas/functions/control-plane/package.json +++ b/lambdas/functions/control-plane/package.json @@ -38,6 +38,7 @@ "@middy/core": "^6.4.5", "@octokit/auth-app": "8.1.2", "@octokit/core": "7.0.6", + "@octokit/plugin-retry": "8.0.3", "@octokit/plugin-throttling": "11.0.3", "@octokit/rest": "22.0.1", "cron-parser": "^5.4.0" diff --git a/lambdas/functions/control-plane/src/github/auth.ts b/lambdas/functions/control-plane/src/github/auth.ts index d529cc81e8..8d6d8bfbf4 100644 --- a/lambdas/functions/control-plane/src/github/auth.ts +++ b/lambdas/functions/control-plane/src/github/auth.ts @@ -18,6 +18,7 @@ type StrategyOptions = { }; import { request } from '@octokit/request'; import { Octokit } from '@octokit/rest'; +import { retry } from '@octokit/plugin-retry'; import { throttling } from '@octokit/plugin-throttling'; import { createChildLogger } from '@aws-github-runner/aws-powertools-util'; import { getParameter } from '@aws-github-runner/aws-ssm-util'; @@ -26,7 +27,7 @@ import { EndpointDefaults } from '@octokit/types'; const logger = createChildLogger('gh-auth'); export async function createOctokitClient(token: string, ghesApiUrl = ''): Promise { - const CustomOctokit = Octokit.plugin(throttling); + const CustomOctokit = Octokit.plugin(retry, throttling); const ocktokitOptions: OctokitOptions = { auth: token, }; @@ -38,6 +39,17 @@ export async function createOctokitClient(token: string, ghesApiUrl = ''): Promi return new CustomOctokit({ ...ocktokitOptions, userAgent: process.env.USER_AGENT || 'github-aws-runners', + retry: { + onRetry: (retryCount: number, error: Error, request: { method: string; url: string }) => { + logger.warn('GitHub API request retry attempt', { + retryCount, + method: request.method, + url: request.url, + error: error.message, + status: (error as Error & { status?: number }).status, + }); + }, + }, throttle: { onRateLimit: (retryAfter: number, options: Required) => { logger.warn( diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts index 315bb8ab0b..4730eb404b 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts @@ -336,6 +336,172 @@ describe('scaleUp with GHES', () => { ], }); }); + + it('should create JIT config for all remaining instances even when GitHub API fails for one instance', async () => { + process.env.RUNNERS_MAXIMUM_COUNT = '5'; + mockCreateRunner.mockImplementation(async () => { + return ['i-instance-1', 'i-instance-2', 'i-instance-3']; + }); + mockListRunners.mockImplementation(async () => { + return []; + }); + + mockOctokit.actions.generateRunnerJitconfigForOrg.mockImplementation(({ name }) => { + if (name === 'unit-test-i-instance-2') { + // Simulate a 503 Service Unavailable error from GitHub + const error = new Error('Service Unavailable') as Error & { + status: number; + response: { status: number; data: { message: string } }; + }; + error.status = 503; + error.response = { + status: 503, + data: { message: 'Service temporarily unavailable' }, + }; + throw error; + } + return { + data: { + runner: { id: 9876543210 }, + encoded_jit_config: `TEST_JIT_CONFIG_${name}`, + }, + headers: {}, + }; + }); + + await scaleUpModule.scaleUp(TEST_DATA); + + expect(mockOctokit.actions.generateRunnerJitconfigForOrg).toHaveBeenCalledWith({ + org: TEST_DATA_SINGLE.repositoryOwner, + name: 'unit-test-i-instance-1', + runner_group_id: 1, + labels: ['label1', 'label2'], + }); + + expect(mockOctokit.actions.generateRunnerJitconfigForOrg).toHaveBeenCalledWith({ + org: TEST_DATA_SINGLE.repositoryOwner, + name: 'unit-test-i-instance-2', + runner_group_id: 1, + labels: ['label1', 'label2'], + }); + + expect(mockOctokit.actions.generateRunnerJitconfigForOrg).toHaveBeenCalledWith({ + org: TEST_DATA_SINGLE.repositoryOwner, + name: 'unit-test-i-instance-3', + runner_group_id: 1, + labels: ['label1', 'label2'], + }); + + expect(mockSSMClient).toHaveReceivedCommandWith(PutParameterCommand, { + Name: '/github-action-runners/default/runners/config/i-instance-1', + Value: 'TEST_JIT_CONFIG_unit-test-i-instance-1', + Type: 'SecureString', + Tags: [{ Key: 'InstanceId', Value: 'i-instance-1' }], + }); + + expect(mockSSMClient).toHaveReceivedCommandWith(PutParameterCommand, { + Name: '/github-action-runners/default/runners/config/i-instance-3', + Value: 'TEST_JIT_CONFIG_unit-test-i-instance-3', + Type: 'SecureString', + Tags: [{ Key: 'InstanceId', Value: 'i-instance-3' }], + }); + + expect(mockSSMClient).not.toHaveReceivedCommandWith(PutParameterCommand, { + Name: '/github-action-runners/default/runners/config/i-instance-2', + }); + }); + + it('should handle retryable errors with error handling logic', async () => { + process.env.RUNNERS_MAXIMUM_COUNT = '5'; + mockCreateRunner.mockImplementation(async () => { + return ['i-instance-1', 'i-instance-2']; + }); + mockListRunners.mockImplementation(async () => { + return []; + }); + + mockOctokit.actions.generateRunnerJitconfigForOrg.mockImplementation(({ name }) => { + if (name === 'unit-test-i-instance-1') { + const error = new Error('Internal Server Error') as Error & { + status: number; + response: { status: number; data: { message: string } }; + }; + error.status = 500; + error.response = { + status: 500, + data: { message: 'Internal server error' }, + }; + throw error; + } + return { + data: { + runner: { id: 9876543210 }, + encoded_jit_config: `TEST_JIT_CONFIG_${name}`, + }, + headers: {}, + }; + }); + + await scaleUpModule.scaleUp(TEST_DATA); + + expect(mockSSMClient).toHaveReceivedCommandWith(PutParameterCommand, { + Name: '/github-action-runners/default/runners/config/i-instance-2', + Value: 'TEST_JIT_CONFIG_unit-test-i-instance-2', + Type: 'SecureString', + Tags: [{ Key: 'InstanceId', Value: 'i-instance-2' }], + }); + + expect(mockSSMClient).not.toHaveReceivedCommandWith(PutParameterCommand, { + Name: '/github-action-runners/default/runners/config/i-instance-1', + }); + }); + + it('should handle non-retryable 4xx errors gracefully', async () => { + process.env.RUNNERS_MAXIMUM_COUNT = '5'; + mockCreateRunner.mockImplementation(async () => { + return ['i-instance-1', 'i-instance-2']; + }); + mockListRunners.mockImplementation(async () => { + return []; + }); + + mockOctokit.actions.generateRunnerJitconfigForOrg.mockImplementation(({ name }) => { + if (name === 'unit-test-i-instance-1') { + // 404 is not retryable - will fail immediately + const error = new Error('Not Found') as Error & { + status: number; + response: { status: number; data: { message: string } }; + }; + error.status = 404; + error.response = { + status: 404, + data: { message: 'Resource not found' }, + }; + throw error; + } + return { + data: { + runner: { id: 9876543210 }, + encoded_jit_config: `TEST_JIT_CONFIG_${name}`, + }, + headers: {}, + }; + }); + + await scaleUpModule.scaleUp(TEST_DATA); + + expect(mockSSMClient).toHaveReceivedCommandWith(PutParameterCommand, { + Name: '/github-action-runners/default/runners/config/i-instance-2', + Value: 'TEST_JIT_CONFIG_unit-test-i-instance-2', + Type: 'SecureString', + Tags: [{ Key: 'InstanceId', Value: 'i-instance-2' }], + }); + + expect(mockSSMClient).not.toHaveReceivedCommandWith(PutParameterCommand, { + Name: '/github-action-runners/default/runners/config/i-instance-1', + }); + }); + it.each(RUNNER_TYPES)( 'calls create start runner config of 40' + ' instances (ssm rate limit condition) to test time delay ', async (type: RunnerType) => { diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index e271b901df..83b1e50c5a 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -4,7 +4,7 @@ import { getParameter, putParameter } from '@aws-github-runner/aws-ssm-util'; import yn from 'yn'; import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth'; -import { createRunner, listEC2Runners, tag } from './../aws/runners'; +import { createRunner, listEC2Runners, tag, terminateRunner } from './../aws/runners'; import { RunnerInputParameters } from './../aws/runners.d'; import { metricGitHubAppRateLimit } from '../github/rate-limit'; @@ -222,7 +222,29 @@ export async function createRunners( ...ec2RunnerConfig, }); if (instances.length !== 0) { - await createStartRunnerConfig(githubRunnerConfig, instances, ghClient); + const failedInstances = await createStartRunnerConfig(githubRunnerConfig, instances, ghClient); + + // Terminate instances that failed to get configured to avoid waste + if (failedInstances.length > 0) { + logger.warn('Terminating instances that failed to get configured', { + failedInstances, + failedCount: failedInstances.length, + }); + + for (const instanceId of failedInstances) { + try { + await terminateRunner(instanceId); + } catch (error) { + logger.error('Failed to terminate instance', { + instanceId, + error: error instanceof Error ? error.message : String(error), + }); + } + } + + // Remove failed instances from the returned list + return instances.filter((id) => !failedInstances.includes(id)); + } } return instances; @@ -475,15 +497,21 @@ export function getGitHubEnterpriseApiUrl() { return { ghesApiUrl, ghesBaseUrl }; } +/** + * Creates the start configuration for runner instances by either generating JIT configs + * or registration tokens. + * + * @returns Array of instance IDs that failed to get configured + */ async function createStartRunnerConfig( githubRunnerConfig: CreateGitHubRunnerConfig, instances: string[], ghClient: Octokit, -) { +): Promise { if (githubRunnerConfig.enableJitConfig && githubRunnerConfig.ephemeral) { - await createJitConfig(githubRunnerConfig, instances, ghClient); + return await createJitConfig(githubRunnerConfig, instances, ghClient); } else { - await createRegistrationTokenConfig(githubRunnerConfig, instances, ghClient); + return await createRegistrationTokenConfig(githubRunnerConfig, instances, ghClient); } } @@ -498,11 +526,16 @@ function addDelay(instances: string[]) { return { isDelay, delay }; } +/** + * Creates registration token configuration for non-ephemeral runners. + * + * @returns Empty array (this configuration method does not have failure cases) + */ async function createRegistrationTokenConfig( githubRunnerConfig: CreateGitHubRunnerConfig, instances: string[], ghClient: Octokit, -) { +): Promise { const { isDelay, delay } = addDelay(instances); const token = await getGithubRunnerRegistrationToken(githubRunnerConfig, ghClient); const runnerServiceConfig = generateRunnerServiceConfig(githubRunnerConfig, token); @@ -520,6 +553,8 @@ async function createRegistrationTokenConfig( await delay(25); } } + + return []; } async function tagRunnerId(instanceId: string, runnerId: string): Promise { @@ -530,52 +565,81 @@ async function tagRunnerId(instanceId: string, runnerId: string): Promise } } -async function createJitConfig(githubRunnerConfig: CreateGitHubRunnerConfig, instances: string[], ghClient: Octokit) { +/** + * Creates JIT (Just-In-Time) configuration for ephemeral runners. + * Continues processing remaining instances even if some fail. + * + * @returns Array of instance IDs that failed to get JIT configuration + */ +async function createJitConfig( + githubRunnerConfig: CreateGitHubRunnerConfig, + instances: string[], + ghClient: Octokit, +): Promise { const runnerGroupId = await getRunnerGroupId(githubRunnerConfig, ghClient); const { isDelay, delay } = addDelay(instances); const runnerLabels = githubRunnerConfig.runnerLabels.split(','); + const failedInstances: string[] = []; logger.debug(`Runner group id: ${runnerGroupId}`); logger.debug(`Runner labels: ${runnerLabels}`); for (const instance of instances) { - // generate jit config for runner registration - const ephemeralRunnerConfig: EphemeralRunnerConfig = { - runnerName: `${githubRunnerConfig.runnerNamePrefix}${instance}`, - runnerGroupId: runnerGroupId, - runnerLabels: runnerLabels, - }; - logger.debug(`Runner name: ${ephemeralRunnerConfig.runnerName}`); - const runnerConfig = - githubRunnerConfig.runnerType === 'Org' - ? await ghClient.actions.generateRunnerJitconfigForOrg({ - org: githubRunnerConfig.runnerOwner, - name: ephemeralRunnerConfig.runnerName, - runner_group_id: ephemeralRunnerConfig.runnerGroupId, - labels: ephemeralRunnerConfig.runnerLabels, - }) - : await ghClient.actions.generateRunnerJitconfigForRepo({ - owner: githubRunnerConfig.runnerOwner.split('/')[0], - repo: githubRunnerConfig.runnerOwner.split('/')[1], - name: ephemeralRunnerConfig.runnerName, - runner_group_id: ephemeralRunnerConfig.runnerGroupId, - labels: ephemeralRunnerConfig.runnerLabels, - }); - - metricGitHubAppRateLimit(runnerConfig.headers); - - // tag the EC2 instance with the Github runner id - await tagRunnerId(instance, runnerConfig.data.runner.id.toString()); + try { + // generate jit config for runner registration + const ephemeralRunnerConfig: EphemeralRunnerConfig = { + runnerName: `${githubRunnerConfig.runnerNamePrefix}${instance}`, + runnerGroupId: runnerGroupId, + runnerLabels: runnerLabels, + }; + logger.debug(`Runner name: ${ephemeralRunnerConfig.runnerName}`); + const runnerConfig = + githubRunnerConfig.runnerType === 'Org' + ? await ghClient.actions.generateRunnerJitconfigForOrg({ + org: githubRunnerConfig.runnerOwner, + name: ephemeralRunnerConfig.runnerName, + runner_group_id: ephemeralRunnerConfig.runnerGroupId, + labels: ephemeralRunnerConfig.runnerLabels, + }) + : await ghClient.actions.generateRunnerJitconfigForRepo({ + owner: githubRunnerConfig.runnerOwner.split('/')[0], + repo: githubRunnerConfig.runnerOwner.split('/')[1], + name: ephemeralRunnerConfig.runnerName, + runner_group_id: ephemeralRunnerConfig.runnerGroupId, + labels: ephemeralRunnerConfig.runnerLabels, + }); + + metricGitHubAppRateLimit(runnerConfig.headers); + + // tag the EC2 instance with the Github runner id + await tagRunnerId(instance, runnerConfig.data.runner.id.toString()); + + // store jit config in ssm parameter store + logger.debug('Runner JIT config for ephemeral runner generated.', { + instance: instance, + }); + await putParameter(`${githubRunnerConfig.ssmTokenPath}/${instance}`, runnerConfig.data.encoded_jit_config, true, { + tags: [{ Key: 'InstanceId', Value: instance }], + }); + if (isDelay) { + // Delay to prevent AWS ssm rate limits by being within the max throughput limit + await delay(25); + } + } catch (error) { + failedInstances.push(instance); + logger.warn('Failed to create JIT config for instance, continuing with remaining instances', { + instance: instance, + error: error instanceof Error ? error.message : String(error), + }); + } + } - // store jit config in ssm parameter store - logger.debug('Runner JIT config for ephemeral runner generated.', { - instance: instance, - }); - await putParameter(`${githubRunnerConfig.ssmTokenPath}/${instance}`, runnerConfig.data.encoded_jit_config, true, { - tags: [{ Key: 'InstanceId', Value: instance }], + if (failedInstances.length > 0) { + logger.error('Failed to create JIT config for some instances', { + failedInstances: failedInstances, + totalInstances: instances.length, + successfulInstances: instances.length - failedInstances.length, }); - if (isDelay) { - // Delay to prevent AWS ssm rate limits by being within the max throughput limit - await delay(25); - } } + + return failedInstances; } diff --git a/lambdas/yarn.lock b/lambdas/yarn.lock index af188cd0a8..8c195998a6 100644 --- a/lambdas/yarn.lock +++ b/lambdas/yarn.lock @@ -155,6 +155,7 @@ __metadata: "@middy/core": "npm:^6.4.5" "@octokit/auth-app": "npm:8.1.2" "@octokit/core": "npm:7.0.6" + "@octokit/plugin-retry": "npm:8.0.3" "@octokit/plugin-throttling": "npm:11.0.3" "@octokit/rest": "npm:22.0.1" "@octokit/types": "npm:^16.0.0" @@ -3685,6 +3686,19 @@ __metadata: languageName: node linkType: hard +"@octokit/plugin-retry@npm:8.0.3": + version: 8.0.3 + resolution: "@octokit/plugin-retry@npm:8.0.3" + dependencies: + "@octokit/request-error": "npm:^7.0.2" + "@octokit/types": "npm:^16.0.0" + bottleneck: "npm:^2.15.3" + peerDependencies: + "@octokit/core": ">=7" + checksum: 10c0/24d35d85f750f9e3e52f63b8ddd8fc8aa7bdd946c77b9ea4d6894d026c5c2c69109e8de3880a9970c906f624eb777c7d0c0a2072e6d41dadc7b36cce104b978c + languageName: node + linkType: hard + "@octokit/plugin-throttling@npm:11.0.3": version: 11.0.3 resolution: "@octokit/plugin-throttling@npm:11.0.3"