From b8c45d8ceb2aeb983ab1c4ab06d030d330f6d8a6 Mon Sep 17 00:00:00 2001 From: Vegard Hansen Date: Tue, 26 May 2026 22:29:46 +0200 Subject: [PATCH 1/2] fix(lambda): return partial CreateFleet instances instead of discarding them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CreateFleet returns partial success (some instances created, some errors), processFleetResult previously threw (a ScaleError for recognized scaling errors, another error for unrecognized ones) and discarded the successfully created instance IDs. Those instances would boot with no JIT config in SSM — orphaned until scale-down reaps them. processFleetResult now returns the partial instances whenever at least one was created, whether or not the errors are recognized as scaling errors. The caller (scaleUp) already handles count mismatch by marking unfulfilled messages as batch failures for SQS retry. ScaleError is only thrown when a recognized scaling error occurred and zero instances were created. Also changes ScaleError to carry numberOfRunners (the full requested count) rather than the count of matched error codes, ensuring SQS retries the correct number of messages. 
--- .../control-plane/src/aws/runners.test.ts | 16 ++++++++++++-- .../control-plane/src/aws/runners.ts | 21 +++++++++++++++++-- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index 4243e4b06b..bd385d255d 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -513,7 +513,7 @@ describe('create runner with errors', () => { await expect(createRunner(createRunnerConfig(defaultRunnerConfig))).rejects.toMatchObject({ name: 'ScaleError', - failedInstanceCount: 2, + failedInstanceCount: 1, // numberOfRunners when zero instances created }); expect(mockEC2Client).toHaveReceivedCommandWith( CreateFleetCommand, @@ -543,6 +543,16 @@ describe('create runner with errors', () => { ); }); + it('returns partial instances on recognized scale error instead of throwing', async () => { + createFleetMockWithErrors(['UnfulfillableCapacity'], ['i-partial']); + + await expect(createRunner(createRunnerConfig(defaultRunnerConfig))).resolves.toEqual(['i-partial']); + expect(mockEC2Client).toHaveReceivedCommandWith( + CreateFleetCommand, + expectedCreateFleetRequest(defaultExpectedFleetRequestValues), + ); + }); + it('test error by create fleet call is thrown.', async () => { mockEC2Client.on(CreateFleetCommand).rejects(new Error('Some error')); @@ -688,12 +698,14 @@ describe('create runner with errors fail over to OnDemand', () => { // fallback to on demand for UnfulfillableCapacity but InsufficientInstanceCapacity is thrown createFleetMockWithWithOnDemandFallback(['UnfulfillableCapacity'], instancesIds); + // Partial success: 1 instance created, unrecognized error for the rest. + // Returns partial instances instead of throwing to prevent orphans. 
await expect( createRunner({ ...createRunnerConfig(defaultRunnerConfig), numberOfRunners: 2, }), - ).rejects.toBeInstanceOf(Error); + ).resolves.toEqual(['i-123']); expect(mockEC2Client).toHaveReceivedCommandTimes(CreateFleetCommand, 1); diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index 193c82d2e7..a5dd46e4d7 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -201,9 +201,26 @@ async function processFleetResult( const failedCount = countScaleErrors(errors, scaleErrors); if (failedCount > 0) { - logger.warn('Create fleet failed, ScaleError will be thrown to trigger retry for ephemeral runners.'); + if (instances.length > 0) { + logger.warn( + `Partial fleet success: ${instances.length}/${runnerParameters.numberOfRunners} instances created. ` + + `Returning partial results; caller will retry the shortfall via SQS.`, + { data: fleet.Errors }, + ); + return instances; + } + logger.warn('Create fleet failed with zero instances, ScaleError will be thrown to trigger retry.'); logger.debug('Create fleet failed.', { data: fleet.Errors }); - throw new ScaleError(failedCount); + throw new ScaleError(runnerParameters.numberOfRunners); + } + + if (instances.length > 0) { + logger.warn( + `Partial fleet success: ${instances.length}/${runnerParameters.numberOfRunners} instances created. 
` + + `Error not recognized as scaling error; returning partial results.`, + { data: fleet.Errors }, + ); + return instances; } logger.warn('Create fleet failed, error not recognized as scaling error.', { data: fleet.Errors }); From e8cdc4ecb3576ee60caf8b2a3315e25b0a23463f Mon Sep 17 00:00:00 2001 From: Vegard Hansen Date: Tue, 26 May 2026 22:59:58 +0200 Subject: [PATCH 2/2] =?UTF-8?q?refactor:=20address=20review=20feedback=20?= =?UTF-8?q?=E2=80=94=20mechanism-agnostic=20log,=20stronger=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Reword partial-success log messages to remove SQS reference; add created/requested counts to log metadata - ScaleError test now uses numberOfRunners=3 to validate failedInstanceCount - Partial-success test now requests 3 runners and gets 1 back (truly partial) --- .../control-plane/src/aws/runners.test.ts | 14 +++++++++----- lambdas/functions/control-plane/src/aws/runners.ts | 6 +++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index bd385d255d..82f31295be 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -511,13 +511,15 @@ describe('create runner with errors', () => { it('test ScaleError with multiple error.', async () => { createFleetMockWithErrors(['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'NotMappedError']); - await expect(createRunner(createRunnerConfig(defaultRunnerConfig))).rejects.toMatchObject({ + await expect( + createRunner({ ...createRunnerConfig(defaultRunnerConfig), numberOfRunners: 3 }), + ).rejects.toMatchObject({ name: 'ScaleError', - failedInstanceCount: 1, // numberOfRunners when zero instances created + failedInstanceCount: 3, // numberOfRunners when zero instances created }); expect(mockEC2Client).toHaveReceivedCommandWith( 
CreateFleetCommand, - expectedCreateFleetRequest(defaultExpectedFleetRequestValues), + expectedCreateFleetRequest({ ...defaultExpectedFleetRequestValues, totalTargetCapacity: 3 }), ); expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand); }); @@ -546,10 +548,12 @@ describe('create runner with errors', () => { it('returns partial instances on recognized scale error instead of throwing', async () => { createFleetMockWithErrors(['UnfulfillableCapacity'], ['i-partial']); - await expect(createRunner(createRunnerConfig(defaultRunnerConfig))).resolves.toEqual(['i-partial']); + await expect( + createRunner({ ...createRunnerConfig(defaultRunnerConfig), numberOfRunners: 3 }), + ).resolves.toEqual(['i-partial']); expect(mockEC2Client).toHaveReceivedCommandWith( CreateFleetCommand, - expectedCreateFleetRequest(defaultExpectedFleetRequestValues), + expectedCreateFleetRequest({ ...defaultExpectedFleetRequestValues, totalTargetCapacity: 3 }), ); }); diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index a5dd46e4d7..93b52e0a90 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -204,8 +204,8 @@ async function processFleetResult( if (instances.length > 0) { logger.warn( `Partial fleet success: ${instances.length}/${runnerParameters.numberOfRunners} instances created. ` + - `Returning partial results; caller will retry the shortfall via SQS.`, - { data: fleet.Errors }, + `Returning partial results; unfulfilled requests remain for retry.`, + { data: fleet.Errors, created: instances.length, requested: runnerParameters.numberOfRunners }, ); return instances; } @@ -218,7 +218,7 @@ async function processFleetResult( logger.warn( `Partial fleet success: ${instances.length}/${runnerParameters.numberOfRunners} instances created. 
` + `Error not recognized as scaling error; returning partial results.`, - { data: fleet.Errors }, + { data: fleet.Errors, created: instances.length, requested: runnerParameters.numberOfRunners }, ); return instances; }