From 185f1026a68223d8a10d896cbeed067da59eb1db Mon Sep 17 00:00:00 2001 From: Vegard Hansen Date: Tue, 26 May 2026 22:17:40 +0200 Subject: [PATCH 1/2] fix(lambda): return batch failures on unhandled errors instead of dropping messages The scaleUpHandler catch block previously returned empty batchItemFailures for non-ScaleError exceptions, causing SQS to delete all messages in the batch. Jobs were permanently lost with no retry. Return all message IDs as batchItemFailures so SQS retries them. The redrive policy / maxReceiveCount will move persistently-failing messages to the DLQ, preventing infinite retries. Fixes #5024 --- .../control-plane/src/lambda.test.ts | 28 ++++++++++++++++--- lambdas/functions/control-plane/src/lambda.ts | 8 ++++-- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/lambdas/functions/control-plane/src/lambda.test.ts b/lambdas/functions/control-plane/src/lambda.test.ts index 2c9a98e420..5a78e57004 100644 --- a/lambdas/functions/control-plane/src/lambda.test.ts +++ b/lambdas/functions/control-plane/src/lambda.test.ts @@ -97,10 +97,13 @@ describe('Test scale up lambda wrapper.', () => { await expect(scaleUpHandler(sqsEvent, context)).resolves.not.toThrow(); }); - it('Non scale should resolve.', async () => { + it('Non scale error should return message as batch failure for retry.', async () => { const error = new Error('Non scale should resolve.'); vi.mocked(scaleUp).mockRejectedValue(error); - await expect(scaleUpHandler(sqsEvent, context)).resolves.not.toThrow(); + const result = await scaleUpHandler(sqsEvent, context); + expect(result).toEqual({ + batchItemFailures: [{ itemIdentifier: sqsRecord.messageId }], + }); }); it('Scale should create a batch failure message', async () => { @@ -208,14 +211,31 @@ describe('Test scale up lambda wrapper.', () => { await scaleUpHandler(multiRecordEvent, context); }); - it('Should return all failed messages when scaleUp throws non-ScaleError', async () => { + it('Should return all messages as batch failures when scaleUp throws non-ScaleError', async () => { const records = createMultipleRecords(2); const multiRecordEvent: SQSEvent = { Records: records }; vi.mocked(scaleUp).mockRejectedValue(new Error('Generic error')); const result = await scaleUpHandler(multiRecordEvent, context); - expect(result).toEqual({ batchItemFailures: [] }); + expect(result).toEqual({ + batchItemFailures: [{ itemIdentifier: 'message-0' }, { itemIdentifier: 'message-1' }], + }); + }); + + it('Should log error with retry message when scaleUp throws non-ScaleError', async () => { + const records = createMultipleRecords(3); + const multiRecordEvent: SQSEvent = { Records: records }; + const error = new Error('GitHub API timeout'); + + vi.mocked(scaleUp).mockRejectedValue(error); + const logSpy = vi.spyOn(logger, 'error'); + + await scaleUpHandler(multiRecordEvent, context); + expect(logSpy).toHaveBeenCalledWith( + expect.stringContaining('returning batch for retry'), + expect.objectContaining({ error }), + ); }); it('Should throw when scaleUp throws ScaleError', async () => { diff --git a/lambdas/functions/control-plane/src/lambda.ts b/lambdas/functions/control-plane/src/lambda.ts index e2a0451c95..77c05c13be 100644 --- a/lambdas/functions/control-plane/src/lambda.ts +++ b/lambdas/functions/control-plane/src/lambda.ts @@ -55,9 +55,11 @@ export async function scaleUpHandler(event: SQSEvent, context: Context): Promise batchItemFailures.push(...e.toBatchItemFailures(sqsMessages)); logger.warn(`${e.detailedMessage} A retry will be attempted via SQS.`, { error: e }); } else { - logger.error(`Error processing batch (size: ${sqsMessages.length}): ${(e as Error).message}, ignoring batch`, { - error: e, - }); + logger.error( + `Error processing batch (size: ${sqsMessages.length}): ${(e as Error).message}, returning batch for retry`, + { error: e }, + ); + batchItemFailures.push(...sqsMessages.map(({ messageId }) => ({ itemIdentifier: messageId }))); } return { batchItemFailures }; From d949963399318ea61270c81de86e7b0dc35bb82c Mon Sep 17 00:00:00 2001 From: Vegard Hansen Date: Tue, 26 May 2026 22:56:28 +0200 Subject: [PATCH 2/2] test: use order-insensitive assertion for batchItemFailures --- lambdas/functions/control-plane/src/lambda.test.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lambdas/functions/control-plane/src/lambda.test.ts b/lambdas/functions/control-plane/src/lambda.test.ts index 5a78e57004..d4c0f7ad5d 100644 --- a/lambdas/functions/control-plane/src/lambda.test.ts +++ b/lambdas/functions/control-plane/src/lambda.test.ts @@ -218,9 +218,10 @@ describe('Test scale up lambda wrapper.', () => { vi.mocked(scaleUp).mockRejectedValue(new Error('Generic error')); const result = await scaleUpHandler(multiRecordEvent, context); - expect(result).toEqual({ - batchItemFailures: [{ itemIdentifier: 'message-0' }, { itemIdentifier: 'message-1' }], - }); + expect(result.batchItemFailures).toHaveLength(2); + expect(result.batchItemFailures).toEqual( + expect.arrayContaining([{ itemIdentifier: 'message-0' }, { itemIdentifier: 'message-1' }]), + ); }); it('Should log error with retry message when scaleUp throws non-ScaleError', async () => {