From aa895e8c486246fd922e9d14e32d9ba2e958b817 Mon Sep 17 00:00:00 2001 From: syn Date: Tue, 3 Mar 2026 15:54:45 -0600 Subject: [PATCH] Handle Fly 400 "no capacity" as a capacity error with region fallback Fly returns 400 with {"error":"no capacity"} on createVolume when a region lacks capacity. This was not recognized by isFlyInsufficientResources, causing provision to fail instead of falling back to the next region. - Add 400 to CAPACITY_STATUS_CODES and "no capacity" to CAPACITY_MARKERS - Include region in createVolume error context for easier debugging - Add tests for the new 400 capacity case and non-capacity 400 rejection --- kiloclaw/src/fly/client.test.ts | 33 +++++++++++++++++++++++++++++++++ kiloclaw/src/fly/client.ts | 20 ++++++++++++-------- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/kiloclaw/src/fly/client.test.ts b/kiloclaw/src/fly/client.test.ts index 78143379f..9a81f9d22 100644 --- a/kiloclaw/src/fly/client.test.ts +++ b/kiloclaw/src/fly/client.test.ts @@ -72,6 +72,39 @@ describe('isFlyInsufficientResources', () => { expect(isFlyInsufficientResources(err)).toBe(true); }); + // -- Confirmed production 400 payload: no capacity on createVolume -- + + it('matches production 400 payload: no capacity on createVolume', () => { + const body = '{"error":"no capacity"}'; + const err = new FlyApiError(`Fly API createVolume failed (400): ${body}`, 400, body); + expect(isFlyInsufficientResources(err)).toBe(true); + }); + + // -- Non-capacity 400s: must NOT trigger recovery -- + + it('returns false for non-capacity 400 errors', () => { + const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}); + + const body = '{"error":"invalid machine config"}'; + const err = new FlyApiError(`Fly API failed (400): ${body}`, 400, body); + expect(isFlyInsufficientResources(err)).toBe(false); + + warnSpy.mockRestore(); + }); + + it('returns false and logs warning for unclassified 400', () => { + const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}); + const body = '{"error":"some unknown 400 reason"}'; + const err = new FlyApiError(`Fly API failed (400): ${body}`, 400, body); + + expect(isFlyInsufficientResources(err)).toBe(false); + expect(warnSpy).toHaveBeenCalledWith( + '[fly] Unclassified 400 error (not treated as capacity):', + body + ); + warnSpy.mockRestore(); + }); + // -- Non-capacity 403s: must NOT trigger recovery -- it('returns false for auth 403 errors', () => { diff --git a/kiloclaw/src/fly/client.ts b/kiloclaw/src/fly/client.ts index afd4a79af..c88811a68 100644 --- a/kiloclaw/src/fly/client.ts +++ b/kiloclaw/src/fly/client.ts @@ -193,7 +193,7 @@ export async function createVolume( method: 'POST', body: JSON.stringify(request), }); - await assertOk(resp, 'createVolume'); + await assertOk(resp, `createVolume [${request.region}]`); return resp.json(); } @@ -257,17 +257,19 @@ export function isFlyNotFound(err: unknown): boolean { /** * Status codes that Fly uses for capacity/resource exhaustion errors. + * - 400: "no capacity" on createVolume (observed in production) * - 412: "insufficient resources" when creating a machine with an existing volume * - 409: "insufficient memory" when updating/starting a machine on a full host * - 403: org memory quota exceeded in a region ("over the allowed quota") */ -const CAPACITY_STATUS_CODES = [403, 409, 412]; +const CAPACITY_STATUS_CODES = [400, 403, 409, 412]; /** * Capacity-related markers in Fly error bodies. Matched case-insensitively * against the JSON body fields (error, status) and raw body text. * * Confirmed from production: + * - 400: '{"error":"no capacity"}' * - 412: "insufficient resources to create new machine with existing volume 'vol_xxx'" * - 409: "could not reserve resource for machine: insufficient memory available to fulfill request" * - 403: 'organization "Kilo" is using N MB of memory in {region} which is over the allowed quota' @@ -276,6 +278,7 @@ const CAPACITY_STATUS_CODES = [403, 409, 412]; * capacity error formats from Fly. */ const CAPACITY_MARKERS = [ + 'no capacity', 'insufficient resources', 'insufficient memory', 'over the allowed quota', @@ -285,13 +288,14 @@ const CAPACITY_MARKERS = [ * Check if a Fly API error is a capacity/resource exhaustion issue * (host where a volume/machine lives has no room, or org quota exceeded). * - * Fly uses 412 for volume-pinned capacity issues, 409 for memory - * exhaustion on updateMachine, and 403 for org memory quota exceeded - * in a region. These codes are also used for unrelated errors - * (precondition/version mismatches, conflicts, auth), so we only + * Fly uses 400 for "no capacity" on volume creation, + * 412 for volume-pinned capacity issues, 409 for memory exhaustion + * on updateMachine, and 403 for org memory quota exceeded in a region. + * These codes are also used for unrelated errors (bad request, + * precondition/version mismatches, conflicts, auth), so we only * trigger recovery when the body contains explicit capacity markers. * - * Logs a warning for unclassified 403/409/412s so we can tune matching. + * Logs a warning for unclassified 400/403/409/412s so we can tune matching. */ export function isFlyInsufficientResources(err: unknown): boolean { if (!(err instanceof FlyApiError) || !CAPACITY_STATUS_CODES.includes(err.status)) return false; @@ -317,7 +321,7 @@ export function isFlyInsufficientResources(err: unknown): boolean { // Fall back to raw text matching across message + body if (CAPACITY_MARKERS.some(m => searchText.includes(m))) return true; - // 409/412 but no capacity signal — likely a version/precondition/conflict issue. + // Status matched but no capacity signal — likely a bad-request/auth/conflict/precondition issue. // Log so we can tune matching if Fly introduces new capacity error formats. console.warn(`[fly] Unclassified ${err.status} error (not treated as capacity):`, err.body); return false;