Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions kiloclaw/src/fly/client.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,39 @@ describe('isFlyInsufficientResources', () => {
expect(isFlyInsufficientResources(err)).toBe(true);
});

// -- Confirmed production 400 payload: no capacity on createVolume --

it('matches production 400 payload: no capacity on createVolume', () => {
  // Body string confirmed from production for a createVolume 400 response.
  const payload = '{"error":"no capacity"}';
  const capacityError = new FlyApiError(
    `Fly API createVolume failed (400): ${payload}`,
    400,
    payload
  );

  // A 400 carrying the "no capacity" marker must be classified as a capacity error.
  const classified = isFlyInsufficientResources(capacityError);
  expect(classified).toBe(true);
});

// -- Non-capacity 400s: must NOT trigger recovery --

it('returns false for non-capacity 400 errors', () => {
  // The classifier warns on status-matched errors that carry no capacity marker;
  // silence that output so the test run stays clean.
  const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});

  try {
    const body = '{"error":"invalid machine config"}';
    const err = new FlyApiError(`Fly API failed (400): ${body}`, 400, body);

    // A 400 without any capacity marker must not be treated as a capacity error.
    expect(isFlyInsufficientResources(err)).toBe(false);
  } finally {
    // Restore even when the assertion throws, so a failure here does not leave
    // console.warn mocked for subsequent tests.
    warnSpy.mockRestore();
  }
});

it('returns false and logs warning for unclassified 400', () => {
  // Capture console.warn so the emitted diagnostic can be asserted on (and kept
  // out of the test output).
  const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});

  try {
    const body = '{"error":"some unknown 400 reason"}';
    const err = new FlyApiError(`Fly API failed (400): ${body}`, 400, body);

    // No capacity marker in the body: must not be classified as a capacity error...
    expect(isFlyInsufficientResources(err)).toBe(false);
    // ...and the classifier must log the raw body so matching can be tuned later.
    expect(warnSpy).toHaveBeenCalledWith(
      '[fly] Unclassified 400 error (not treated as capacity):',
      body
    );
  } finally {
    // Restore even when an assertion throws, so a failure here does not leave
    // console.warn mocked for subsequent tests.
    warnSpy.mockRestore();
  }
});

// -- Non-capacity 403s: must NOT trigger recovery --

it('returns false for auth 403 errors', () => {
Expand Down
20 changes: 12 additions & 8 deletions kiloclaw/src/fly/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ export async function createVolume(
method: 'POST',
body: JSON.stringify(request),
});
await assertOk(resp, 'createVolume');
await assertOk(resp, `createVolume [${request.region}]`);
Copy link
Contributor Author

@pandemicsyn pandemicsyn Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mostly out of morbid curiosity, but having the region handy here might be helpful later.

return resp.json();
}

Expand Down Expand Up @@ -257,17 +257,19 @@ export function isFlyNotFound(err: unknown): boolean {

/**
* Status codes that Fly uses for capacity/resource exhaustion errors.
* - 400: "no capacity" on createVolume (observed in production)
* - 412: "insufficient resources" when creating a machine with an existing volume
* - 409: "insufficient memory" when updating/starting a machine on a full host
* - 403: org memory quota exceeded in a region ("over the allowed quota")
*/
const CAPACITY_STATUS_CODES = [403, 409, 412];
const CAPACITY_STATUS_CODES = [400, 403, 409, 412];

/**
* Capacity-related markers in Fly error bodies. Matched case-insensitively
* against the JSON body fields (error, status) and raw body text.
*
* Confirmed from production:
* - 400: '{"error":"no capacity"}'
* - 412: "insufficient resources to create new machine with existing volume 'vol_xxx'"
* - 409: "could not reserve resource for machine: insufficient memory available to fulfill request"
* - 403: 'organization "Kilo" is using N MB of memory in {region} which is over the allowed quota'
Expand All @@ -276,6 +278,7 @@ const CAPACITY_STATUS_CODES = [403, 409, 412];
* capacity error formats from Fly.
*/
const CAPACITY_MARKERS = [
'no capacity',
'insufficient resources',
'insufficient memory',
'over the allowed quota',
Expand All @@ -285,13 +288,14 @@ const CAPACITY_MARKERS = [
* Check if a Fly API error is a capacity/resource exhaustion issue
* (host where a volume/machine lives has no room, or org quota exceeded).
*
* Fly uses 412 for volume-pinned capacity issues, 409 for memory
* exhaustion on updateMachine, and 403 for org memory quota exceeded
* in a region. These codes are also used for unrelated errors
* (precondition/version mismatches, conflicts, auth), so we only
* Fly uses 400 for "no capacity" on volume creation,
* 412 for volume-pinned capacity issues, 409 for memory exhaustion
* on updateMachine, and 403 for org memory quota exceeded in a region.
* These codes are also used for unrelated errors (bad request,
* precondition/version mismatches, conflicts, auth), so we only
* trigger recovery when the body contains explicit capacity markers.
*
* Logs a warning for unclassified 403/409/412s so we can tune matching.
* Logs a warning for unclassified 400/403/409/412s so we can tune matching.
*/
export function isFlyInsufficientResources(err: unknown): boolean {
if (!(err instanceof FlyApiError) || !CAPACITY_STATUS_CODES.includes(err.status)) return false;
Expand All @@ -317,7 +321,7 @@ export function isFlyInsufficientResources(err: unknown): boolean {
// Fall back to raw text matching across message + body
if (CAPACITY_MARKERS.some(m => searchText.includes(m))) return true;

// 409/412 but no capacity signal — likely a version/precondition/conflict issue.
// Status matched but no capacity signal — likely a bad-request/auth/conflict/precondition issue.
// Log so we can tune matching if Fly introduces new capacity error formats.
console.warn(`[fly] Unclassified ${err.status} error (not treated as capacity):`, err.body);
return false;
Expand Down