Skip to content

Commit cf14693

Browse files
authored
improvement(memory): replace unbounded server caches with lru-cache to fix heap growth (#4652)
* fix(memory): prune toolSchemaCache and semaphores to prevent heap growth

  toolSchemaCache (lib/copilot/chat/payload.ts): the module-level Map keyed by userId:workspaceId:schemaSurface never deleted expired entries; the TTL was only checked on read. With 100K+ unique user/workspace pairs each holding 50-200KB of tool schemas, this was the primary driver of the 24MB -> 25GB heap growth observed in CloudWatch. Add a setInterval sweep every 30s (matching the TTL) with .unref() so it does not prevent graceful shutdown.

  semaphores (lib/core/async-jobs/backends/database.ts): acquireSlot created Semaphore entries that releaseSlot never deleted. With per-execution UUID keys (e.g. scheduleJobId), each scheduled workflow run would add a permanent entry. Store the concurrency limit on the Semaphore struct and delete the entry from the Map when all slots are free and no waiters remain.

  validatorCache (lib/copilot/tools/server/generated-schema.ts): validated as bounded (93 tools x 2 schema kinds = 186 max entries, ~2-9MB). No fix needed.

  isolated-vm nativeContexts: validated as deferred GC, self-healed by worker rotation at MAX_EXECUTIONS_PER_WORKER=200. externalMB spikes trace to concurrent isolate heaps at peak load (128MB limit x active isolates), not a reference leak. No fix needed.

* fix(memory): prune effectiveEnvCache and instrument cache sizes in telemetry

  effectiveEnvCache (lib/environment/utils.ts): same unbounded accumulation pattern as toolSchemaCache — a module-level Map keyed by userId:workspaceId with a 15s TTL that is only checked on read, never proactively evicted. Adds a periodic sweep matching the TTL interval with .unref().

  cache-registry (lib/monitoring/cache-registry.ts): lightweight registry so modules can expose their cache sizes to telemetry without coupling. toolSchemaCache and effectiveEnvCache both register on module load.

  memory-telemetry: emits cacheSizes in every Memory snapshot log so CloudWatch can confirm the caches stay bounded post-deploy.
* improvement(memory): replace manual TTL Maps with lru-cache for toolSchemaCache and effectiveEnvCache

  Replaces the homegrown Map + setInterval sweep pattern with LRUCache from the lru-cache npm package, which is the standard Node.js solution for bounded in-process caching with TTL.

  Changes per cache:
  - Removes manual ToolSchemaCacheEntry / EffectiveEnvCacheEntry types
  - Removes setInterval sweep timers (and the .unref() boilerplate)
  - Removes the two-phase promise->value entry update inside the IIFE
  - Stores Promise<T> directly — in-flight and resolved states share one type
  - max: 200 (toolSchemaCache) / max: 500 (effectiveEnvCache) as hard ceilings
  - TTL behaviour and concurrent-request deduplication are preserved exactly
  - cache-registry .size reporting works unchanged via lru-cache's .size prop

* fix(memory): remove redundant waiters guard in releaseSlot
1 parent 268fa0e commit cf14693

7 files changed

Lines changed: 48 additions & 49 deletions

File tree

apps/sim/lib/copilot/chat/payload.ts

Lines changed: 11 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,12 @@
11
import { createLogger } from '@sim/logger'
22
import { toError } from '@sim/utils/errors'
3+
import { LRUCache } from 'lru-cache'
34
import { getHighestPrioritySubscription } from '@/lib/billing/core/subscription'
45
import { isPaid } from '@/lib/billing/plan-helpers'
56
import { getToolEntry } from '@/lib/copilot/tool-executor/router'
67
import { getCopilotToolDescription } from '@/lib/copilot/tools/descriptions'
78
import { isHosted } from '@/lib/core/config/feature-flags'
9+
import { registerCache } from '@/lib/monitoring/cache-registry'
810
import { buildMothershipToolsForRequest } from '@/lib/mothership/settings/runtime'
911
import { trackChatUpload } from '@/lib/uploads/contexts/workspace/workspace-file-manager'
1012
import { tools } from '@/tools/registry'
@@ -13,13 +15,12 @@ import { getLatestVersionTools, stripVersionSuffix } from '@/tools/utils'
1315
const logger = createLogger('CopilotChatPayload')
1416
const TOOL_SCHEMA_CACHE_TTL_MS = 30_000
1517

16-
type ToolSchemaCacheEntry = {
17-
expiresAt: number
18-
value?: ToolSchema[]
19-
promise?: Promise<ToolSchema[]>
20-
}
18+
const toolSchemaCache = new LRUCache<string, Promise<ToolSchema[]>>({
19+
max: 200,
20+
ttl: TOOL_SCHEMA_CACHE_TTL_MS,
21+
})
2122

22-
const toolSchemaCache = new Map<string, ToolSchemaCacheEntry>()
23+
registerCache('toolSchemaCache', () => toolSchemaCache.size)
2324

2425
interface BuildPayloadParams {
2526
message: string
@@ -74,13 +75,10 @@ export async function buildIntegrationToolSchemas(
7475
workspaceId?: string
7576
): Promise<ToolSchema[]> {
7677
const cacheKey = `${userId}:${workspaceId ?? ''}:${options.schemaSurface ?? 'copilot'}`
77-
const now = Date.now()
78+
7879
const cached = toolSchemaCache.get(cacheKey)
79-
if (cached?.value && cached.expiresAt > now) {
80-
return cached.value.map((tool) => ({ ...tool, input_schema: { ...tool.input_schema } }))
81-
}
82-
if (cached?.promise) {
83-
const tools = await cached.promise
80+
if (cached) {
81+
const tools = await cached
8482
return tools.map((tool) => ({ ...tool, input_schema: { ...tool.input_schema } }))
8583
}
8684

@@ -187,18 +185,10 @@ export async function buildIntegrationToolSchemas(
187185
)
188186
}
189187

190-
toolSchemaCache.set(cacheKey, {
191-
value: integrationTools,
192-
expiresAt: Date.now() + TOOL_SCHEMA_CACHE_TTL_MS,
193-
})
194-
195188
return integrationTools
196189
})()
197190

198-
toolSchemaCache.set(cacheKey, {
199-
expiresAt: now + TOOL_SCHEMA_CACHE_TTL_MS,
200-
promise,
201-
})
191+
toolSchemaCache.set(cacheKey, promise)
202192

203193
const integrationTools = await promise
204194
return integrationTools.map((tool) => ({ ...tool, input_schema: { ...tool.input_schema } }))

apps/sim/lib/core/async-jobs/backends/database.ts

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ function rowToJob(row: AsyncJobRow): Job {
3838
const inlineAbortControllers = new Map<string, AbortController>()
3939

4040
interface Semaphore {
41+
limit: number
4142
available: number
4243
waiters: Array<() => void>
4344
}
@@ -46,7 +47,7 @@ const semaphores = new Map<string, Semaphore>()
4647
async function acquireSlot(key: string, limit: number): Promise<void> {
4748
let s = semaphores.get(key)
4849
if (!s) {
49-
s = { available: limit, waiters: [] }
50+
s = { limit, available: limit, waiters: [] }
5051
semaphores.set(key, s)
5152
}
5253
if (s.available > 0) {
@@ -65,6 +66,9 @@ function releaseSlot(key: string): void {
6566
return
6667
}
6768
s.available += 1
69+
if (s.available === s.limit) {
70+
semaphores.delete(key)
71+
}
6872
}
6973

7074
export class DatabaseJobQueue implements JobQueueBackend {

apps/sim/lib/environment/utils.ts

Lines changed: 15 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -4,24 +4,25 @@ import { createLogger } from '@sim/logger'
44
import { getErrorMessage } from '@sim/utils/errors'
55
import { generateId } from '@sim/utils/id'
66
import { eq, inArray } from 'drizzle-orm'
7+
import { LRUCache } from 'lru-cache'
78
import { decryptSecret, encryptSecret } from '@/lib/core/security/encryption'
89
import {
910
createWorkspaceEnvCredentials,
1011
getAccessibleEnvCredentials,
1112
syncPersonalEnvCredentialsForUser,
1213
} from '@/lib/credentials/environment'
14+
import { registerCache } from '@/lib/monitoring/cache-registry'
1315
import { checkWorkspaceAccess } from '@/lib/workspaces/permissions/utils'
1416

1517
const logger = createLogger('EnvironmentUtils')
1618
const EFFECTIVE_ENV_CACHE_TTL_MS = 15_000
1719

18-
type EffectiveEnvCacheEntry = {
19-
expiresAt: number
20-
value?: Record<string, string>
21-
promise?: Promise<Record<string, string>>
22-
}
20+
const effectiveEnvCache = new LRUCache<string, Promise<Record<string, string>>>({
21+
max: 500,
22+
ttl: EFFECTIVE_ENV_CACHE_TTL_MS,
23+
})
2324

24-
const effectiveEnvCache = new Map<string, EffectiveEnvCacheEntry>()
25+
registerCache('effectiveEnvCache', () => effectiveEnvCache.size)
2526

2627
function getEffectiveEnvCacheKey(userId: string, workspaceId?: string) {
2728
return `${userId}:${workspaceId ?? ''}`
@@ -325,37 +326,24 @@ export async function getEffectiveDecryptedEnv(
325326
workspaceId?: string
326327
): Promise<Record<string, string>> {
327328
const cacheKey = getEffectiveEnvCacheKey(userId, workspaceId)
328-
const now = Date.now()
329-
const cached = effectiveEnvCache.get(cacheKey)
330-
331-
if (cached?.value && cached.expiresAt > now) {
332-
return { ...cached.value }
333-
}
334329

335-
if (cached?.promise) {
336-
const value = await cached.promise
330+
const cached = effectiveEnvCache.get(cacheKey)
331+
if (cached) {
332+
const value = await cached
337333
return { ...value }
338334
}
339335

340336
const promise = getPersonalAndWorkspaceEnv(userId, workspaceId)
341-
.then(({ personalDecrypted, workspaceDecrypted }) => {
342-
const value = { ...personalDecrypted, ...workspaceDecrypted }
343-
effectiveEnvCache.set(cacheKey, {
344-
value,
345-
expiresAt: Date.now() + EFFECTIVE_ENV_CACHE_TTL_MS,
346-
})
347-
return value
348-
})
337+
.then(({ personalDecrypted, workspaceDecrypted }) => ({
338+
...personalDecrypted,
339+
...workspaceDecrypted,
340+
}))
349341
.catch((error) => {
350342
effectiveEnvCache.delete(cacheKey)
351343
throw error
352344
})
353345

354-
effectiveEnvCache.set(cacheKey, {
355-
expiresAt: now + EFFECTIVE_ENV_CACHE_TTL_MS,
356-
promise,
357-
})
358-
346+
effectiveEnvCache.set(cacheKey, promise)
359347
const value = await promise
360348
return { ...value }
361349
}

apps/sim/lib/monitoring/cache-registry.ts

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,13 @@
1+
const registry = new Map<string, () => number>()
2+
3+
export function registerCache(name: string, getSize: () => number): void {
4+
registry.set(name, getSize)
5+
}
6+
7+
export function getCacheSizes(): Record<string, number> {
8+
const sizes: Record<string, number> = {}
9+
for (const [name, getSize] of registry) {
10+
sizes[name] = getSize()
11+
}
12+
return sizes
13+
}

apps/sim/lib/monitoring/memory-telemetry.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55

66
import v8 from 'node:v8'
77
import { createLogger } from '@sim/logger'
8+
import { getCacheSizes } from '@/lib/monitoring/cache-registry'
89

910
const logger = createLogger('MemoryTelemetry', { logLevel: 'INFO' })
1011

@@ -33,6 +34,7 @@ export function startMemoryTelemetry(intervalMs = 60_000) {
3334
? process.getActiveResourcesInfo().length
3435
: -1,
3536
uptimeMin: Math.round(process.uptime() / 60),
37+
cacheSizes: getCacheSizes(),
3638
})
3739
}, intervalMs)
3840
timer.unref()

apps/sim/package.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -148,6 +148,7 @@
148148
"json5": "2.2.3",
149149
"jszip": "3.10.1",
150150
"jwt-decode": "^4.0.0",
151+
"lru-cache": "11.3.6",
151152
"lucide-react": "^0.479.0",
152153
"mammoth": "^1.9.0",
153154
"mermaid": "11.15.0",

bun.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)