Skip to content

Commit 14d7efc

Browse files
committed
feat(satellite): add cgroup v2 auto-detection and fix stderr log delivery
Cgroup v2 auto-detection: - Add cgroup-detector.ts that reads /proc/self/cgroup at startup, verifies write access to the delegated subtree, moves the satellite process into a child cgroup to satisfy the "no internal process" constraint, and enables memory + pids controllers via cgroup.subtree_control - ProcessSpawner detects cgroup availability once at startup and conditionally adds --use_cgroupv2, --cgroupv2_mount, --cgroup_mem_max, --cgroup_pids_max to nsjail for both MCP server and build command spawns - --disable_clone_newcgroup is now omitted when cgroup is available (nsjail needs the cgroup namespace for per-process isolation) - NSJAIL_MEMORY_LIMIT_MB default changed from 2048 to "inf" — Node.js v24 WASM (undici HTTP parser) reserves ~10GB virtual address space; cgroup enforces the 512MB physical RAM cap when Delegate=yes is configured Fix nsjail INFO lines consuming stderr rate limit: - Move nsjail INFO pre-filter before rateLimiter.shouldAcceptLog() so the ~15 startup INFO mount lines do not burn rate limit slots before the actual MCP server output arrives Fix missing event type in isValidEventType(): - Add mcp.server.log_rate_limit_exceeded to the validTypes array so EventBus does not reject it with event_emit_invalid_type Fix crash exit logged to user-facing log buffer: - Add exit handler that writes an error log entry on non-zero exit and immediately flushes the buffer so crash messages reach the backend without waiting for the 3-second batch interval Expand stderr capture on build failures: - Increase substring limit from 200 to 1000 characters in all npm install, npm run build, uv, and pip error messages to expose full failure context Fix Python deployment: venv shebang absolute paths inside nsjail: - resolvePythonEntryPoint now captures the module entry point string from [project.scripts] (e.g. "plane_mcp.__main__:main") and returns it as moduleEntryPoint - github-deployment.ts uses python3 -m <module> when moduleEntryPoint is available, avoiding the venv shebang script that contains an absolute host path that does not exist inside nsjail's isolated filesystem Fix pyproject.toml dependency parsing with extras syntax: - Replace single regex /\[([\s\S]*?)\]/ with a bracket-aware loop so dependencies containing extras (e.g. "pkg[redis,hiredis]") are parsed correctly instead of truncating at the first ] inside the extras bracket Fix isPyprojectSimpleScript for module-based packages: - Check [project.scripts] entry point module roots (e.g. "plane_mcp" from "plane_mcp.__main__:main") against the filesystem so packages that use a module directory instead of a src/ layout are correctly identified as installable rather than simple scripts
1 parent 6f70692 commit 14d7efc

File tree

8 files changed

+306
-91
lines changed

8 files changed

+306
-91
lines changed

services/satellite/.env.example

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,14 +105,30 @@ EVENT_FLUSH_TIMEOUT_MS=5000
105105
# Only active when NODE_ENV=production and platform is Linux
106106
#
107107
# Defaults based on empirical testing with npx and Node.js V8:
108-
# - 2048MB memory: Absolute minimum for V8 initialization (cannot be reduced)
108+
# - "inf" virtual memory (rlimit_as): Node.js v24 WASM (undici HTTP parser) needs ~10GB virtual space
109+
# - 512MB physical memory (cgroup): Precise control over actual RAM usage per process
109110
# - 1000 processes: npm spawns many child processes during package installation
110111
# - 1024 file descriptors: Adequate for file I/O operations
111112
# - 50MB file size: Prevents oversized downloads while accommodating 99% of npm packages
112113
# - 100MB tmpfs: Sufficient for npm cache and temporary operations
113-
114-
# Memory limit per process in MB (default: 2048, V8 minimum requirement)
115-
NSJAIL_MEMORY_LIMIT_MB=2048
114+
#
115+
# Cgroup v2 limits (auto-detected at startup):
116+
# When running as a systemd service with Delegate=yes, the satellite auto-detects
117+
# the delegated cgroup path and enables physical memory + PID limits via cgroup v2.
118+
# If cgroups are unavailable, falls back to rlimit-only mode gracefully.
119+
120+
# Virtual address space limit per process (default: "inf")
121+
# Accepts MB number (e.g. "4096") or nsjail keywords: "inf", "soft", "hard"
122+
# "inf" is required for Node.js v24 — WASM (undici) reserves ~10GB virtual address space
123+
NSJAIL_MEMORY_LIMIT_MB=inf
124+
125+
# Cgroup physical memory limit in bytes (default: 536870912 = 512MB)
126+
# Only applied when cgroup v2 is available with a delegated subtree
127+
NSJAIL_CGROUP_MEM_MAX_BYTES=536870912
128+
129+
# Cgroup max PIDs per process (default: 1000)
130+
# Only applied when cgroup v2 is available with a delegated subtree
131+
NSJAIL_CGROUP_PIDS_MAX=1000
116132

117133
# CPU time limit per process in seconds (default: 60)
118134
NSJAIL_CPU_TIME_LIMIT_SECONDS=60

services/satellite/src/config/nsjail.ts

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,19 @@ function parseSizeToBytes(sizeStr: string): number {
2424
* These limits apply only in production on Linux platforms when nsjail isolation is enabled
2525
*
2626
* Defaults based on empirical testing with package managers (npm, uvx) and runtime requirements:
27-
* - 2048MB virtual memory (RLIMIT_AS): Fallback for systems without cgroup support
27+
* - "inf" virtual memory (RLIMIT_AS): Node.js v24 WASM (undici HTTP parser) needs ~10GB virtual space
2828
* - 512MB physical memory (cgroup_mem_max): Precise control over actual memory usage
2929
* - 1000 processes: Adequate for package managers which spawn many child processes
3030
* - 1024 file descriptors: Adequate for file I/O operations
3131
* - 50MB file size: Prevents oversized downloads while accommodating most packages
3232
* - 100MB tmpfs: Sufficient for package manager cache operations
3333
*/
3434
export const nsjailConfig = {
35-
/** Memory limit per MCP server process in MB (default: 2048, sufficient for most runtimes) */
36-
memoryLimitMB: parseInt(process.env.NSJAIL_MEMORY_LIMIT_MB || '2048', 10),
35+
/** Virtual address space limit per MCP process.
36+
* Accepts MB number (e.g. "4096") or nsjail keywords: "inf", "soft", "hard".
37+
* Default "inf" — Node.js v24 fetch (undici) uses WASM which reserves ~10GB
38+
* virtual address space. This is virtual memory only, not physical RAM. */
39+
memoryLimitMB: process.env.NSJAIL_MEMORY_LIMIT_MB || 'inf',
3740

3841
/** Cgroup physical memory limit in bytes (default: 512MB = 536870912 bytes) */
3942
cgroupMemMaxBytes: parseInt(process.env.NSJAIL_CGROUP_MEM_MAX_BYTES || '536870912', 10),

services/satellite/src/events/registry.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ export function isValidEventType(type: string): type is EventType {
281281
'mcp.server.respawned',
282282
'mcp.server.status_changed',
283283
'mcp.server.logs',
284+
'mcp.server.log_rate_limit_exceeded',
284285
'mcp.request.logs',
285286
'mcp.tools.discovered',
286287
'mcp.tools.updated',
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import { readFileSync, writeFileSync, mkdirSync, existsSync, accessSync, constants } from 'fs';
2+
3+
export interface CgroupInfo {
4+
available: boolean;
5+
version: 'v2' | 'v1' | 'none';
6+
mountPath: string | null; // e.g. "/sys/fs/cgroup/system.slice/deploystack-satellite.service"
7+
reason?: string; // why unavailable
8+
}
9+
10+
/**
11+
* Detect cgroup v2 availability and find the delegated cgroup path.
12+
*
13+
* When running as a systemd service with Delegate=yes, the process owns a
14+
* subtree under /sys/fs/cgroup (e.g. /sys/fs/cgroup/system.slice/deploystack-satellite.service).
15+
* nsjail must use this delegated path (not the root) to create per-process cgroups.
16+
*
17+
* Cgroup v2 has a "no internal process" constraint: a cgroup cannot both have
18+
* member processes AND enable controllers for children. To satisfy this, we move
19+
* the satellite process into a "satellite" child cgroup, leaving the service
20+
* cgroup root free for nsjail to write cgroup.subtree_control.
21+
*/
22+
export function detectCgroupV2(): CgroupInfo {
23+
// 1. Check cgroup v2 is mounted
24+
if (!existsSync('/sys/fs/cgroup/cgroup.controllers')) {
25+
return { available: false, version: 'none', mountPath: null, reason: 'cgroup v2 not mounted' };
26+
}
27+
28+
// 2. Read our cgroup path from /proc/self/cgroup
29+
// Format: "0::/system.slice/deploystack-satellite.service"
30+
try {
31+
const content = readFileSync('/proc/self/cgroup', 'utf-8').trim();
32+
const match = content.match(/0::(.+)/);
33+
if (!match || match[1] === '/') {
34+
return { available: false, version: 'v2', mountPath: null, reason: 'process is in root cgroup (no delegation)' };
35+
}
36+
37+
const serviceCgroupRelative = match[1];
38+
const serviceCgroupPath = `/sys/fs/cgroup${serviceCgroupRelative}`;
39+
40+
// 3. Verify the delegated path exists
41+
if (!existsSync(serviceCgroupPath)) {
42+
return { available: false, version: 'v2', mountPath: null, reason: `delegated path does not exist: ${serviceCgroupPath}` };
43+
}
44+
45+
// 4. Verify write access to the delegated path
46+
try {
47+
accessSync(serviceCgroupPath, constants.W_OK);
48+
} catch {
49+
return { available: false, version: 'v2', mountPath: null, reason: `no write access to ${serviceCgroupPath}` };
50+
}
51+
52+
// 5. Move ourselves into a child cgroup to satisfy the "no internal process" constraint.
53+
// cgroup v2 requires that a cgroup with controllers enabled for children has no
54+
// direct member processes. We create "satellite" child and move our PID there,
55+
// freeing the service root for nsjail's subtree_control writes.
56+
const satelliteChildPath = `${serviceCgroupPath}/satellite`;
57+
try {
58+
if (!existsSync(satelliteChildPath)) {
59+
mkdirSync(satelliteChildPath);
60+
}
61+
// Move our process into the child cgroup by writing our PID to cgroup.procs
62+
writeFileSync(`${satelliteChildPath}/cgroup.procs`, String(process.pid));
63+
} catch (err) {
64+
const msg = err instanceof Error ? err.message : String(err);
65+
return { available: false, version: 'v2', mountPath: null, reason: `failed to move process to child cgroup: ${msg}` };
66+
}
67+
68+
// 6. Enable memory and pids controllers on the service cgroup root
69+
// This writes "+memory +pids" to cgroup.subtree_control so nsjail's
70+
// per-process child cgroups can enforce limits.
71+
try {
72+
writeFileSync(`${serviceCgroupPath}/cgroup.subtree_control`, '+memory +pids');
73+
} catch (err) {
74+
const msg = err instanceof Error ? err.message : String(err);
75+
return { available: false, version: 'v2', mountPath: null, reason: `failed to enable controllers on subtree_control: ${msg}` };
76+
}
77+
78+
return { available: true, version: 'v2', mountPath: serviceCgroupPath };
79+
} catch {
80+
return { available: false, version: 'v2', mountPath: null, reason: 'failed to read /proc/self/cgroup' };
81+
}
82+
}

services/satellite/src/process/github-deployment.ts

Lines changed: 44 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ export class GitHubDeploymentHandler {
118118
emitBuildResult(this.logBuffer, metadata, 'npm install', result);
119119

120120
if (result.code !== 0) {
121-
throw new Error(`npm install failed with code ${result.code}: ${result.stderr.substring(0, 200)}`);
121+
throw new Error(`npm install failed with code ${result.code}: ${result.stderr.substring(0, 1000)}`);
122122
}
123123

124124
this.logger.info({
@@ -182,7 +182,7 @@ export class GitHubDeploymentHandler {
182182
stderr: stderr.substring(0, 500) // Limit stderr output
183183
}, `npm install failed with code ${code}`);
184184

185-
reject(new Error(`npm install failed with code ${code}: ${stderr.substring(0, 200)}`));
185+
reject(new Error(`npm install failed with code ${code}: ${stderr.substring(0, 1000)}`));
186186
}
187187
});
188188

@@ -252,7 +252,7 @@ export class GitHubDeploymentHandler {
252252
emitBuildResult(this.logBuffer, metadata, 'npm build', result);
253253

254254
if (result.code !== 0) {
255-
throw new Error(`npm run build failed with code ${result.code}: ${result.stderr.substring(0, 200)}`);
255+
throw new Error(`npm run build failed with code ${result.code}: ${result.stderr.substring(0, 1000)}`);
256256
}
257257

258258
this.logger.info({
@@ -316,7 +316,7 @@ export class GitHubDeploymentHandler {
316316
stderr: stderr.substring(0, 500)
317317
}, `npm run build failed with code ${code}`);
318318

319-
reject(new Error(`npm run build failed with code ${code}: ${stderr.substring(0, 200)}`));
319+
reject(new Error(`npm run build failed with code ${code}: ${stderr.substring(0, 1000)}`));
320320
}
321321
});
322322

@@ -522,7 +522,7 @@ export class GitHubDeploymentHandler {
522522
message: `[${command}] ${result.stderr.substring(0, 500)}`,
523523
timestamp: new Date().toISOString()
524524
});
525-
throw new Error(`${command} ${args.join(' ')} failed with code ${result.code}: ${result.stderr.substring(0, 200)}`);
525+
throw new Error(`${command} ${args.join(' ')} failed with code ${result.code}: ${result.stderr.substring(0, 1000)}`);
526526
}
527527

528528
// Step 2: For simple scripts, install dependencies into the venv
@@ -582,7 +582,7 @@ export class GitHubDeploymentHandler {
582582
emitBuildResult(this.logBuffer, metadata, 'uv pip', depsResult);
583583

584584
if (depsResult.code !== 0) {
585-
throw new Error(`uv pip install failed with code ${depsResult.code}: ${depsResult.stderr.substring(0, 200)}`);
585+
throw new Error(`uv pip install failed with code ${depsResult.code}: ${depsResult.stderr.substring(0, 1000)}`);
586586
}
587587
}
588588

@@ -649,7 +649,7 @@ export class GitHubDeploymentHandler {
649649
exit_code: code,
650650
stderr: stderr.substring(0, 500)
651651
}, `${cmd} failed with code ${code}`);
652-
reject(new Error(`${cmd} failed with code ${code}: ${stderr.substring(0, 200)}`));
652+
reject(new Error(`${cmd} failed with code ${code}: ${stderr.substring(0, 1000)}`));
653653
}
654654
});
655655

@@ -674,29 +674,14 @@ export class GitHubDeploymentHandler {
674674
let pipArgs: string[];
675675

676676
if (hasPyproject) {
677-
// Parse dependencies from pyproject.toml and install them directly
677+
// Parse dependencies from pyproject.toml using extracted helper
678678
const pyprojectPath = path.join(tempDir, 'pyproject.toml');
679-
const pyprojectContent = await fs.promises.readFile(pyprojectPath, 'utf8');
680-
681-
// Extract dependencies array from [project] section
682-
const depsMatch = pyprojectContent.match(/dependencies\s*=\s*\[([\s\S]*?)\]/);
683-
if (depsMatch) {
684-
const depsContent = depsMatch[1];
685-
// Parse individual dependencies (handle both "pkg" and 'pkg' quotes)
686-
const deps = depsContent
687-
.split(',')
688-
.map(d => d.trim())
689-
.filter(d => d.length > 0)
690-
.map(d => d.replace(/^["']|["']$/g, '')); // Remove quotes
691-
692-
if (deps.length > 0) {
693-
pipArgs = ['pip', 'install', ...deps];
694-
} else {
695-
// No dependencies, skip
696-
pipArgs = [];
697-
}
679+
const deps = await parsePyprojectDependencies(pyprojectPath);
680+
681+
if (deps.length > 0) {
682+
pipArgs = ['pip', 'install', ...deps];
698683
} else {
699-
// No dependencies section
684+
// No dependencies, skip
700685
pipArgs = [];
701686
}
702687
} else {
@@ -734,7 +719,7 @@ export class GitHubDeploymentHandler {
734719
* Resolve Python package entry point from pyproject.toml or __main__.py
735720
* Delegates to resolvePythonEntryPoint() from python-helpers.ts
736721
*/
737-
async resolvePythonPackageEntry(tempDir: string): Promise<{ command: string; entryPoint: string }> {
722+
async resolvePythonPackageEntry(tempDir: string): Promise<{ command: string; entryPoint: string; moduleEntryPoint?: string }> {
738723
this.logger.debug({
739724
operation: 'python_entry_resolve_start',
740725
temp_dir: tempDir
@@ -932,7 +917,7 @@ export class GitHubDeploymentHandler {
932917

933918
// Fetch GitHub App installation token (may be null for public repos)
934919
const tokenResult = await this.backendClient.fetchGitHubToken(config.installation_id);
935-
const githubToken = tokenResult?.token || undefined;
920+
const githubToken = tokenResult?.token || '';
936921

937922
if (githubToken) {
938923
this.logger.debug({
@@ -1038,15 +1023,12 @@ export class GitHubDeploymentHandler {
10381023
await this.installPythonDependencies(deploymentDir, config.installation_id, config.team_id, config.user_id);
10391024

10401025
// Resolve Python package entry point
1041-
const { command, entryPoint } = await this.resolvePythonPackageEntry(deploymentDir);
1026+
const { command, entryPoint, moduleEntryPoint } = await this.resolvePythonPackageEntry(deploymentDir);
10421027

10431028
// Determine the correct command and args based on entry point type
10441029
let updatedConfig: MCPServerConfig;
10451030

10461031
if (command === 'python3') {
1047-
// Running with system python3 interpreter - make script path relative
1048-
const relativeEntryPoint = path.relative(deploymentDir, entryPoint);
1049-
10501032
// Activate venv by setting PYTHONPATH environment variable
10511033
// This allows system python3 to find packages in .venv/lib/python3.x/site-packages
10521034
//
@@ -1071,10 +1053,37 @@ export class GitHubDeploymentHandler {
10711053
is_production: isProduction
10721054
}, `Activated Python venv via PYTHONPATH: ${sitePackagesPath}`);
10731055

1056+
// Determine args: prefer module entry point over script path.
1057+
// Module entry point (e.g., "plane_mcp.__main__:main") is used via `python3 -m`
1058+
// to avoid running the venv shebang script, which contains an absolute host path
1059+
// that doesn't exist inside nsjail's isolated filesystem.
1060+
let pythonArgs: string[];
1061+
if (moduleEntryPoint) {
1062+
// Extract module path from "module.path:func" → "module.path"
1063+
// Then invoke as: python3 -m module.path
1064+
// (The module must support __main__ execution, which all [project.scripts] entries do)
1065+
const colonIdx = moduleEntryPoint.lastIndexOf(':');
1066+
const modulePath = colonIdx !== -1
1067+
? moduleEntryPoint.substring(0, colonIdx)
1068+
: moduleEntryPoint;
1069+
pythonArgs = ['-m', modulePath];
1070+
1071+
this.logger.info({
1072+
operation: 'python_module_entry',
1073+
module_entry_point: moduleEntryPoint,
1074+
module_path: modulePath,
1075+
python_args: pythonArgs
1076+
}, `Using module entry point: python3 -m ${modulePath}`);
1077+
} else {
1078+
// Fall back to running script file directly (relative path, nsjail prepends /app)
1079+
const relativeEntryPoint = path.relative(deploymentDir, entryPoint);
1080+
pythonArgs = [relativeEntryPoint];
1081+
}
1082+
10741083
updatedConfig = {
10751084
...config,
10761085
command: 'python3',
1077-
args: [relativeEntryPoint],
1086+
args: pythonArgs,
10781087
temp_dir: deploymentDir,
10791088
env: activatedEnv
10801089
};

services/satellite/src/process/manager.ts

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -374,10 +374,16 @@ export class ProcessManager extends EventEmitter {
374374
const trimmedLine = line.trim();
375375
if (!trimmedLine) continue;
376376

377-
// Check rate limit BEFORE processing
377+
// Pre-filter nsjail INFO lines before rate limiting (infrastructure noise that
378+
// would be discarded anyway — don't waste rate limit slots on them)
379+
const nsjailPreCheck = parseNsjailLog(trimmedLine);
380+
if (nsjailPreCheck && nsjailPreCheck.level === 'I') {
381+
continue;
382+
}
383+
384+
// Check rate limit (only for lines we'll actually buffer)
378385
const rateLimitResult = rateLimiter.shouldAcceptLog(trimmedLine, currentTime);
379386
if (!rateLimitResult.accept) {
380-
// Log dropped - skip this line
381387
continue;
382388
}
383389

@@ -386,16 +392,11 @@ export class ProcessManager extends EventEmitter {
386392
? rateLimitResult.truncatedMessage!
387393
: trimmedLine;
388394

389-
// Check if this is an nsjail log
395+
// Re-parse nsjail for WARNING/ERROR/FATAL (truncation may have changed finalMessage)
390396
const nsjailLog = parseNsjailLog(finalMessage);
391397

392398
if (nsjailLog) {
393-
// nsjail log detected - filter out INFO level (infrastructure noise)
394-
if (nsjailLog.level === 'I') {
395-
// Skip nsjail INFO logs (Mount, Uid map, Jail parameters, etc.)
396-
continue;
397-
}
398-
// Keep nsjail WARNING/ERROR/FATAL logs with correct level mapping
399+
// nsjail WARNING/ERROR/FATAL - map to correct level
399400
const level: 'warn' | 'error' = nsjailLog.level === 'W' ? 'warn' : 'error';
400401
this.logBuffer.add({
401402
installation_id: config.installation_id,
@@ -436,6 +437,25 @@ export class ProcessManager extends EventEmitter {
436437
signal: signal
437438
}, `MCP server exited: ${config.installation_name} (code: ${code}, signal: ${signal})`);
438439

440+
// Send exit notification to user-facing logs on non-zero exit (crash)
441+
if (code !== 0 && code !== null) {
442+
const exitMessage = signal
443+
? `Server process terminated by signal ${signal} (exit code: ${code})`
444+
: `Server process exited with error (exit code: ${code})`;
445+
446+
this.logBuffer.add({
447+
installation_id: config.installation_id,
448+
team_id: config.team_id,
449+
user_id: config.user_id,
450+
level: 'error',
451+
message: exitMessage,
452+
timestamp: new Date().toISOString()
453+
});
454+
455+
// Flush immediately so crash logs reach the backend without waiting for 3s interval
456+
this.logBuffer.flush();
457+
}
458+
439459
this.emit('processExit', processInfo, code, signal);
440460
});
441461

0 commit comments

Comments
 (0)