diff --git a/js/tests/supervisord.test.ts b/js/tests/supervisord.test.ts
new file mode 100644
index 00000000..80c26abf
--- /dev/null
+++ b/js/tests/supervisord.test.ts
@@ -0,0 +1,78 @@
+import { expect } from 'vitest'
+import { sandboxTest, wait } from './setup'
+
+/**
+ * Polls the code-interpreter health endpoint from inside the sandbox.
+ *
+ * @param sandbox Sandbox whose `commands.run` is used to invoke curl.
+ * @param maxRetries Maximum number of probes before giving up.
+ * @param intervalMs Delay between probes, in milliseconds.
+ * @returns `true` as soon as a probe prints HTTP status 200, otherwise `false`.
+ */
+async function waitForHealth(sandbox: any, maxRetries = 10, intervalMs = 100) {
+  for (let i = 0; i < maxRetries; i++) {
+    try {
+      const result = await sandbox.commands.run(
+        'curl -s -o /dev/null -w "%{http_code}" http://0.0.0.0:49999/health'
+      )
+      if (result.stdout.trim() === '200') {
+        return true
+      }
+    } catch {
+      // Connection refused or other error, retry
+    }
+    await wait(intervalMs)
+  }
+  return false
+}
+
+sandboxTest('restart after jupyter kill', async ({ sandbox }) => {
+  // Verify health is up initially
+  const initialHealth = await waitForHealth(sandbox)
+  expect(initialHealth).toBe(true)
+
+  // Kill the jupyter process as root.
+  // The pattern accepts both "jupyter server" and "jupyter-server" (the launcher may
+  // re-exec under the hyphenated name), and the [j] bracket stops pgrep from matching
+  // the shell that runs this command, whose own command line contains the pattern text.
+  // The command handle may get killed too (since killing jupyter cascades to code-interpreter),
+  // so we catch the error.
+  try {
+    await sandbox.commands.run("kill -9 $(pgrep -f '[j]upyter[- ]server')", {
+      user: 'root',
+    })
+  } catch {
+    // Expected — the kill cascade may terminate the command handle
+  }
+
+  // Wait for supervisord to restart both services (jupyter startup + code-interpreter startup)
+  const recovered = await waitForHealth(sandbox, 60, 500)
+  expect(recovered).toBe(true)
+
+  // Verify code execution works after recovery
+  const result = await sandbox.runCode('x = 1; x')
+  expect(result.text).toEqual('1')
+})
+
+sandboxTest('restart after code-interpreter kill', async ({ sandbox }) => {
+  // Verify health is up initially
+  const initialHealth = await waitForHealth(sandbox)
+  expect(initialHealth).toBe(true)
+
+  // Kill the code-interpreter process as root
+  try {
+    await sandbox.commands.run('kill -9 $(cat /var/run/code-interpreter.pid)', {
+      user: 'root',
+    })
+  } catch {
+    // Expected — killing code-interpreter may terminate the command handle
+  }
+
+  // Wait for supervisord to restart it and health to come back
+  const recovered = await waitForHealth(sandbox, 60, 500)
+  expect(recovered).toBe(true)
+
+  // Verify code execution works after recovery
+  const result = await sandbox.runCode('x = 1; x')
+  expect(result.text).toEqual('1')
+})
diff --git a/python/tests/async/test_async_supervisord.py b/python/tests/async/test_async_supervisord.py
new file mode 100644
index 00000000..763a7ae9
--- /dev/null
+++ b/python/tests/async/test_async_supervisord.py
@@ -0,0 +1,67 @@
+import asyncio
+
+from e2b_code_interpreter.code_interpreter_async import AsyncSandbox
+
+
+async def wait_for_health(sandbox: AsyncSandbox, max_retries=10, interval_ms=100):
+    """Poll the code-interpreter health endpoint from inside the sandbox.
+
+    Returns True as soon as curl prints HTTP status 200, or False after
+    ``max_retries`` probes spaced ``interval_ms`` milliseconds apart.
+    """
+    for _ in range(max_retries):
+        try:
+            result = await sandbox.commands.run(
+                'curl -s -o /dev/null -w "%{http_code}" http://0.0.0.0:49999/health'
+            )
+            if result.stdout.strip() == "200":
+                return True
+        except Exception:
+            pass  # Probe failed (e.g. connection refused); retry
+        await asyncio.sleep(interval_ms / 1000)
+    return False
+
+
+async def test_restart_after_jupyter_kill(async_sandbox: AsyncSandbox):
+    # Verify health is up initially
+    assert await wait_for_health(async_sandbox)
+
+    # Kill the jupyter process as root.
+    # The pattern accepts both "jupyter server" and "jupyter-server" (the launcher may
+    # re-exec under the hyphenated name), and the [j] bracket stops pgrep from matching
+    # the shell that runs this command, whose own command line contains the pattern text.
+    # The command handle may get killed too (killing jupyter cascades to code-interpreter),
+    # so we catch the error.
+    try:
+        await async_sandbox.commands.run(
+            "kill -9 $(pgrep -f '[j]upyter[- ]server')", user="root"
+        )
+    except Exception:
+        pass  # Expected: the kill cascade may terminate the command handle
+
+    # Wait for supervisord to restart both services
+    assert await wait_for_health(async_sandbox, 60, 500)
+
+    # Verify code execution works after recovery
+    result = await async_sandbox.run_code("x = 1; x")
+    assert result.text == "1"
+
+
+async def test_restart_after_code_interpreter_kill(async_sandbox: AsyncSandbox):
+    # Verify health is up initially
+    assert await wait_for_health(async_sandbox)
+
+    # Kill the code-interpreter process as root
+    try:
+        await async_sandbox.commands.run(
+            "kill -9 $(cat /var/run/code-interpreter.pid)", user="root"
+        )
+    except Exception:
+        pass  # Expected: killing code-interpreter may terminate the command handle
+
+    # Wait for supervisord to restart it and health to come back
+    assert await wait_for_health(async_sandbox, 60, 500)
+
+    # Verify code execution works after recovery
+    result = await async_sandbox.run_code("x = 1; x")
+    assert result.text == "1"
diff --git a/python/tests/sync/test_supervisord.py b/python/tests/sync/test_supervisord.py
new file mode 100644
index 00000000..d7f67bfc
--- /dev/null
+++ b/python/tests/sync/test_supervisord.py
@@ -0,0 +1,65 @@
+import time
+
+from e2b_code_interpreter.code_interpreter_sync import Sandbox
+
+
+def wait_for_health(sandbox: Sandbox, max_retries=10, interval_ms=100):
+    """Poll the code-interpreter health endpoint from inside the sandbox.
+
+    Returns True as soon as curl prints HTTP status 200, or False after
+    ``max_retries`` probes spaced ``interval_ms`` milliseconds apart.
+    """
+    for _ in range(max_retries):
+        try:
+            result = sandbox.commands.run(
+                'curl -s -o /dev/null -w "%{http_code}" http://0.0.0.0:49999/health'
+            )
+            if result.stdout.strip() == "200":
+                return True
+        except Exception:
+            pass  # Probe failed (e.g. connection refused); retry
+        time.sleep(interval_ms / 1000)
+    return False
+
+
+def test_restart_after_jupyter_kill(sandbox: Sandbox):
+    # Verify health is up initially
+    assert wait_for_health(sandbox)
+
+    # Kill the jupyter process as root.
+    # The pattern accepts both "jupyter server" and "jupyter-server" (the launcher may
+    # re-exec under the hyphenated name), and the [j] bracket stops pgrep from matching
+    # the shell that runs this command, whose own command line contains the pattern text.
+    # The command handle may get killed too (killing jupyter cascades to code-interpreter),
+    # so we catch the error.
+    try:
+        sandbox.commands.run("kill -9 $(pgrep -f '[j]upyter[- ]server')", user="root")
+    except Exception:
+        pass  # Expected: the kill cascade may terminate the command handle
+
+    # Wait for supervisord to restart both services
+    assert wait_for_health(sandbox, 60, 500)
+
+    # Verify code execution works after recovery
+    result = sandbox.run_code("x = 1; x")
+    assert result.text == "1"
+
+
+def test_restart_after_code_interpreter_kill(sandbox: Sandbox):
+    # Verify health is up initially
+    assert wait_for_health(sandbox)
+
+    # Kill the code-interpreter process as root
+    try:
+        sandbox.commands.run(
+            "kill -9 $(cat /var/run/code-interpreter.pid)", user="root"
+        )
+    except Exception:
+        pass  # Expected: killing code-interpreter may terminate the command handle
+
+    # Wait for supervisord to restart it and health to come back
+    assert wait_for_health(sandbox, 60, 500)
+
+    # Verify code execution works after recovery
+    result = sandbox.run_code("x = 1; x")
+    assert result.text == "1"
diff --git a/template/start-code-interpreter.sh b/template/start-code-interpreter.sh
new file mode 100755
index 00000000..88ea456b
--- /dev/null
+++ b/template/start-code-interpreter.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+echo "Waiting for Jupyter server to be ready..."
+until curl -s -o /dev/null -w '%{http_code}' http://localhost:8888/api/status | grep -q '200'; do
+  sleep 0.5
+done
+echo "Jupyter server is ready, starting Code Interpreter..."
+
+# Record our PID before exec: uvicorn replaces this shell and keeps the same PID,
+# so start-jupyter.sh can later kill the server through this file.
+echo $$ > /var/run/code-interpreter.pid
+exec /root/.server/.venv/bin/uvicorn main:app --host 0.0.0.0 --port 49999 --workers 1 --no-access-log --no-use-colors --timeout-keep-alive 640
diff --git a/template/start-jupyter.sh b/template/start-jupyter.sh
new file mode 100755
index 00000000..40e62bbf
--- /dev/null
+++ b/template/start-jupyter.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+/usr/local/bin/jupyter server --IdentityProvider.token=""
+jupyter_status=$?
+
+# Jupyter exited — kill code-interpreter so supervisord restarts both
+echo "Jupyter exited, killing code-interpreter..."
+pid_file=/var/run/code-interpreter.pid
+# The PID file may not exist yet if code-interpreter is still waiting for Jupyter.
+if [[ -r "$pid_file" ]]; then
+  kill "$(cat "$pid_file")" 2>/dev/null
+fi
+
+# Report Jupyter's own exit status to supervisord instead of kill's.
+exit "$jupyter_status"
diff --git a/template/start-up.sh b/template/start-up.sh
index d30cfc9e..b0c9c691 100755
--- a/template/start-up.sh
+++ b/template/start-up.sh
@@ -1,22 +1,5 @@
 #!/bin/bash
 
-function start_jupyter_server() {
-  counter=0
-  response=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8888/api/status")
-  while [[ ${response} -ne 200 ]]; do
-    let counter++
-    if ((counter % 20 == 0)); then
-      echo "Waiting for Jupyter Server to start..."
-      sleep 0.1
-    fi
-
-    response=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8888/api/status")
-  done
-
-  cd /root/.server/
-  .venv/bin/uvicorn main:app --host 0.0.0.0 --port 49999 --workers 1 --no-access-log --no-use-colors --timeout-keep-alive 640
-}
-
 echo "Starting Code Interpreter server..."
-start_jupyter_server &
-MATPLOTLIBRC=/root/.config/matplotlib/.matplotlibrc jupyter server --IdentityProvider.token="" >/dev/null 2>&1
+# exec so supervisord replaces this shell and receives signals sent to it directly
+exec supervisord -c /etc/supervisord.conf
diff --git a/template/supervisord.conf b/template/supervisord.conf
new file mode 100644
index 00000000..c7e8d526
--- /dev/null
+++ b/template/supervisord.conf
@@ -0,0 +1,28 @@
+[supervisord]
+nodaemon=true
+logfile=/var/log/supervisord.log
+pidfile=/var/run/supervisord.pid
+
+[program:jupyter]
+command=/root/.jupyter/start-jupyter.sh
+environment=MATPLOTLIBRC="/root/.config/matplotlib/.matplotlibrc"
+stdout_logfile=/dev/null
+stderr_logfile=/dev/fd/1
+stderr_logfile_maxbytes=0
+autorestart=true
+stopasgroup=true
+killasgroup=true
+priority=10
+
+[program:code-interpreter]
+command=/root/.jupyter/start-code-interpreter.sh
+directory=/root/.server
+stdout_logfile=/dev/fd/1
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/fd/1
+stderr_logfile_maxbytes=0
+autorestart=true
+stopasgroup=true
+killasgroup=true
+priority=20
+startsecs=0
diff --git a/template/template.py b/template/template.py
index e10e89a8..bf3c4d71 100644
--- a/template/template.py
+++ b/template/template.py
@@ -38,6 +38,7 @@ def make_template(
                 "sudo",
                 "fonts-noto-cjk",
                 "ca-certificates",
+                "supervisor",
             ]
         )
         .run_cmd("curl -fsSL https://deb.nodesource.com/setup_20.x | bash -")
@@ -111,11 +112,17 @@ def make_template(
     template = (
         template.copy("matplotlibrc", ".config/matplotlib/.matplotlibrc")
         .copy("start-up.sh", ".jupyter/start-up.sh")
-        .run_cmd("chmod +x .jupyter/start-up.sh")
+        .copy("start-code-interpreter.sh", ".jupyter/start-code-interpreter.sh")
+        .copy("start-jupyter.sh", ".jupyter/start-jupyter.sh")
+        .run_cmd(
+            "chmod +x .jupyter/start-code-interpreter.sh .jupyter/start-up.sh .jupyter/start-jupyter.sh"
+        )
         .copy("jupyter_server_config.py", ".jupyter/")
         .make_dir(".ipython/profile_default/startup")
         .copy("ipython_kernel_config.py", ".ipython/profile_default/")
         .copy("startup_scripts", ".ipython/profile_default/startup")
+        # Install supervisord config
+        .copy("supervisord.conf", "/etc/supervisord.conf")
     )
 
     if is_docker: