Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions images/chromium-headful/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ RUN --mount=type=cache,target=/root/.cache/go-build,id=$CACHEIDPREFIX-go-build \
GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH:-amd64} \
go build -ldflags="-s -w" -o /out/wrapper ./cmd/wrapper

# Build supervisord eventlistener shim
# Compiled from ./cmd/supervisord-shim into /out so the final image stage can
# copy it to /usr/local/bin/kernel-images-supervisord-shim, the path the
# supervisord eventlistener config invokes. -ldflags="-s -w" omits the symbol
# table and DWARF debug info to keep the binary small.
RUN --mount=type=cache,target=/root/.cache/go-build,id=$CACHEIDPREFIX-go-build \
--mount=type=cache,target=/go/pkg/mod,id=$CACHEIDPREFIX-go-pkg-mod \
GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH:-amd64} \
go build -ldflags="-s -w" -o /out/kernel-images-supervisord-shim ./cmd/supervisord-shim

# webrtc client
FROM node:22-bullseye-slim AS client
WORKDIR /src
Expand Down Expand Up @@ -378,6 +384,7 @@ RUN chmod +x /usr/local/bin/init-envoy.sh
# copy the kernel-images API binary built in the builder stage
COPY --from=server-builder /out/kernel-images-api /usr/local/bin/kernel-images-api
COPY --from=server-builder /out/chromium-launcher /usr/local/bin/chromium-launcher
COPY --from=server-builder /out/kernel-images-supervisord-shim /usr/local/bin/kernel-images-supervisord-shim
COPY --from=server-builder /out/wrapper /wrapper

# Copy and compile the Playwright daemon
Expand Down
11 changes: 11 additions & 0 deletions images/chromium-headful/supervisor/services/supervisord-shim.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
; Event listener that forwards crash notifications for supervised services
; to the local kernel-images-api telemetry endpoint.
[eventlistener:supervisord-shim]
command=/usr/local/bin/kernel-images-supervisord-shim
; EXITED is delivered for every exit; the shim itself drops the ones marked
; expected=1 and only reports expected=0. FATAL events are always reported.
events=PROCESS_STATE_EXITED,PROCESS_STATE_FATAL
; buffer_size defaults to 10 which overflows when several supervised
; services flap in quick succession. Bump it so a burst of crashes
; doesn't cause supervisord to drop events before the shim drains them.
buffer_size=100
autostart=true
; The shim exits when its stdin reaches EOF or on a protocol I/O error;
; restart it so crash reporting resumes.
autorestart=true
; The shim writes all of its logging to stderr.
stderr_logfile=/var/log/supervisord/supervisord-shim
; stdout is the eventlistener protocol channel; do not redirect.
7 changes: 7 additions & 0 deletions images/chromium-headless/image/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ RUN --mount=type=cache,target=/root/.cache/go-build,id=$CACHEIDPREFIX-go-build \
GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH:-amd64} \
go build -ldflags="-s -w" -o /out/wrapper ./cmd/wrapper

# Build supervisord eventlistener shim
# Compiled from ./cmd/supervisord-shim into /out so the final image stage can
# copy it to /usr/local/bin/kernel-images-supervisord-shim, the path the
# supervisord eventlistener config invokes. -ldflags="-s -w" omits the symbol
# table and DWARF debug info to keep the binary small.
RUN --mount=type=cache,target=/root/.cache/go-build,id=$CACHEIDPREFIX-go-build \
--mount=type=cache,target=/go/pkg/mod,id=$CACHEIDPREFIX-go-pkg-mod \
GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH:-amd64} \
go build -ldflags="-s -w" -o /out/kernel-images-supervisord-shim ./cmd/supervisord-shim

FROM docker.io/ubuntu:22.04 AS ffmpeg-downloader

# Allow cross-compilation when building with BuildKit platforms
Expand Down Expand Up @@ -256,6 +262,7 @@ RUN chmod +x /usr/local/bin/bake-certs.sh && /usr/local/bin/bake-certs.sh && rm
# Copy the kernel-images API binary built in the builder stage
COPY --from=server-builder /out/kernel-images-api /usr/local/bin/kernel-images-api
COPY --from=server-builder /out/chromium-launcher /usr/local/bin/chromium-launcher
COPY --from=server-builder /out/kernel-images-supervisord-shim /usr/local/bin/kernel-images-supervisord-shim

# Copy and compile the Playwright daemon
COPY server/runtime/playwright-daemon.ts /tmp/playwright-daemon.ts
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
; Event listener that forwards crash notifications for supervised services
; to the local kernel-images-api telemetry endpoint.
[eventlistener:supervisord-shim]
command=/usr/local/bin/kernel-images-supervisord-shim
; EXITED is delivered for every exit; the shim itself drops the ones marked
; expected=1 and only reports expected=0. FATAL events are always reported.
events=PROCESS_STATE_EXITED,PROCESS_STATE_FATAL
; buffer_size defaults to 10 which overflows when several supervised
; services flap in quick succession. Bump it so a burst of crashes
; doesn't cause supervisord to drop events before the shim drains them.
buffer_size=100
autostart=true
; The shim exits when its stdin reaches EOF or on a protocol I/O error;
; restart it so crash reporting resumes.
autorestart=true
; The shim writes all of its logging to stderr.
stderr_logfile=/var/log/supervisord/supervisord-shim
; stdout is the eventlistener protocol channel; do not redirect.
6 changes: 6 additions & 0 deletions server/cmd/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
oapi "github.com/kernel/kernel-images/server/lib/oapi"
"github.com/kernel/kernel-images/server/lib/recorder"
"github.com/kernel/kernel-images/server/lib/scaletozero"
"github.com/kernel/kernel-images/server/lib/sysmon"
"github.com/kernel/kernel-images/server/lib/telemetry"
)

Expand Down Expand Up @@ -103,6 +104,11 @@ func main() {
}
telemetrySession := telemetry.NewTelemetrySession(eventStream)

// VM-internal failure telemetry (OOM kills via /dev/kmsg).
// service_crashed events arrive via POST /telemetry/events from the
// supervisord-shim child process, not through this monitor.
sysmon.New(eventStream, slogger).Start(ctx)

// Optional S2 storage sink.
var s2Writer *events.S2StorageWriter
if config.S2Basin != "" && config.S2AccessToken != "" && config.S2Stream != "" {
Expand Down
228 changes: 228 additions & 0 deletions server/cmd/supervisord-shim/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
// Command supervisord-shim is a tiny supervisord eventlistener that
// translates PROCESS_STATE_EXITED (expected=0) and PROCESS_STATE_FATAL events
// into BrowserServiceCrashedEvent payloads and POSTs them to the local
// kernel-images-api telemetry endpoint.
//
// All schema-mapping and event publishing logic lives here; lib/sysmon does
// not handle supervisord events. Keeping the shim as the sole owner of the
// supervisord protocol means lib/sysmon stays single-purpose (kmsg only).
//
// Wire protocol per supervisord docs:
//
// stdout: "READY\n"
// stdin: header line ("ver:3.0 ... eventname:PROCESS_STATE_EXITED len:54\n")
// stdin: payload of `len` bytes (no trailing newline)
//	stdout: "RESULT 2\nOK" (always; ACK regardless of downstream success; no newline after the 2-byte "OK" body)
//
// We always ACK with OK so supervisord doesn't quarantine us when the
// downstream HTTP target is briefly unavailable. The events are
// best-effort; if the API is down, we drop and log.
//
// All logging goes to stderr — stdout is the supervisord protocol channel.
package main

import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"os"
"strconv"
"strings"
"time"
)

// Defaults used when talking to the local telemetry endpoint.
const (
// defaultTelemetryURL is the POST target used when the
// KERNEL_IMAGES_TELEMETRY_URL environment variable is unset or empty.
defaultTelemetryURL = "http://127.0.0.1:10001/telemetry/events"
// httpTimeout is the http.Client timeout applied to every telemetry POST.
httpTimeout = 2 * time.Second
)

func main() {
log.SetOutput(os.Stderr)
log.SetFlags(log.LstdFlags | log.Lmicroseconds)

telemetryURL := os.Getenv("KERNEL_IMAGES_TELEMETRY_URL")
if telemetryURL == "" {
telemetryURL = defaultTelemetryURL
}

pub := &publisher{
url: telemetryURL,
client: &http.Client{Timeout: httpTimeout},
}

in := bufio.NewReader(os.Stdin)
out := bufio.NewWriter(os.Stdout)

for {
if _, err := out.WriteString("READY\n"); err != nil {
log.Fatalf("write READY: %v", err)
}
if err := out.Flush(); err != nil {
log.Fatalf("flush READY: %v", err)
}

header, payload, err := readEvent(in)
if err != nil {
if err == io.EOF {
return
}
log.Fatalf("read event: %v", err)
}

// Try to publish but always ACK supervisord.
if ev, ok := mapEvent(header, payload); ok {
if perr := pub.publish(context.Background(), ev); perr != nil {
log.Printf("publish telemetry event: %v", perr)
}
}

if _, err := out.WriteString("RESULT 2\nOK\n"); err != nil {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trailing newline in RESULT breaks supervisord protocol

High Severity

The shim writes "RESULT 2\nOK\n" but the supervisord eventlistener protocol requires "RESULT 2\nOK" — no trailing newline after the data. The official childutils.py send() produces RESULT 2\nOK exactly. The extra \n is left in supervisord's read buffer after it consumes the declared 2 bytes of result data. When the shim next sends READY\n, supervisord's buffer contains \nREADY\n; the first 6 bytes (\nREADY) don't match the expected READY\n token, so supervisord transitions the listener to UNKNOWN state and clears its buffer — discarding the READY\n. Both sides then block waiting for input from each other, permanently deadlocking the listener after the first event.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 4ecdc0d. Configure here.

log.Fatalf("write RESULT: %v", err)
}
if err := out.Flush(); err != nil {
log.Fatalf("flush RESULT: %v", err)
}
}
}

// readEvent reads one supervisord event from in: a newline-terminated header
// line followed by a payload whose byte length is given by the header's "len"
// field.
//
// It returns the parsed header fields and the parsed payload fields. An error
// from reading the header line is returned unwrapped, so callers can compare
// it against io.EOF. A header without "len", a non-numeric or negative "len",
// and a short payload read are reported as descriptive errors.
func readEvent(in *bufio.Reader) (map[string]string, map[string]string, error) {
	headerLine, err := in.ReadString('\n')
	if err != nil {
		return nil, nil, err
	}
	header := parseFields(strings.TrimRight(headerLine, "\n"))

	lenStr, ok := header["len"]
	if !ok {
		return nil, nil, fmt.Errorf("missing len in header: %q", headerLine)
	}
	n, err := strconv.Atoi(lenStr)
	if err != nil {
		return nil, nil, fmt.Errorf("invalid len %q: %w", lenStr, err)
	}
	// Atoi accepts a leading minus sign; a negative size would make the
	// allocation below panic instead of returning an error.
	if n < 0 {
		return nil, nil, fmt.Errorf("invalid len %q: must not be negative", lenStr)
	}

	buf := make([]byte, n)
	if _, err := io.ReadFull(in, buf); err != nil {
		return nil, nil, fmt.Errorf("read payload: %w", err)
	}
	return header, parseFields(string(buf)), nil
}

// parseFields turns supervisord's whitespace-separated "key:value" tokens
// into a map. Each token is divided at its first colon, so any later colons
// stay in the value; tokens with no colon at all are ignored. When a key
// occurs more than once the last occurrence wins. The returned map is never
// nil.
func parseFields(s string) map[string]string {
	fields := make(map[string]string)
	for _, token := range strings.Fields(s) {
		key, value, found := strings.Cut(token, ":")
		if found {
			fields[key] = value
		}
	}
	return fields
}

// telemetryEventBody mirrors oapi.TelemetryEvent but is duplicated here so the
// shim does not pull in the entire server module — keeps the binary tiny.
//
// It is the JSON document that publish POSTs to the telemetry endpoint.
// NOTE(review): this copy is maintained by hand; confirm it still matches the
// generated oapi type whenever the telemetry schema changes.
type telemetryEventBody struct {
// Type is the event type; mapEvent sets it to "service_crashed".
Type string `json:"type"`
// Category is the event category; mapEvent sets it to "system".
Category string `json:"category"`
// Source describes which local mechanism produced the event.
Source telemetryEventSource `json:"source"`
// Data carries the crash details for the affected service.
Data serviceCrashedPayload `json:"data"`
}

// telemetryEventSource identifies the origin of a telemetry event.
type telemetryEventSource struct {
// Kind is the producer kind; mapEvent sets it to "local_process".
Kind string `json:"kind"`
// Event names the originating supervisord event, e.g.
// "supervisord.process_exited" or "supervisord.process_fatal".
Event string `json:"event"`
}

// serviceCrashedPayload is the data section of a service_crashed event.
type serviceCrashedPayload struct {
// ServiceName is the supervisord "processname" field of the event.
ServiceName string `json:"service_name"`
// FromState is the supervisord "from_state" field of the event.
FromState string `json:"from_state"`
// Pid is the process id from the event payload; it is nil (and omitted
// from the JSON) when the payload has no parseable "pid" field.
Pid *int `json:"pid,omitempty"`
}
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shim uses ad-hoc structs instead of oapi types

Low Severity

The shim defines ad-hoc struct types (telemetryEventBody, telemetryEventSource, serviceCrashedPayload) instead of using generated oapi types like oapi.BrowserServiceCrashedEventData. The rule requires all event producers to use generated oapi types when building payloads to prevent drift between the documented API contract and actual event shapes. If the OpenAPI schema for service_crashed events changes, these duplicated types won't be updated automatically.

Fix in Cursor Fix in Web

Triggered by learned rule: Producer repo must own typed event schemas in OpenAPI and use generated oapi types

Reviewed by Cursor Bugbot for commit 4ecdc0d. Configure here.


// mapEvent converts a parsed supervisord event into the telemetry body to
// publish. The boolean result is false, and the body is the zero value, when
// the event should not be reported:
//
//   - the event is neither PROCESS_STATE_EXITED nor PROCESS_STATE_FATAL;
//   - it is PROCESS_STATE_EXITED with anything other than expected=0, i.e. a
//     clean shutdown or an exit code listed in `exitcodes`;
//   - the payload has no processname or no from_state.
//
// PROCESS_STATE_FATAL is always treated as a crash. The pid is attached only
// when the payload carries one that parses as an integer.
func mapEvent(header, payload map[string]string) (telemetryEventBody, bool) {
	var none telemetryEventBody

	eventName := header["eventname"]
	exited := eventName == "PROCESS_STATE_EXITED"
	fatal := eventName == "PROCESS_STATE_FATAL"
	if !exited && !fatal {
		return none, false
	}
	// Only exits flagged expected=0 count as crashes.
	if exited && payload["expected"] != "0" {
		return none, false
	}

	service := payload["processname"]
	if service == "" {
		return none, false
	}
	previous := payload["from_state"]
	if previous == "" {
		return none, false
	}

	// "PROCESS_STATE_EXITED" -> "exited", "PROCESS_STATE_FATAL" -> "fatal".
	suffix := strings.ToLower(strings.TrimPrefix(eventName, "PROCESS_STATE_"))

	var ev telemetryEventBody
	ev.Type = "service_crashed"
	ev.Category = "system"
	ev.Source.Kind = "local_process"
	ev.Source.Event = "supervisord.process_" + suffix
	ev.Data.ServiceName = service
	ev.Data.FromState = previous

	if rawPid := payload["pid"]; rawPid != "" {
		if pid, err := strconv.Atoi(rawPid); err == nil {
			ev.Data.Pid = &pid
		}
	}
	return ev, true
}

// publisher POSTs telemetry events to a single HTTP endpoint.
type publisher struct {
// url is the endpoint every event is POSTed to.
url string
// client performs the requests; main configures it with httpTimeout.
client *http.Client
}

// publish JSON-encodes body and POSTs it to p.url with a Content-Type of
// application/json.
//
// It returns nil for any response status below 300. For other statuses it
// returns an error containing the status code and the (trimmed, size-capped)
// response body. Marshalling, request construction and transport failures are
// returned as errors as well.
func (p *publisher) publish(ctx context.Context, body telemetryEventBody) error {
	// Upper bound on how much of a response body is read, so a misbehaving
	// endpoint cannot make the shim buffer an arbitrarily large reply.
	const maxResponseBytes = 4 << 10

	buf, err := json.Marshal(body)
	if err != nil {
		return fmt.Errorf("marshal: %w", err)
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.url, bytes.NewReader(buf))
	if err != nil {
		return fmt.Errorf("new request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := p.client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// The read error is deliberately ignored: the status code alone is
		// enough to report the failure, the body is only extra detail.
		b, _ := io.ReadAll(io.LimitReader(resp.Body, maxResponseBytes))
		return fmt.Errorf("status %d: %s", resp.StatusCode, bytes.TrimSpace(b))
	}

	// Drain the body before closing so the transport can reuse the
	// keep-alive connection for the next event. Failure to drain only costs
	// a reconnect, so the error is ignored.
	_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxResponseBytes))
	return nil
}
Loading
Loading