Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions cmd/obol/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,33 @@ Hermes/OpenClaw onboard flow used by the master agent.`,
return listAgentInstances(cfg, cmd.String("runtime"), getUI(cmd))
},
},
{
// repair-perms: host-side ownership repair for one CRD child agent's Hermes
// data directory. Takes exactly one positional argument, the agent name.
Name: "repair-perms",
Usage: "Fix Hermes data PVC ownership for a CRD child agent (Linux k3d)",
ArgsUsage: "<name>",
Description: `Host-side chown for agent-<name>/hermes-data so the child Hermes pod can write under /data.

Use when a factory-spawned or CLI-created child agent crash-loops with
Permission denied under /data/.hermes on Linux k3d (KubeletInUserNamespace).

This is a no-op on backends where in-pod init chown already works.`,
Action: func(ctx context.Context, cmd *cli.Command) error {
// Require exactly one positional argument; extra arguments are rejected
// with the same usage hint as a missing one.
if cmd.NArg() != 1 {
return fmt.Errorf("agent name required: obol agent repair-perms <name>")
}
// Surrounding whitespace is trimmed before the name is validated.
name := strings.TrimSpace(cmd.Args().First())
if err := agentcrd.ValidateName(name); err != nil {
return err
}
// NOTE(review): the error returned by EnsureCluster is discarded and
// replaced by a fixed hint, so the underlying cause is not shown to the
// user — confirm this matches the other subcommands before changing it.
if err := kubectl.EnsureCluster(cfg); err != nil {
return fmt.Errorf("Obol Stack is not running. Start it with `obol stack up` first")
}
u := getUI(cmd)
// EnsureCRDAgentHermesPVCOwnership has no return value, so the success
// line below is printed unconditionally.
// NOTE(review): a failed chown would presumably surface only as a warning
// from the helper — verify whether success should be reported in that case.
hermes.EnsureCRDAgentHermesPVCOwnership(cfg, name, u)
u.Successf("Repaired Hermes data volume ownership for agent %s", name)
return nil
},
},
{
Name: "delete",
Usage: "Remove an agent instance and its cluster resources",
Expand Down
4 changes: 4 additions & 0 deletions cmd/obol/agent_crd.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (

"github.com/ObolNetwork/obol-stack/internal/agentcrd"
"github.com/ObolNetwork/obol-stack/internal/config"
"github.com/ObolNetwork/obol-stack/internal/hermes"
"github.com/ObolNetwork/obol-stack/internal/kubectl"
"github.com/ObolNetwork/obol-stack/internal/ui"
"github.com/urfave/cli/v3"
Expand Down Expand Up @@ -132,6 +133,9 @@ func createCRDAgent(cfg *config.Config, u *ui.UI, opts createCRDAgentOptions) er
}
u.Successf("Agent %s/%s %s", agentcrd.Namespace(opts.Name), opts.Name, action)
u.Infof("Reconciler will provision: namespace → %s deployment → status updates", "hermes")
// Host-side chown for agent-<name>/hermes-data (#475). Master-agent Sync
// already does this for hermes-obol-agent; CRD children skip helm Sync.
hermes.EnsureCRDAgentHermesPVCOwnership(cfg, opts.Name, u)
u.Infof("Inspect: kubectl get agent %s -n %s -o yaml", opts.Name, agentcrd.Namespace(opts.Name))
return nil
}
Expand Down
1 change: 1 addition & 0 deletions cmd/obol/sell_agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ func runAgentBackedDemo(
}
u.Successf("Agent %s/%s created (skills: %s)", agentcrd.Namespace(agentName), agentName,
strings.Join(spec.Agent.Skills, ", "))
hermes.EnsureCRDAgentHermesPVCOwnership(cfg, agentName, u)
}

// 2. Build and apply the agent-typed ServiceOffer.
Expand Down
11 changes: 9 additions & 2 deletions internal/agentcrd/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,19 @@ func Namespace(name string) string {
return "agent-" + name
}

// HermesDataPVCHostPath returns the host-side directory that backs a child
// agent's Hermes data PVC: <DataDir>/agent-<name>/<data PVC name>.
// local-path-provisioner maps the PVC to this directory, and HostHomePath
// nests the agent's home directory beneath it.
func HermesDataPVCHostPath(cfg *config.Config, name string) string {
	agentNamespace := Namespace(name)
	pvcName := agentruntime.Describe(agentruntime.Hermes).DataPVCName
	return filepath.Join(cfg.DataDir, agentNamespace, pvcName)
}

// HostHomePath is where the agent's Hermes home directory lives on the host:
// the runtime's HomeDir nested under the directory returned by
// HermesDataPVCHostPath. The cluster mounts this into the Hermes pod via
// hostPath; writing SOUL.md/skills here puts them inside the pod
// automatically.
func HostHomePath(cfg *config.Config, name string) string {
	// Build on HermesDataPVCHostPath so the PVC directory layout is defined in
	// exactly one place.
	homeDir := agentruntime.Describe(agentruntime.Hermes).HomeDir
	return filepath.Join(HermesDataPVCHostPath(cfg, name), homeDir)
}

// HostSkillsPath is the per-agent skills dir. OBOL_SKILLS_DIR points here
Expand Down
13 changes: 13 additions & 0 deletions internal/agentcrd/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,19 @@ func TestBuildAgent_Populated(t *testing.T) {
}
}

// TestHermesDataPVCHostPath pins the host-side layout of a child agent's
// Hermes data directory and checks that HostHomePath nests .hermes under it.
func TestHermesDataPVCHostPath(t *testing.T) {
	const (
		agentName = "quant-rc4"
		want      = "/data/obol/agent-quant-rc4/hermes-data"
	)
	cfg := &config.Config{DataDir: "/data/obol"}

	if got := HermesDataPVCHostPath(cfg, agentName); got != want {
		t.Fatalf("HermesDataPVCHostPath = %q, want %q", got, want)
	}

	// The home directory must sit directly below the PVC directory.
	if home := HostHomePath(cfg, agentName); home != want+"/.hermes" {
		t.Fatalf("HostHomePath = %q, want %q/.hermes", home, want)
	}
}

func TestSeedHostFiles_FreshAgent(t *testing.T) {
dir := t.TempDir()
cfg := &config.Config{DataDir: dir}
Expand Down
13 changes: 13 additions & 0 deletions internal/embed/skills/agent-factory/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,16 @@ python3 scripts/factory.py create medical-advisor \
- The profile seed Secret is named `hermes-profile-seed` and contains `profile.tar.gz`.
- Runtime environment overrides go in the optional `hermes-env` Secret.
- The factory intentionally writes deterministic resource names only.

### Linux k3d: fix Hermes PVC permissions after create

On Linux k3d (`KubeletInUserNamespace`), the child Hermes pod may crash-loop with
`Permission denied` under `/data/.hermes` until the host-side data directory is
chowned. The factory runs in-cluster and cannot do that itself — run from the host:

```bash
obol agent repair-perms <child-name>
```

`obol agent new` and `obol sell demo quant` run this repair automatically.
Agents created only through the factory need the command above; it performs the
same fix as the manual `docker exec` chown workaround from rc1.
56 changes: 30 additions & 26 deletions internal/hermes/hermes.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"path/filepath"
"strings"

"github.com/ObolNetwork/obol-stack/internal/agentcrd"
"github.com/ObolNetwork/obol-stack/internal/agentruntime"
"github.com/ObolNetwork/obol-stack/internal/config"
"github.com/ObolNetwork/obol-stack/internal/dns"
Expand Down Expand Up @@ -1306,10 +1307,12 @@ func fixRuntimeVolumeOwnership(cfg *config.Config, hostPath string, u *ui.UI) {
owner := fmt.Sprintf("%d:%d", containerUID, containerGID)
switch backendName {
case "k3d":
if err := k3dNodeExec(cfg, hostPath, "chown -R "+owner+" {}"); err != nil && u != nil {
shellCmd := fmt.Sprintf("mkdir -p {} && chown -R %s {}", owner)
if err := k3dNodeExec(cfg, hostPath, shellCmd); err != nil && u != nil {
u.Warnf("Failed to fix volume ownership for %s: %v", hostPath, err)
}
default:
_ = os.MkdirAll(hostPath, 0o755)
_ = os.Chown(hostPath, containerUID, containerGID)
}
}
Expand All @@ -1333,11 +1336,10 @@ func hermesPVCPaths(cfg *config.Config, id string) []string {
}
}

// ensureHermesPVCOwnership host-side chowns the Hermes PVC backing directories
// to containerUID:containerGID so the agent's init containers can write under
// /data on the first start.
// EnsureHermesDataPVCOwnership host-side chowns a Hermes data PVC backing dir
// to containerUID:containerGID so the agent can write under /data on first start.
//
// Why this is needed (issue #475):
// Why this is needed (issue #475, extended to agent-* namespaces in #481 follow-up):
// - The embedded k3d config (internal/embed/k3d-config.yaml) sets
// KubeletInUserNamespace=true. Pod "root" maps to a host subuid that
// lacks chown authority over the host bind-mount path provisioned by
Expand All @@ -1352,38 +1354,28 @@ func hermesPVCPaths(cfg *config.Config, id string) []string {
// k3d server container runs at the host Docker daemon's authority, which is
// real root and is not subject to the kubelet's user-namespacing.
//
// Best-effort. Waits up to 60s for each PVC to be Bound (local-path uses
// WaitForFirstConsumer, so the host dir doesn't exist until the consuming
// Best-effort. Waits up to 60s for the PVC to be Bound (local-path uses
// WaitForFirstConsumer, so the host dir may not exist until the consuming
// pod is scheduled). On non-k3d backends fixRuntimeVolumeOwnership falls
// back to a plain os.Chown.
//
// If a Hermes pod is currently stuck in Init:CrashLoopBackOff because of the
// pre-fix permissions, deletes it so kubelet re-creates with the corrected
// perms immediately rather than after exponential backoff (up to ~5 min).
// Skips the delete when no pod is stuck so repeated `Sync` calls
// (e.g. `obol model sync` after `obol model prefer`) do not gratuitously
// Skips the delete when no pod is stuck so repeated calls do not gratuitously
// restart a healthy agent.
func ensureHermesPVCOwnership(cfg *config.Config, id string, u *ui.UI) {
namespace := agentruntime.Namespace(agentruntime.Hermes, id)
func EnsureHermesDataPVCOwnership(cfg *config.Config, namespace, hostPath string, u *ui.UI) {
kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml")
kubectlBin := filepath.Join(cfg.BinDir, "kubectl")

// Wait only for the PVCs hermesPVCPaths chowns. remote-signer-keystores
// is intentionally NOT in this loop — see the doc comment on
// hermesPVCPaths for why.
for _, pvc := range []string{
agentruntime.Describe(agentruntime.Hermes).DataPVCName,
} {
waitCmd := exec.Command(kubectlBin,
"wait", "--for=jsonpath={.status.phase}=Bound",
"--timeout=60s", "pvc/"+pvc, "-n", namespace)
waitCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath)
_ = waitCmd.Run() // best-effort; continue even on timeout
}
pvcName := agentruntime.Describe(agentruntime.Hermes).DataPVCName
waitCmd := exec.Command(kubectlBin,
"wait", "--for=jsonpath={.status.phase}=Bound",
"--timeout=60s", "pvc/"+pvcName, "-n", namespace)
waitCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath)
_ = waitCmd.Run() // best-effort; continue even on timeout

for _, p := range hermesPVCPaths(cfg, id) {
fixRuntimeVolumeOwnership(cfg, p, u)
}
fixRuntimeVolumeOwnership(cfg, hostPath, u)

if hermesInitStuck(cfg, namespace) {
deleteCmd := exec.Command(kubectlBin,
Expand All @@ -1397,6 +1389,18 @@ func ensureHermesPVCOwnership(cfg *config.Config, id string, u *ui.UI) {
}
}

// EnsureCRDAgentHermesPVCOwnership runs EnsureHermesDataPVCOwnership for a
// serviceoffer-controller child agent, using the agent-<name> namespace and
// the host directory behind that agent's Hermes data PVC. Call it after
// `obol agent new`, `obol sell demo quant`, or when repairing factory-spawned
// children that crash-loop on Linux k3d.
func EnsureCRDAgentHermesPVCOwnership(cfg *config.Config, agentName string, u *ui.UI) {
	childNamespace := agentcrd.Namespace(agentName)
	dataHostPath := agentcrd.HermesDataPVCHostPath(cfg, agentName)
	EnsureHermesDataPVCOwnership(cfg, childNamespace, dataHostPath, u)
}

// ensureHermesPVCOwnership runs EnsureHermesDataPVCOwnership for the Hermes
// instance identified by id, using the instance's runtime namespace and the
// first host path reported by hermesPVCPaths.
//
// It is best-effort like the helper it wraps: when hermesPVCPaths reports no
// paths there is nothing to chown, so it returns instead of panicking on an
// out-of-range index.
func ensureHermesPVCOwnership(cfg *config.Config, id string, u *ui.UI) {
	paths := hermesPVCPaths(cfg, id)
	if len(paths) == 0 {
		return
	}
	// NOTE(review): only the first path is handled; if hermesPVCPaths ever
	// returns more than one entry the rest are not chowned here — confirm.
	EnsureHermesDataPVCOwnership(cfg, agentruntime.Namespace(agentruntime.Hermes, id), paths[0], u)
}

// hermesInitStuck reports whether at least one Hermes pod has an init
// container in CrashLoopBackOff or an Error waiting state — the signature of
// the perm-denied symptom this fix targets. Returns false on any kubectl
Expand Down
44 changes: 44 additions & 0 deletions plans/crd-agent-hermes-pvc-chown.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# CRD child agent Hermes PVC ownership (Linux k3d)

## Problem

PR #481 (`ensureHermesPVCOwnership`) runs only after `hermes.Sync` for the **master**
agent (`hermes-obol-agent`). Child agents provisioned by the serviceoffer-controller
use namespace `agent-<name>` and never go through helm Sync.

On Linux k3d with `KubeletInUserNamespace`, local-path-provisioner creates
`<DataDir>/agent-<name>/hermes-data` owned by `1000:1000`. Hermes runs as
`10000:10000` and cannot write to `/data/.hermes/home`, so the pod crash-loops in Init.

Factory-spawned children (`agent-factory` skill, in-cluster API) hit the same gap:
they are created from inside the cluster, so the host-side `obol` seed and chown steps never run for them.

## Fix (branch `fix/crd-agent-hermes-pvc-chown`)

| Call site | When |
|-----------|------|
| `hermes.EnsureCRDAgentHermesPVCOwnership` | Shared helper: waits for the PVC to be Bound, then runs `mkdir -p` + `chown -R 10000:10000` via k3d `docker exec` |
| `obol agent new` | After Agent CR apply |
| `obol sell demo quant` | After new Agent CR apply |
| `obol agent repair-perms <name>` | Manual repair after `agent-factory` create |

Master agent path unchanged: `ensureHermesPVCOwnership` → `EnsureHermesDataPVCOwnership` for `hermes-<id>`.

## Operator workaround (pre-fix / factory-only)

```bash
obol agent repair-perms <child-name>
# or legacy rc1 style:
docker exec k3d-obol-stack-<stack-id>-server-0 \
chown -R 10000:10000 /data/agent-<name>/hermes-data
```

## Verify

```bash
kubectl get pods -n agent-<name>
# hermes should reach Running, not Init:CrashLoopBackOff

obol sell status <offer> -n agent-<name>
# Ready=True once Hermes is healthy
```
Loading