From 3358c0e7748309f73e60c22ca07b9c0c3cdb9960 Mon Sep 17 00:00:00 2001 From: kimizuka Date: Wed, 3 Jun 2026 23:03:42 +0900 Subject: [PATCH 1/4] feat(hooks): expose per-turn usage and cost in after_llm_call Forward the per-call token usage and computed USD cost to the after_llm_call hook payload so sidecar cost ledgers can record per-call spend from the payload alone, without subscribing to the runtime event channel. Cost is a *float64 so the wire contract can distinguish an unpriced model (nil, key absent) from a priced free call (pointer to 0). The per-turn cost is computed once in computeMessageCost and threaded into both the hook payload and the recorded assistant message, so the two can never disagree. For harness agents the cost is surfaced only when the harness reported a non-zero value, avoiding reporting a billed turn as free when a harness omits its cost (e.g. codex). Signed-off-by: kimizuka --- pkg/hooks/types.go | 36 +++++++++++++++++++++++++++ pkg/runtime/harness.go | 15 +++++++++++- pkg/runtime/hooks.go | 13 +++++++--- pkg/runtime/loop.go | 55 +++++++++++++++++++++++++++++++++--------- 4 files changed, 103 insertions(+), 16 deletions(-) diff --git a/pkg/hooks/types.go b/pkg/hooks/types.go index 06e16be3f..f6a602ef0 100644 --- a/pkg/hooks/types.go +++ b/pkg/hooks/types.go @@ -68,6 +68,12 @@ const ( EventBeforeLLMCall EventType = "before_llm_call" // EventAfterLLMCall fires immediately after a successful model call, // before the response is recorded. Failed calls fire EventOnError. + // The Input carries the response text in [Input.StopResponse] + // (matching the stop event), the model that produced it in + // [Input.ModelID], and per-turn billing data in [Input.Usage] and + // [Input.Cost] so sidecar cost ledgers can record per-call spend + // from the payload alone, without subscribing to the runtime event + // channel. EventAfterLLMCall EventType = "after_llm_call" // EventSessionEnd fires when a session terminates. 
EventSessionEnd EventType = "session_end" @@ -293,6 +299,36 @@ type Input struct { ApprovalDecision string `json:"approval_decision,omitempty"` ApprovalSource string `json:"approval_source,omitempty"` + // AfterLLMCall specific: per-turn token usage and the computed USD + // cost of the model response the runtime just received. Both are + // populated only for [EventAfterLLMCall] and are nil for every + // other event. They are the hook-side counterpart of the runtime's + // internal TokenUsageEvent and let sidecar cost ledgers record + // per-call spend from the payload alone. + // + // Usage is a pointer so a handler can distinguish "the provider + // reported no usage" (nil) from "usage was zero". + // + // Cost is a *float64 with three meaningful states, mirroring the + // runtime's own pricing gate (usage present AND a model definition + // with a pricing table): + // - nil → unpriced: the model has no pricing data on file + // (unknown model ID, custom endpoint without cost + // config) or the provider reported no usage. With + // omitempty the "cost" key is absent on the wire. + // - 0 → a priced model whose computed cost is genuinely zero + // (a free call). Emitted as "cost": 0, NOT elided — + // omitempty on a pointer drops only nil, never a + // non-nil pointer to the zero value. + // - non-0 → the priced USD cost of this single response. + // A handler therefore reads a present "cost" as authoritative and + // an absent one as "unpriced", with no need to cross-check usage. + // (This is deliberately a *float64, unlike [chat.Message.Cost], + // which is a plain float64 with omitempty and so cannot distinguish + // a free priced call from an unpriced one on the wire.) + Usage *chat.Usage `json:"usage,omitempty"` + Cost *float64 `json:"cost,omitempty"` + // Compaction fields (BeforeCompaction, AfterCompaction). 
InputTokens int64 `json:"input_tokens,omitempty"` OutputTokens int64 `json:"output_tokens,omitempty"` diff --git a/pkg/runtime/harness.go b/pkg/runtime/harness.go index c48b380f8..f2de19cb0 100644 --- a/pkg/runtime/harness.go +++ b/pkg/runtime/harness.go @@ -189,7 +189,20 @@ func (r *LocalRuntime) runHarnessAgent(ctx context.Context, sess *session.Sessio content = strings.TrimSpace(finalResult) } - r.executeAfterLLMCallHooks(ctx, sess, a, modelID, content) + // A harness reports its own TotalCostUSD, which the harness + // library defaults to 0 whenever the harness output omits a cost + // (e.g. the codex harness never reports one). That 0 is + // indistinguishable from a genuinely free call, so — to avoid + // telling a cost ledger that a billed turn was free — surface cost + // only when the harness reported a non-zero value and leave it nil + // (unpriced) otherwise. This keeps the wire contract honest: a + // present cost is always a real reported figure. + var hookCost *float64 + if cost != 0 { + c := cost + hookCost = &c + } + r.executeAfterLLMCallHooks(ctx, sess, a, modelID, content, usage, hookCost) r.recordHarnessAssistantMessage(sess, a, content, modelID, usage, cost, events) r.executeStopHooks(ctx, sess, a, content, events) diff --git a/pkg/runtime/hooks.go b/pkg/runtime/hooks.go index cca582d90..4431afb04 100644 --- a/pkg/runtime/hooks.go +++ b/pkg/runtime/hooks.go @@ -441,15 +441,22 @@ func (r *LocalRuntime) executeBeforeLLMCallHooks( // model call, before the response is recorded into the session and // tool calls are dispatched. The assistant text content is passed via // stop_response (matching the stop event), so handlers can reuse the -// same parsing logic. Failed model calls fire on_error instead and -// skip this event. -func (r *LocalRuntime) executeAfterLLMCallHooks(ctx context.Context, sess *session.Session, a *agent.Agent, modelID, responseContent string) { +// same parsing logic. 
The per-turn token usage and computed USD cost +// are forwarded via [hooks.Input.Usage] and [hooks.Input.Cost] so +// sidecar cost ledgers can record per-call spend from the payload +// alone. cost is a *float64 so an unpriced model (nil) is distinct on +// the wire from a priced free call (a pointer to 0); the caller owns +// that distinction. Failed model calls fire on_error instead and skip +// this event. +func (r *LocalRuntime) executeAfterLLMCallHooks(ctx context.Context, sess *session.Session, a *agent.Agent, modelID, responseContent string, usage *chat.Usage, cost *float64) { r.dispatchHook(ctx, a, hooks.EventAfterLLMCall, &hooks.Input{ SessionID: sess.ID, AgentName: a.Name(), ModelID: modelID, StopResponse: responseContent, LastUserMessage: sess.GetLastUserMessageContent(), + Usage: usage, + Cost: cost, }, nil) } diff --git a/pkg/runtime/loop.go b/pkg/runtime/loop.go index 84b66129a..3548ffd4a 100644 --- a/pkg/runtime/loop.go +++ b/pkg/runtime/loop.go @@ -373,8 +373,8 @@ func (r *LocalRuntime) runStreamLoop(ctx context.Context, sess *session.Session, // the actual inference context), then falls back to the models.dev // catalogue. The lookup above is reused inside resolveContextLimit // only when context_size isn't supplied; we keep the explicit call - // here because m is also threaded into [recordAssistantMessage] for - // per-message cost computation. + // here because m is also passed to [computeMessageCost] for + // per-turn cost computation. contextLimit := r.resolveContextLimit(ctx, model, modelID) if contextLimit > 0 && r.sessionCompaction && compaction.ShouldCompact(sess.InputTokens, sess.OutputTokens, 0, contextLimit) { r.compactWithReason(ctx, sess, "", compactionReasonThreshold, sink) @@ -565,11 +565,20 @@ func (r *LocalRuntime) runTurn( // A successful model call resets the overflow compaction counter. 
ls.overflowCompactions = 0 + // Compute the per-turn cost once, here, so the exact same value + // reaches both the after_llm_call hook payload and the recorded + // assistant message — the hook's cost is therefore guaranteed to + // equal the cost the session bills for this turn. It is nil when + // the turn cannot be priced (no usage, or a model with no pricing + // table); see computeMessageCost. + msgCost := computeMessageCost(res.Usage, m) + // after_llm_call hooks fire on success only; failed calls // fire on_error above. The assistant text content is passed // via stop_response, matching the stop event's payload, so - // handlers can reuse the same parsing. - r.executeAfterLLMCallHooks(ctx, sess, a, modelID.String(), res.Content) + // handlers can reuse the same parsing. Usage and Cost carry the + // per-turn billing data for sidecar cost ledgers. + r.executeAfterLLMCallHooks(ctx, sess, a, modelID.String(), res.Content, res.Usage, msgCost) if usedModel != nil && usedModel.ID() != model.ID() { slog.InfoContext(ctx, "Used fallback model", "agent", a.Name(), "primary", model.ID().String(), "used", usedModel.ID().String()) @@ -583,7 +592,7 @@ func (r *LocalRuntime) runTurn( endStreamSpan() slog.DebugContext(ctx, "Stream processed", "agent", a.Name(), "tool_calls", len(res.Calls), "content_length", len(res.Content), "stopped", res.Stopped) - msgUsage := r.recordAssistantMessage(sess, a, res, agentTools, modelID.String(), m, events) + msgUsage := r.recordAssistantMessage(sess, a, res, agentTools, modelID.String(), msgCost, events) usage := SessionUsage(sess, contextLimit) usage.LastMessage = msgUsage @@ -701,16 +710,39 @@ func (r *LocalRuntime) Run(ctx context.Context, sess *session.Session) ([]sessio return sess.GetAllMessages(), nil } +// computeMessageCost returns the USD cost of a single model response, +// or nil when the response cannot be priced. 
It is nil when there is +// no usage to price (usage == nil) or the model has no pricing table +// (m == nil — e.g. an unknown model ID or a custom endpoint without +// cost config — or m.Cost == nil). A non-nil result of 0 therefore +// means "priced, but this call was free", distinct from "unpriced" +// (nil). This single arithmetic source feeds both the persisted +// assistant message (dereferenced to 0 when nil) and the +// after_llm_call hook payload (which keeps the nil/0 distinction), so +// the two can never disagree. +func computeMessageCost(usage *chat.Usage, m *modelsdev.Model) *float64 { + if usage == nil || m == nil || m.Cost == nil { + return nil + } + cost := (float64(usage.InputTokens)*m.Cost.Input + + float64(usage.OutputTokens)*m.Cost.Output + + float64(usage.CachedInputTokens)*m.Cost.CacheRead + + float64(usage.CacheWriteTokens)*m.Cost.CacheWrite) / 1e6 + return &cost +} + // recordAssistantMessage adds the model's response to the session and returns // per-message usage information for the token-usage event. Empty responses // (no text and no tool calls) are silently skipped since providers reject them. +// cost is the precomputed per-turn cost (see computeMessageCost); nil records +// as 0, matching the previous "no pricing data" behaviour. func (r *LocalRuntime) recordAssistantMessage( sess *session.Session, a *agent.Agent, res streamResult, agentTools []tools.Tool, modelID string, - m *modelsdev.Model, + cost *float64, events EventSink, ) *MessageUsage { if strings.TrimSpace(res.Content) == "" && len(res.Calls) == 0 { @@ -732,13 +764,12 @@ func (r *LocalRuntime) recordAssistantMessage( } } - // Calculate per-message cost when pricing information is available. + // The per-turn cost was computed once in runTurn and threaded in; + // nil means the response could not be priced and records as 0, + // preserving the previous "no pricing data" behaviour. 
var messageCost float64 - if res.Usage != nil && m != nil && m.Cost != nil { - messageCost = (float64(res.Usage.InputTokens)*m.Cost.Input + - float64(res.Usage.OutputTokens)*m.Cost.Output + - float64(res.Usage.CachedInputTokens)*m.Cost.CacheRead + - float64(res.Usage.CacheWriteTokens)*m.Cost.CacheWrite) / 1e6 + if cost != nil { + messageCost = *cost } messageModel := modelID From 615970a0b1ad2b386a417ca40269ae655c954908 Mon Sep 17 00:00:00 2001 From: kimizuka Date: Wed, 3 Jun 2026 23:03:48 +0900 Subject: [PATCH 2/4] test(runtime): cover after_llm_call usage and cost payload Verify that after_llm_call populates usage and cost, that cost is nil when the model is unpriced, the nil-vs-zero JSON contract, harness usage with no cost surfacing as unpriced, and computeMessageCost. Signed-off-by: kimizuka --- pkg/runtime/after_llm_call_test.go | 260 +++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) diff --git a/pkg/runtime/after_llm_call_test.go b/pkg/runtime/after_llm_call_test.go index 2f0f519d8..7a76994ad 100644 --- a/pkg/runtime/after_llm_call_test.go +++ b/pkg/runtime/after_llm_call_test.go @@ -2,6 +2,9 @@ package runtime import ( "context" + "encoding/json" + "os" + stdruntime "runtime" "sync/atomic" "testing" @@ -9,12 +12,28 @@ import ( "github.com/stretchr/testify/require" "github.com/docker/docker-agent/pkg/agent" + "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/config/latest" "github.com/docker/docker-agent/pkg/hooks" + "github.com/docker/docker-agent/pkg/modelsdev" "github.com/docker/docker-agent/pkg/session" "github.com/docker/docker-agent/pkg/team" ) +// mockModelStoreWithCost returns a model carrying a fixed pricing +// table so after_llm_call can compute a non-nil per-turn cost. The +// zero mockModelStore returns a nil model, which exercises the +// unpriced (nil cost) path instead. 
+type mockModelStoreWithCost struct { + ModelStore + cost modelsdev.Cost +} + +func (m mockModelStoreWithCost) GetModel(_ context.Context, _ modelsdev.ID) (*modelsdev.Model, error) { + c := m.cost + return &modelsdev.Model{Cost: &c}, nil +} + // TestAfterLLMCallHook_PopulatesModelID is a regression test for the // doc/impl mismatch where [hooks.Input.ModelID] is documented as // populated for after_llm_call but executeAfterLLMCallHooks never @@ -74,3 +93,244 @@ func TestAfterLLMCallHook_PopulatesModelID(t *testing.T) { "after_llm_call payload must include the canonical model id; "+ "see pkg/hooks/types.go:177-186 for the documented contract") } + +// captureAfterLLMCall runs a single successful turn against the given +// model store and returns the after_llm_call payload the runtime +// dispatched, together with the session so callers can cross-check the +// hook cost against what the session recorded. Usage is fixed at 10 +// input / 5 output tokens so callers can assert an exact computed cost. +func captureAfterLLMCall(t *testing.T, store ModelStore) (*hooks.Input, *session.Session) { + t.Helper() + + const hookName = "test-after-llm-usage-cost" + + var captured atomic.Pointer[hooks.Input] + + stream := newStreamBuilder(). + AddContent("ok"). + AddStopWithUsage(10, 5). 
+ Build() + prov := &mockProvider{id: "test/mock-model", stream: stream} + + root := agent.New("root", "test agent", + agent.WithModel(prov), + agent.WithHooks(&latest.HooksConfig{ + AfterLLMCall: []latest.HookDefinition{ + {Type: "builtin", Command: hookName}, + }, + }), + ) + tm := team.New(team.WithAgents(root)) + + rt, err := NewLocalRuntime(tm, + WithSessionCompaction(false), + WithModelStore(store), + ) + require.NoError(t, err) + + require.NoError(t, rt.hooksRegistry.RegisterBuiltin( + hookName, + func(_ context.Context, in *hooks.Input, _ []string) (*hooks.Output, error) { + snap := *in + captured.Store(&snap) + return nil, nil + }, + )) + + sess := session.New(session.WithUserMessage("hi")) + sess.Title = "Unit Test" + + for range rt.RunStream(t.Context(), sess) { + } + + got := captured.Load() + require.NotNil(t, got, "after_llm_call hook must fire on a successful turn") + return got, sess +} + +// TestAfterLLMCallHook_PopulatesUsageAndCost pins the priced-call +// contract: when the model has a pricing table, after_llm_call carries +// the provider's token usage and a non-nil Cost equal to the value the +// runtime records on the assistant message (same computeMessageCost +// call, threaded to both). +func TestAfterLLMCallHook_PopulatesUsageAndCost(t *testing.T) { + t.Parallel() + + rate := modelsdev.Cost{Input: 2.0, Output: 4.0} + in, sess := captureAfterLLMCall(t, mockModelStoreWithCost{cost: rate}) + + require.NotNil(t, in.Usage, "Usage must be populated on after_llm_call") + assert.Equal(t, int64(10), in.Usage.InputTokens) + assert.Equal(t, int64(5), in.Usage.OutputTokens) + + // Same arithmetic as computeMessageCost; inputs chosen for exact + // float64 representation so equality is reliable. 
+ expected := (float64(10)*rate.Input + float64(5)*rate.Output) / 1e6 + require.NotNil(t, in.Cost, "Cost must be non-nil for a priced model") + assert.Equal(t, expected, *in.Cost, + "hook Cost must equal computeMessageCost(usage, model)") + + // The headline guarantee: the cost the hook reports is the same + // cost the session bills for the turn. OwnCost sums the recorded + // assistant message's Cost, set from the same computeMessageCost + // value threaded into recordAssistantMessage. + assert.Equal(t, *in.Cost, sess.OwnCost(), + "hook Cost must equal the cost the session recorded for the turn") +} + +// TestAfterLLMCallHook_CostNilWhenUnpriced pins the unpriced contract: +// when the model has no pricing data (the zero mockModelStore returns a +// nil model), Usage is still populated but Cost is nil — the signal a +// sidecar reads as "this model is unpriced", distinct from a priced +// free call (a non-nil pointer to 0). +func TestAfterLLMCallHook_CostNilWhenUnpriced(t *testing.T) { + t.Parallel() + + in, _ := captureAfterLLMCall(t, mockModelStore{}) + + require.NotNil(t, in.Usage, + "Usage must still be populated even when the model is unpriced") + assert.Equal(t, int64(10), in.Usage.InputTokens) + assert.Nil(t, in.Cost, + "Cost must be nil for an unpriced model so handlers can "+ + "distinguish it from a priced free call (pointer to 0)") +} + +// TestAfterLLMCallInput_CostJSONContract pins the wire format sidecar +// scripts depend on. With Cost as a *float64 + omitempty: +// - nil → the "cost" key is absent (unpriced), +// - &0 → "cost": 0 is present, NOT elided (priced free call — +// omitempty drops only nil pointers, never a pointer to 0), +// - &N → "cost": N. +// +// The same nil-omitted rule applies to Usage, keeping every non- +// after_llm_call event's payload free of spurious cost/usage keys. 
+func TestAfterLLMCallInput_CostJSONContract(t *testing.T) { + t.Parallel() + + marshalKeys := func(in *hooks.Input) map[string]any { + b, err := json.Marshal(in) + require.NoError(t, err) + var m map[string]any + require.NoError(t, json.Unmarshal(b, &m)) + return m + } + + t.Run("unpriced omits cost and usage", func(t *testing.T) { + t.Parallel() + m := marshalKeys(&hooks.Input{HookEventName: hooks.EventAfterLLMCall}) + _, hasCost := m["cost"] + _, hasUsage := m["usage"] + assert.False(t, hasCost, "nil Cost must be omitted, not emitted as null") + assert.False(t, hasUsage, "nil Usage must be omitted") + }) + + t.Run("priced free call emits explicit zero", func(t *testing.T) { + t.Parallel() + zero := 0.0 + m := marshalKeys(&hooks.Input{ + HookEventName: hooks.EventAfterLLMCall, + Usage: &chat.Usage{InputTokens: 1, OutputTokens: 1}, + Cost: &zero, + }) + raw, hasCost := m["cost"] + require.True(t, hasCost, + "a non-nil pointer to 0 must emit \"cost\": 0, not be elided — "+ + "this is what distinguishes a free priced call from an unpriced model") + assert.Equal(t, float64(0), raw) + _, hasUsage := m["usage"] + assert.True(t, hasUsage, "Usage must be present when set") + }) + + t.Run("priced call emits the value", func(t *testing.T) { + t.Parallel() + v := 0.0125 + m := marshalKeys(&hooks.Input{HookEventName: hooks.EventAfterLLMCall, Cost: &v}) + assert.Equal(t, 0.0125, m["cost"]) + }) +} + +// TestAfterLLMCallHook_HarnessUsageWithoutCostIsUnpriced pins the +// harness cost gate. The codex harness reports token counts via +// turn.completed but never a cost, so the harness library's +// TotalCostUSD defaults to 0. That 0 must be treated as unpriced (nil +// cost on the hook), NOT as a free priced call (cost 0) — otherwise a +// cost ledger would record a real, billed harness turn as $0. 
+func TestAfterLLMCallHook_HarnessUsageWithoutCostIsUnpriced(t *testing.T) { + if stdruntime.GOOS == "windows" { + t.Skip("shell script shim test") + } + + const hookName = "test-after-llm-harness-cost" + + binDir := t.TempDir() + writeHarnessScript(t, binDir, "codex", `#!/bin/sh +printf '%s\n' '{"type":"item.completed","item":{"type":"agent_message","text":"harness done"}}' +printf '%s\n' '{"type":"turn.completed","usage":{"input_tokens":120,"output_tokens":30}}' +`) + t.Setenv("PATH", binDir+string(os.PathListSeparator)+os.Getenv("PATH")) + + var captured atomic.Pointer[hooks.Input] + + root := agent.New("root", "You are an external coder.", + agent.WithHarness(&latest.HarnessConfig{Type: "codex"}), + agent.WithHooks(&latest.HooksConfig{ + AfterLLMCall: []latest.HookDefinition{{Type: "builtin", Command: hookName}}, + }), + ) + rt, err := NewLocalRuntime(team.New(team.WithAgents(root)), + WithSessionCompaction(false), WithModelStore(mockModelStore{})) + require.NoError(t, err) + + require.NoError(t, rt.hooksRegistry.RegisterBuiltin( + hookName, + func(_ context.Context, in *hooks.Input, _ []string) (*hooks.Output, error) { + snap := *in + captured.Store(&snap) + return nil, nil + }, + )) + + sess := session.New(session.WithUserMessage("do the task")) + sess.Title = "Harness Unit Test" + for range rt.RunStream(t.Context(), sess) { + } + + in := captured.Load() + require.NotNil(t, in, "after_llm_call must fire for a harness turn") + require.NotNil(t, in.Usage, "harness usage must be forwarded to the hook") + assert.Equal(t, int64(120), in.Usage.InputTokens) + assert.Equal(t, int64(30), in.Usage.OutputTokens) + assert.Nil(t, in.Cost, + "a harness that reports no cost must yield nil cost (unpriced), not 0 (free)") +} + +// TestComputeMessageCost unit-tests the single cost-arithmetic source +// shared by the persisted message and the after_llm_call payload, +// including every branch that yields nil (unpriced). 
+func TestComputeMessageCost(t *testing.T) { + t.Parallel() + + rate := &modelsdev.Cost{Input: 2.0, Output: 4.0, CacheRead: 1.0, CacheWrite: 5.0} + + t.Run("nil usage is unpriced", func(t *testing.T) { + t.Parallel() + assert.Nil(t, computeMessageCost(nil, &modelsdev.Model{Cost: rate})) + }) + t.Run("nil model is unpriced", func(t *testing.T) { + t.Parallel() + assert.Nil(t, computeMessageCost(&chat.Usage{InputTokens: 1}, nil)) + }) + t.Run("model without pricing table is unpriced", func(t *testing.T) { + t.Parallel() + assert.Nil(t, computeMessageCost(&chat.Usage{InputTokens: 1}, &modelsdev.Model{})) + }) + t.Run("priced computes from all token classes", func(t *testing.T) { + t.Parallel() + usage := &chat.Usage{InputTokens: 10, OutputTokens: 5, CachedInputTokens: 4, CacheWriteTokens: 2} + got := computeMessageCost(usage, &modelsdev.Model{Cost: rate}) + require.NotNil(t, got) + expected := (10*rate.Input + 5*rate.Output + 4*rate.CacheRead + 2*rate.CacheWrite) / 1e6 + assert.Equal(t, expected, *got) + }) +} From 8a4eb0b59ec98508dc0e006f1b27eeb9e3ff8e1a Mon Sep 17 00:00:00 2001 From: kimizuka Date: Wed, 3 Jun 2026 23:03:48 +0900 Subject: [PATCH 3/4] docs(hooks): document after_llm_call usage and cost fields Describe the new usage and cost fields, the priced/unpriced/free semantics and the harness caveat, and add a per-call cost-ledger example to examples/hooks.yaml. 
Signed-off-by: kimizuka --- docs/configuration/hooks/index.md | 7 +++++-- examples/hooks.yaml | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/docs/configuration/hooks/index.md b/docs/configuration/hooks/index.md index cc635644e..a4a11744b 100644 --- a/docs/configuration/hooks/index.md +++ b/docs/configuration/hooks/index.md @@ -259,7 +259,7 @@ In addition to the common fields, each event ships its own payload: | `turn_start` | _none_ (just the common fields) | | `turn_end` | `agent_name`, `reason` — one of `normal`, `continue`, `steered`, `error`, `canceled`, `hook_blocked`, `loop_detected` | | `before_llm_call` | `iteration` — 1-based run-loop iteration counter (the model call this hook is gating), `model_id` | -| `after_llm_call` | `agent_name`, `stop_response`, `last_user_message`, `model_id` | +| `after_llm_call` | `agent_name`, `stop_response`, `last_user_message`, `model_id`, `usage`, `cost` | | `session_end` | `reason` — one of `clear`, `logout`, `prompt_input_exit`, `other` | | `pre_compact` | `source` — one of `manual`, `auto`, `overflow`, `tool_overflow` | | `before_compaction` | `input_tokens`, `output_tokens`, `context_limit`, `compaction_reason` (one of `threshold`/`overflow`/`manual`) | @@ -281,6 +281,9 @@ Notes: - `prompt` is only populated for `user_prompt_submit`. Sub-sessions (transferred tasks, background agents, skills) do **not** fire this event because their kick-off message is synthesised by the runtime, not authored by the user. - `stop_response` carries the model's final assistant text for `stop`, `after_llm_call`, and `subagent_stop`. `last_user_message` carries the latest user message at dispatch time. - `model_id` is populated for `after_llm_call` (and `before_llm_call`) in the canonical `<provider>/<model>` form (e.g. `anthropic/claude-sonnet-4-5`). For harness agents, `model_id` is the harness label (e.g. 
`claude-code`) rather than a canonical model name — see [Coding Harnesses]({{ '/features/harnesses/' | relative_url }}). +- `usage` and `cost` are populated for `after_llm_call` only. `usage` is the per-call token usage object (`input_tokens`, `output_tokens`, `cached_input_tokens`, `cached_write_tokens`, and `reasoning_tokens` — the last is itself omitted for non-reasoning models); the whole object is absent when the provider reported no usage. `cost` is the USD price of that one model response. For a **native model call** it is the price computed from `usage` and the model's pricing table, and equals the cost the session records for the turn: it is **absent** when the response is unpriced (no pricing data on file, or no usage) and an explicit `0` for a priced call that was free — so a present `cost` is authoritative and an absent one means "unpriced", with no need to cross-check `usage`. (For harness agents the meaning differs — see the next note.) A cost ledger can therefore record per-call spend from the payload alone, without subscribing to the runtime event channel. +- For [harness agents]({{ '/features/harnesses/' | relative_url }}), `cost` is the harness's own reported total for the call rather than a computed price, and is present only when the harness reported a non-zero cost (some harnesses, e.g. `codex`, report token counts but no cost — those turns carry `usage` with `cost` absent, even though the recorded message stores `0`). +- `after_llm_call` fires for **every successful** model call (failed calls fire `on_error` instead), including calls made inside sub-sessions (transferred tasks, background agents, skills). For those, `session_id` is the sub-session's id. Summing `cost` across `after_llm_call` events therefore captures all **priced** spend, including sub-sessions (and even sub-sessions that error before their cost is persisted); events whose `cost` is absent (unpriced models, harness turns that report no cost) add nothing to the sum, so account for those from `usage` if they matter. 
Do **not** add a separately-queried session cost total on top: the runtime's own total already recurses into and includes completed sub-session spend, so combining the two double-counts. Pick one source — the summed hook costs — as the authoritative ledger. - `context_limit` is `0` when the model definition is unavailable (treat `0` as "unknown", not as a real limit). - `approval_decision` is one of `allow`, `deny`, `canceled`. `approval_source` is a stable classifier of which step decided (e.g. `yolo`, `session_permissions_allow`, `session_permissions_deny`, `team_permissions_allow`, `team_permissions_deny`, `pre_tool_use_hook_allow`, `pre_tool_use_hook_deny`, `readonly_hint`, `user_approved`, `user_approved_session`, `user_approved_tool`, `user_rejected`, `context_canceled`). @@ -552,7 +555,7 @@ The `reason` field classifies the exit: `before_llm_call` fires immediately before every model call (after `turn_start` has assembled the messages). It cannot contribute context — use `turn_start` for that — but it can **stop the run** by returning `decision: block` (or exit code 2). The built-in `max_iterations` hook implements a hard cap on top of this event. -`after_llm_call` fires immediately after each successful model call, before the response is recorded into the session and tool calls are dispatched. The assistant text is in `stop_response`. Use it for response auditing, redaction logging, or quality metrics. Failed model calls fire `on_error` instead. +`after_llm_call` fires immediately after each successful model call, before the response is recorded into the session and tool calls are dispatched. The assistant text is in `stop_response`, and the call's `usage` and `cost` carry the per-turn token usage and computed USD spend (see the field notes above). Use it for response auditing, redaction logging, quality metrics, or a sidecar cost ledger that records per-call spend without subscribing to the runtime event channel. Failed model calls fire `on_error` instead. 
### Before/After-Compaction: structured compaction control diff --git a/examples/hooks.yaml b/examples/hooks.yaml index 5ac38c416..f584b2921 100644 --- a/examples/hooks.yaml +++ b/examples/hooks.yaml @@ -65,6 +65,7 @@ # /tmp/agent-session.log (session_start, session_end) # /tmp/agent-prompts.log (user_prompt_submit) # /tmp/agent-llm-calls.log (before_llm_call, after_llm_call) +# /tmp/agent-cost-ledger.csv (after_llm_call: per-call token usage + cost) # /tmp/agent-turns.log (turn_end) # /tmp/agent-tool-results.log (post_tool_use) # /tmp/agent-permissions.log (permission_request) @@ -277,6 +278,14 @@ agents: # assistant text content arrives via stop_response (matching the # stop event's payload). Failed calls fire on_error instead and # skip this event. + # + # The payload also carries this call's token usage in .usage and its + # computed USD cost in .cost. .cost is ABSENT for an unpriced model + # (test with `has("cost")`) and an explicit 0 for a priced free call, + # so a present cost is authoritative without checking usage. That is + # everything a sidecar cost ledger needs — no event-channel wiring. + # after_llm_call also fires for sub-session turns (each with its own + # session_id), so summing .cost is the full spend for the run. # ==================================================================== after_llm_call: - type: command @@ -286,6 +295,12 @@ agents: SESSION_ID=$(echo "$INPUT" | jq -r '.session_id // "unknown"') LEN=$(echo "$INPUT" | jq -r '.stop_response // ""' | wc -c | tr -d ' ') echo "[$(date)] [←] $SESSION_ID llm call complete, content=$LEN chars" >> /tmp/agent-llm-calls.log + # Per-call cost ledger: timestamp, session, model, tokens, cost. 
+ echo "$INPUT" | jq -r '[ + (now | todateiso8601), .session_id, .model_id, + (.usage.input_tokens // 0), (.usage.output_tokens // 0), + (if has("cost") then (.cost | tostring) else "unpriced" end) + ] | @csv' >> /tmp/agent-cost-ledger.csv # ==================================================================== # SESSION-END - cleanup when the session terminates. From dea13db6181167de66e5573cbba8ab317b86d19b Mon Sep 17 00:00:00 2001 From: kimizuka Date: Thu, 4 Jun 2026 10:10:19 +0900 Subject: [PATCH 4/4] test(runtime): satisfy golangci-lint in after_llm_call test Add the empty line embeddedstructfieldcheck wants between the embedded ModelStore and the cost field, and switch the float equality assertions to assert.InDelta to satisfy testifylint's float-compare rule. --- pkg/runtime/after_llm_call_test.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/runtime/after_llm_call_test.go b/pkg/runtime/after_llm_call_test.go index 7a76994ad..4eb8c6045 100644 --- a/pkg/runtime/after_llm_call_test.go +++ b/pkg/runtime/after_llm_call_test.go @@ -26,6 +26,7 @@ import ( // unpriced (nil cost) path instead. type mockModelStoreWithCost struct { ModelStore + cost modelsdev.Cost } @@ -167,14 +168,14 @@ func TestAfterLLMCallHook_PopulatesUsageAndCost(t *testing.T) { // float64 representation so equality is reliable. expected := (float64(10)*rate.Input + float64(5)*rate.Output) / 1e6 require.NotNil(t, in.Cost, "Cost must be non-nil for a priced model") - assert.Equal(t, expected, *in.Cost, + assert.InDelta(t, expected, *in.Cost, 1e-9, "hook Cost must equal computeMessageCost(usage, model)") // The headline guarantee: the cost the hook reports is the same // cost the session bills for the turn. OwnCost sums the recorded // assistant message's Cost, set from the same computeMessageCost // value threaded into recordAssistantMessage. 
- assert.Equal(t, *in.Cost, sess.OwnCost(), + assert.InDelta(t, *in.Cost, sess.OwnCost(), 1e-9, "hook Cost must equal the cost the session recorded for the turn") } @@ -237,7 +238,7 @@ func TestAfterLLMCallInput_CostJSONContract(t *testing.T) { require.True(t, hasCost, "a non-nil pointer to 0 must emit \"cost\": 0, not be elided — "+ "this is what distinguishes a free priced call from an unpriced model") - assert.Equal(t, float64(0), raw) + assert.InDelta(t, float64(0), raw, 1e-9) _, hasUsage := m["usage"] assert.True(t, hasUsage, "Usage must be present when set") }) @@ -246,7 +247,7 @@ func TestAfterLLMCallInput_CostJSONContract(t *testing.T) { t.Parallel() v := 0.0125 m := marshalKeys(&hooks.Input{HookEventName: hooks.EventAfterLLMCall, Cost: &v}) - assert.Equal(t, 0.0125, m["cost"]) + assert.InDelta(t, 0.0125, m["cost"], 1e-9) }) } @@ -331,6 +332,6 @@ func TestComputeMessageCost(t *testing.T) { got := computeMessageCost(usage, &modelsdev.Model{Cost: rate}) require.NotNil(t, got) expected := (10*rate.Input + 5*rate.Output + 4*rate.CacheRead + 2*rate.CacheWrite) / 1e6 - assert.Equal(t, expected, *got) + assert.InDelta(t, expected, *got, 1e-9) }) }