From 4875cf55726c9e87c8314fba8b5b732c943d86a4 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Thu, 5 Mar 2026 14:42:43 +0200 Subject: [PATCH 01/14] feat: record model thoughts Signed-off-by: Danny Kopping --- fixtures/anthropic/single_builtin_tool.txtar | 48 ++++++- intercept/messages/blocking.go | 42 ++++++- intercept/messages/streaming.go | 46 +++++++ internal/integrationtest/bridge_test.go | 125 +++++++++++++++++++ internal/integrationtest/trace_test.go | 2 + internal/testutil/mock_recorder.go | 15 +++ recorder/recorder.go | 34 +++++ recorder/types.go | 10 ++ 8 files changed, 316 insertions(+), 6 deletions(-) diff --git a/fixtures/anthropic/single_builtin_tool.txtar b/fixtures/anthropic/single_builtin_tool.txtar index 50ca93f1..c271cb7c 100644 --- a/fixtures/anthropic/single_builtin_tool.txtar +++ b/fixtures/anthropic/single_builtin_tool.txtar @@ -33,22 +33,55 @@ event: message_start data: {"type":"message_start","message":{"id":"msg_015SQewixvT9s4cABCVvUE6g","type":"message","role":"assistant","model":"claude-sonnet-4-20250514","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":2,"cache_creation_input_tokens":22,"cache_read_input_tokens":13993,"output_tokens":5,"service_tier":"standard"}} } event: content_block_start -data: {"type":"content_block_start","index":0,"content_block":{"type":"tool_use","id":"toolu_01RX68weRSquLx6HUTj65iBo","name":"Read","input":{}} } +data: {"type":"content_block_start","index":0,"content_block":{"type":"thinking","thinking":""}} + +event: content_block_delta +data: {"type":"content_block_delta","index":0,"delta":{"type":"thinking_delta","thinking":"The user wants me to read"} } + +event: content_block_delta +data: {"type":"content_block_delta","index":0,"delta":{"type":"thinking_delta","thinking":" a"} } + +event: content_block_delta +data: {"type":"content_block_delta","index":0,"delta":{"type":"thinking_delta","thinking":" file called \""} } + +event: content_block_delta +data: 
{"type":"content_block_delta","index":0,"delta":{"type":"thinking_delta","thinking":"foo\"."} } + +event: content_block_delta +data: {"type":"content_block_delta","index":0,"delta":{"type":"thinking_delta","thinking":" Let me find"} } + +event: content_block_delta +data: {"type":"content_block_delta","index":0,"delta":{"type":"thinking_delta","thinking":" and"} } + +event: content_block_delta +data: {"type":"content_block_delta","index":0,"delta":{"type":"thinking_delta","thinking":" read it."} } + +event: content_block_delta +data: {"type":"content_block_delta","index":0,"delta":{"type":"thinking_delta","thinking":""} } + +event: content_block_delta +data: {"type":"content_block_delta","index":0,"delta":{"type":"signature_delta","signature":"Eu8BCkYICxgCKkBR++kFr7Za2JhF/9OCpjEc46/EcipL75RK+MEbxJ/VBJPWQTWrNGfwb5khWYJtKEpjjkH07cR/MQvThfb7t7CkEgwU4pKwL7NuZXd1/wgaDILyd0bYMqQovWo3dyIw95Ny7yZPljNBDLsvMBdBr7w+RtbU+AlSftjBuBZHp0VzI54/W+9u6f7qfx0JXsVBKldqqOjFvewT8Xm6Qp/77g6/j0zBiuAQABj/6vS1qATjd8KSIFDg9G/tCtzwmV/T/egmzswWd5CBiAhW6lgJgEDRr+gRUrFSOB7o3hypW8FUnUrr1JtzzwMYAQ=="}} + +event: content_block_stop +data: {"type":"content_block_stop","index":0} + +event: content_block_start +data: {"type":"content_block_start","index":1,"content_block":{"type":"tool_use","id":"toolu_01RX68weRSquLx6HUTj65iBo","name":"Read","input":{}}} event: ping data: {"type": "ping"} event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"input_json_delta","partial_json":""} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"input_json_delta","partial_json":""} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"input_json_delta","partial_json":"{\"file_path\": \"/tmp/blah/foo"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"input_json_delta","partial_json":"{\"file_path\": \"/tmp/blah/foo"} } event: content_block_delta -data: 
{"type":"content_block_delta","index":0,"delta":{"type":"input_json_delta","partial_json":"\"}"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"input_json_delta","partial_json":"\"}"} } event: content_block_stop -data: {"type":"content_block_stop","index":0 } +data: {"type":"content_block_stop","index":1 } event: message_delta data: {"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"output_tokens":61} } @@ -65,6 +98,11 @@ data: {"type":"message_stop" } "expires_at": "0001-01-01T00:00:00Z" }, "content": [ + { + "type": "thinking", + "thinking": "The user wants me to read a file called \"foo\". Let me find and read it.", + "signature": "Eu8BCkYICxgCKkBR++kFr7Za2JhF/9OCpjEc46/EcipL75RK+MEbxJ/VBJPWQTWrNGfwb5khWYJtKEpjjkH07cR/MQvThfb7t7CkEgwU4pKwL7NuZXd1/wgaDILyd0bYMqQovWo3dyIw95Ny7yZPljNBDLsvMBdBr7w+RtbU+AlSftjBuBZHp0VzI54/W+9u6f7qfx0JXsVBKldqqOjFvewT8Xm6Qp/77g6/j0zBiuAQABj/6vS1qATjd8KSIFDg9G/tCtzwmV/T/egmzswWd5CBiAhW6lgJgEDRr+gRUrFSOB7o3hypW8FUnUrr1JtzzwMYAQ==" + }, { "citations": null, "text": "I can see there's a file named `foo` in the `/tmp/blah` directory. Let me read it.", diff --git a/intercept/messages/blocking.go b/intercept/messages/blocking.go index e22b97f8..28e64578 100644 --- a/intercept/messages/blocking.go +++ b/intercept/messages/blocking.go @@ -135,6 +135,23 @@ func (i *BlockingInterception) ProcessRequest(w http.ResponseWriter, r *http.Req accumulateUsage(&cumulativeUsage, resp.Usage) + // Capture any thinking blocks that were returned. + var thoughtRecords []*recorder.ModelThoughtRecord + if !i.isSmallFastModel() { + for _, block := range resp.Content { + switch variant := block.AsAny().(type) { + case anthropic.ThinkingBlock: + thoughtRecords = append(thoughtRecords, &recorder.ModelThoughtRecord{ + InterceptionID: i.ID().String(), + Content: variant.Thinking, + }) + case anthropic.RedactedThinkingBlock: + // For redacted thinking, there's nothing useful we can capture. 
+ continue + } + } + } + // Handle tool calls for non-streaming. var pendingToolCalls []anthropic.ToolUseBlock for _, c := range resp.Content { @@ -158,10 +175,20 @@ func (i *BlockingInterception) ProcessRequest(w http.ResponseWriter, r *http.Req Injected: false, }) + // Associate the model thoughts with this tool call. + for _, thought := range thoughtRecords { + thought.ProviderToolCallID = toolUse.ID + } } - // If no injected tool calls, we're done. + // If no injected tool calls, persist thoughts and we're done. if len(pendingToolCalls) == 0 { + for _, thought := range thoughtRecords { + if thought.ProviderToolCallID == "" { + continue + } + _ = i.recorder.RecordModelThought(ctx, thought) + } break } @@ -198,6 +225,11 @@ func (i *BlockingInterception) ProcessRequest(w http.ResponseWriter, r *http.Req InvocationError: err, }) + // Associate the model thoughts with this tool call. + for _, thought := range thoughtRecords { + thought.ProviderToolCallID = tc.ID + } + if err != nil { // Always provide a tool_result even if the tool call failed messages.Messages = append(messages.Messages, @@ -283,6 +315,14 @@ func (i *BlockingInterception) ProcessRequest(w http.ResponseWriter, r *http.Req } } + // Only persist thoughts that are associated to a tool call. + for _, thought := range thoughtRecords { + if thought.ProviderToolCallID == "" { + continue + } + _ = i.recorder.RecordModelThought(ctx, thought) + } + // Sync the raw payload with updated messages so that withBody() // sends the updated payload on the next iteration. if err := i.syncPayloadMessages(messages.Messages); err != nil { diff --git a/intercept/messages/streaming.go b/intercept/messages/streaming.go index 4e87fd85..f9b6525f 100644 --- a/intercept/messages/streaming.go +++ b/intercept/messages/streaming.go @@ -252,6 +252,24 @@ newStream: // Don't send message_stop until all tools have been called. case string(constant.ValueOf[constant.MessageStop]()): + + // Capture any thinking blocks that were returned. 
+ var thoughtRecords []*recorder.ModelThoughtRecord + if !i.isSmallFastModel() { // TODO: remove. + for _, block := range message.Content { + switch variant := block.AsAny().(type) { + case anthropic.ThinkingBlock: + thoughtRecords = append(thoughtRecords, &recorder.ModelThoughtRecord{ + InterceptionID: i.ID().String(), + Content: variant.Thinking, + }) + case anthropic.RedactedThinkingBlock: + // For redacted thinking, there's nothing useful we can capture. + continue + } + } + } + if len(pendingToolCalls) > 0 { // Append the whole message from this stream as context since we'll be sending a new request with the tool results. messages.Messages = append(messages.Messages, message.ToParam()) @@ -306,6 +324,11 @@ newStream: InvocationError: err, }) + // Associate the model thoughts with this tool call. + for _, thought := range thoughtRecords { + thought.ProviderToolCallID = id + } + if err != nil { // Always provide a tool_result even if the tool call failed messages.Messages = append(messages.Messages, @@ -390,6 +413,15 @@ newStream: } } + // Only persist thoughts that are associated to a tool call. + for _, thought := range thoughtRecords { + if thought.ProviderToolCallID == "" { + continue + } + + _ = i.recorder.RecordModelThought(streamCtx, thought) + } + // Sync the raw payload with updated messages so that withBody() // sends the updated payload on the next iteration. if syncErr := i.syncPayloadMessages(messages.Messages); syncErr != nil { @@ -417,7 +449,21 @@ newStream: Args: variant.Input, Injected: false, }) + + // Associate the model thoughts with this tool call. + for _, thought := range thoughtRecords { + thought.ProviderToolCallID = variant.ID + } + } + } + + // Only persist thoughts that are associated to a tool call. 
+ for _, thought := range thoughtRecords { + if thought.ProviderToolCallID == "" { + continue } + + _ = i.recorder.RecordModelThought(streamCtx, thought) } } } diff --git a/internal/integrationtest/bridge_test.go b/internal/integrationtest/bridge_test.go index 01eb5815..4956e8df 100644 --- a/internal/integrationtest/bridge_test.go +++ b/internal/integrationtest/bridge_test.go @@ -124,6 +124,131 @@ func TestAnthropicMessages(t *testing.T) { }) } +func TestAnthropicMessagesModelThoughts(t *testing.T) { + t.Parallel() + + t.Run("thinking captured with builtin tool", func(t *testing.T) { + t.Parallel() + + cases := []struct { + streaming bool + expectedToolCallID string + expectedThinkingSubstr string + }{ + { + streaming: true, + expectedToolCallID: "toolu_01RX68weRSquLx6HUTj65iBo", + expectedThinkingSubstr: "Let me find and read it.", + }, + { + streaming: false, + expectedToolCallID: "toolu_01AusGgY5aKFhzWrFBv9JfHq", + expectedThinkingSubstr: "Let me find and read it.", + }, + } + + for _, tc := range cases { + t.Run(fmt.Sprintf("streaming=%v", tc.streaming), func(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), time.Second*30) + t.Cleanup(cancel) + + fix := fixtures.Parse(t, fixtures.AntSingleBuiltinTool) + upstream := testutil.NewMockUpstream(t, ctx, testutil.NewFixtureResponse(fix)) + + recorderClient := &testutil.MockRecorder{} + logger := slogtest.Make(t, &slogtest.Options{}).Leveled(slog.LevelDebug) + providers := []aibridge.Provider{provider.NewAnthropic(anthropicCfg(upstream.URL, apiKey), nil)} + b, err := aibridge.NewRequestBridge(ctx, providers, recorderClient, mcp.NewServerProxyManager(nil, testTracer), logger, nil, testTracer) + require.NoError(t, err) + + mockSrv := httptest.NewUnstartedServer(b) + t.Cleanup(mockSrv.Close) + mockSrv.Config.BaseContext = func(_ net.Listener) context.Context { + return aibcontext.AsActor(ctx, userID, nil) + } + mockSrv.Start() + + reqBody, err := sjson.SetBytes(fix.Request(), "stream", 
tc.streaming) + require.NoError(t, err) + req := createAnthropicMessagesReq(t, mockSrv.URL, reqBody) + client := &http.Client{} + resp, err := client.Do(req) + require.NoError(t, err) + require.Equal(t, http.StatusOK, resp.StatusCode) + defer resp.Body.Close() + + if tc.streaming { + sp := aibridge.NewSSEParser() + require.NoError(t, sp.Parse(resp.Body)) + assert.Contains(t, sp.AllEvents(), "message_start") + assert.Contains(t, sp.AllEvents(), "message_stop") + } + + // Verify model thoughts were captured and associated with the tool call. + thoughts := recorderClient.RecordedModelThoughts() + require.Len(t, thoughts, 1) + assert.Contains(t, thoughts[0].Content, "The user wants me to read") + assert.Contains(t, thoughts[0].Content, tc.expectedThinkingSubstr) + assert.NotEmpty(t, thoughts[0].InterceptionID) + assert.Equal(t, tc.expectedToolCallID, thoughts[0].ProviderToolCallID) + + // Verify tool usage was also recorded. + toolUsages := recorderClient.RecordedToolUsages() + require.Len(t, toolUsages, 1) + assert.Equal(t, "Read", toolUsages[0].Tool) + assert.Equal(t, tc.expectedToolCallID, toolUsages[0].ToolCallID) + + recorderClient.VerifyAllInterceptionsEnded(t) + }) + } + }) + + t.Run("no thoughts without tool calls", func(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), time.Second*30) + t.Cleanup(cancel) + + // Use the simple fixture which has no tool calls — any thinking blocks + // should not be persisted since they can't be associated with a tool call. 
+ fix := fixtures.Parse(t, fixtures.AntSimple) + upstream := testutil.NewMockUpstream(t, ctx, testutil.NewFixtureResponse(fix)) + + recorderClient := &testutil.MockRecorder{} + logger := slogtest.Make(t, &slogtest.Options{}).Leveled(slog.LevelDebug) + providers := []aibridge.Provider{provider.NewAnthropic(anthropicCfg(upstream.URL, apiKey), nil)} + b, err := aibridge.NewRequestBridge(ctx, providers, recorderClient, mcp.NewServerProxyManager(nil, testTracer), logger, nil, testTracer) + require.NoError(t, err) + + mockSrv := httptest.NewUnstartedServer(b) + t.Cleanup(mockSrv.Close) + mockSrv.Config.BaseContext = func(_ net.Listener) context.Context { + return aibcontext.AsActor(ctx, userID, nil) + } + mockSrv.Start() + + reqBody, err := sjson.SetBytes(fix.Request(), "stream", true) + require.NoError(t, err) + req := createAnthropicMessagesReq(t, mockSrv.URL, reqBody) + client := &http.Client{} + resp, err := client.Do(req) + require.NoError(t, err) + require.Equal(t, http.StatusOK, resp.StatusCode) + defer resp.Body.Close() + + sp := aibridge.NewSSEParser() + require.NoError(t, sp.Parse(resp.Body)) + + // No thoughts should be recorded when there are no tool calls. 
+ thoughts := recorderClient.RecordedModelThoughts() + assert.Empty(t, thoughts) + + recorderClient.VerifyAllInterceptionsEnded(t) + }) +} + func TestAWSBedrockIntegration(t *testing.T) { t.Parallel() diff --git a/internal/integrationtest/trace_test.go b/internal/integrationtest/trace_test.go index 88bec31c..bdfb7f7f 100644 --- a/internal/integrationtest/trace_test.go +++ b/internal/integrationtest/trace_test.go @@ -51,6 +51,7 @@ func TestTraceAnthropic(t *testing.T) { {"Intercept.RecordPromptUsage", 1, codes.Unset}, {"Intercept.RecordTokenUsage", 1, codes.Unset}, {"Intercept.RecordToolUsage", 1, codes.Unset}, + {"Intercept.RecordModelThought", 1, codes.Unset}, {"Intercept.ProcessRequest.Upstream", 1, codes.Unset}, } @@ -63,6 +64,7 @@ func TestTraceAnthropic(t *testing.T) { {"Intercept.RecordPromptUsage", 1, codes.Unset}, {"Intercept.RecordTokenUsage", 2, codes.Unset}, {"Intercept.RecordToolUsage", 1, codes.Unset}, + {"Intercept.RecordModelThought", 1, codes.Unset}, {"Intercept.ProcessRequest.Upstream", 1, codes.Unset}, } diff --git a/internal/testutil/mock_recorder.go b/internal/testutil/mock_recorder.go index 09bcac39..55eafcd6 100644 --- a/internal/testutil/mock_recorder.go +++ b/internal/testutil/mock_recorder.go @@ -20,6 +20,7 @@ type MockRecorder struct { tokenUsages []*recorder.TokenUsageRecord userPrompts []*recorder.PromptUsageRecord toolUsages []*recorder.ToolUsageRecord + modelThoughts []*recorder.ModelThoughtRecord interceptionsEnd map[string]*recorder.InterceptionRecordEnded } @@ -64,6 +65,13 @@ func (m *MockRecorder) RecordToolUsage(ctx context.Context, req *recorder.ToolUs return nil } +func (m *MockRecorder) RecordModelThought(ctx context.Context, req *recorder.ModelThoughtRecord) error { + m.mu.Lock() + defer m.mu.Unlock() + m.modelThoughts = append(m.modelThoughts, req) + return nil +} + // RecordedTokenUsages returns a copy of recorded token usages in a thread-safe manner. 
// Note: This is a shallow clone - the slice is copied but the pointers reference the // same underlying records. This is sufficient for our test assertions which only read @@ -128,6 +136,13 @@ func (m *MockRecorder) ToolUsages() []*recorder.ToolUsageRecord { return m.toolUsages } +// RecordedModelThoughts returns a copy of recorded model thoughts in a thread-safe manner. +func (m *MockRecorder) RecordedModelThoughts() []*recorder.ModelThoughtRecord { + m.mu.Lock() + defer m.mu.Unlock() + return slices.Clone(m.modelThoughts) +} + // RecordedInterceptionEnd returns the stored InterceptionRecordEnded for the // given interception ID, or nil if not found. func (m *MockRecorder) RecordedInterceptionEnd(id string) *recorder.InterceptionRecordEnded { diff --git a/recorder/recorder.go b/recorder/recorder.go index 6e37b632..3bd657f9 100644 --- a/recorder/recorder.go +++ b/recorder/recorder.go @@ -116,6 +116,24 @@ func (r *RecorderWrapper) RecordToolUsage(ctx context.Context, req *ToolUsageRec return err } +func (r *RecorderWrapper) RecordModelThought(ctx context.Context, req *ModelThoughtRecord) (outErr error) { + ctx, span := r.tracer.Start(ctx, "Intercept.RecordModelThought", trace.WithAttributes(tracing.InterceptionAttributesFromContext(ctx)...)) + defer tracing.EndSpanErr(span, &outErr) + + client, err := r.clientFn() + if err != nil { + return fmt.Errorf("acquire client: %w", err) + } + + req.CreatedAt = time.Now() + if err = client.RecordModelThought(ctx, req); err == nil { + return nil + } + + r.logger.Warn(ctx, "failed to record model thought", slog.Error(err), slog.F("interception_id", req.InterceptionID)) + return err +} + func NewRecorder(logger slog.Logger, tracer trace.Tracer, clientFn func() (Recorder, error)) *RecorderWrapper { return &RecorderWrapper{ logger: logger, @@ -259,6 +277,22 @@ func (a *AsyncRecorder) RecordToolUsage(ctx context.Context, req *ToolUsageRecor return nil // Caller is not interested in error. 
} +func (a *AsyncRecorder) RecordModelThought(ctx context.Context, req *ModelThoughtRecord) error { + a.wg.Add(1) + go func() { + defer a.wg.Done() + timedCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), a.timeout) + defer cancel() + + err := a.wrapped.RecordModelThought(timedCtx, req) + if err != nil { + a.logger.Warn(timedCtx, "failed to record model thought", slog.Error(err), slog.F("payload", req)) + } + }() + + return nil // Caller is not interested in error. +} + func (a *AsyncRecorder) Wait() { a.wg.Wait() } diff --git a/recorder/types.go b/recorder/types.go index b33494d4..609e7142 100644 --- a/recorder/types.go +++ b/recorder/types.go @@ -19,6 +19,8 @@ type Recorder interface { RecordPromptUsage(ctx context.Context, req *PromptUsageRecord) error // RecordToolUsage records the tools used in an interception with an upstream AI provider. RecordToolUsage(ctx context.Context, req *ToolUsageRecord) error + // RecordModelThought records the reasoning/thinking produced in an interception with an upstream AI provider. 
+ RecordModelThought(ctx context.Context, req *ModelThoughtRecord) error } type ToolArgs any @@ -73,3 +75,11 @@ type ToolUsageRecord struct { Metadata Metadata CreatedAt time.Time } + +type ModelThoughtRecord struct { + InterceptionID string + ProviderToolCallID string + Content string + Metadata Metadata + CreatedAt time.Time +} From f878dd6318d2f17960ed0462ce8ec5ba5cc449c5 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Thu, 5 Mar 2026 17:36:44 +0200 Subject: [PATCH 02/14] fix: send model thoughts with tool usage recording Signed-off-by: Danny Kopping --- intercept/messages/blocking.go | 56 ++++++++---------------- intercept/messages/streaming.go | 58 ++++++++----------------- internal/integrationtest/bridge_test.go | 22 +++++----- internal/integrationtest/trace_test.go | 2 - internal/testutil/mock_recorder.go | 15 ------- recorder/recorder.go | 34 --------------- recorder/types.go | 12 +++-- 7 files changed, 52 insertions(+), 147 deletions(-) diff --git a/intercept/messages/blocking.go b/intercept/messages/blocking.go index 28e64578..4ba71874 100644 --- a/intercept/messages/blocking.go +++ b/intercept/messages/blocking.go @@ -137,18 +137,16 @@ func (i *BlockingInterception) ProcessRequest(w http.ResponseWriter, r *http.Req // Capture any thinking blocks that were returned. var thoughtRecords []*recorder.ModelThoughtRecord - if !i.isSmallFastModel() { - for _, block := range resp.Content { - switch variant := block.AsAny().(type) { - case anthropic.ThinkingBlock: - thoughtRecords = append(thoughtRecords, &recorder.ModelThoughtRecord{ - InterceptionID: i.ID().String(), - Content: variant.Thinking, - }) - case anthropic.RedactedThinkingBlock: - // For redacted thinking, there's nothing useful we can capture. 
- continue - } + for _, block := range resp.Content { + switch variant := block.AsAny().(type) { + case anthropic.ThinkingBlock: + thoughtRecords = append(thoughtRecords, &recorder.ModelThoughtRecord{ + Content: variant.Thinking, + CreatedAt: time.Now(), + }) + case anthropic.RedactedThinkingBlock: + // For redacted thinking, there's nothing useful we can capture. + continue } } @@ -173,22 +171,15 @@ func (i *BlockingInterception) ProcessRequest(w http.ResponseWriter, r *http.Req Tool: toolUse.Name, Args: toolUse.Input, Injected: false, + ModelThoughts: thoughtRecords, }) - - // Associate the model thoughts with this tool call. - for _, thought := range thoughtRecords { - thought.ProviderToolCallID = toolUse.ID - } + // Clear after first use to avoid duplicating across + // multiple tool calls in the same message. + thoughtRecords = nil } - // If no injected tool calls, persist thoughts and we're done. + // If no injected tool calls, we're done. if len(pendingToolCalls) == 0 { - for _, thought := range thoughtRecords { - if thought.ProviderToolCallID == "" { - continue - } - _ = i.recorder.RecordModelThought(ctx, thought) - } break } @@ -223,12 +214,11 @@ func (i *BlockingInterception) ProcessRequest(w http.ResponseWriter, r *http.Req Args: tc.Input, Injected: true, InvocationError: err, + ModelThoughts: thoughtRecords, }) - - // Associate the model thoughts with this tool call. - for _, thought := range thoughtRecords { - thought.ProviderToolCallID = tc.ID - } + // Clear after first use to avoid duplicating across + // multiple tool calls in the same message. + thoughtRecords = nil if err != nil { // Always provide a tool_result even if the tool call failed @@ -315,14 +305,6 @@ func (i *BlockingInterception) ProcessRequest(w http.ResponseWriter, r *http.Req } } - // Only persist thoughts that are associated to a tool call. 
- for _, thought := range thoughtRecords { - if thought.ProviderToolCallID == "" { - continue - } - _ = i.recorder.RecordModelThought(ctx, thought) - } - // Sync the raw payload with updated messages so that withBody() // sends the updated payload on the next iteration. if err := i.syncPayloadMessages(messages.Messages); err != nil { diff --git a/intercept/messages/streaming.go b/intercept/messages/streaming.go index f9b6525f..949401f9 100644 --- a/intercept/messages/streaming.go +++ b/intercept/messages/streaming.go @@ -255,18 +255,16 @@ newStream: // Capture any thinking blocks that were returned. var thoughtRecords []*recorder.ModelThoughtRecord - if !i.isSmallFastModel() { // TODO: remove. - for _, block := range message.Content { - switch variant := block.AsAny().(type) { - case anthropic.ThinkingBlock: - thoughtRecords = append(thoughtRecords, &recorder.ModelThoughtRecord{ - InterceptionID: i.ID().String(), - Content: variant.Thinking, - }) - case anthropic.RedactedThinkingBlock: - // For redacted thinking, there's nothing useful we can capture. - continue - } + for _, block := range message.Content { + switch variant := block.AsAny().(type) { + case anthropic.ThinkingBlock: + thoughtRecords = append(thoughtRecords, &recorder.ModelThoughtRecord{ + Content: variant.Thinking, + CreatedAt: time.Now(), + }) + case anthropic.RedactedThinkingBlock: + // For redacted thinking, there's nothing useful we can capture. + continue } } @@ -322,12 +320,11 @@ newStream: Args: input, Injected: true, InvocationError: err, + ModelThoughts: thoughtRecords, }) - - // Associate the model thoughts with this tool call. - for _, thought := range thoughtRecords { - thought.ProviderToolCallID = id - } + // Clear after first use to avoid duplicating across + // multiple tool calls in the same message. 
+ thoughtRecords = nil if err != nil { // Always provide a tool_result even if the tool call failed @@ -413,15 +410,6 @@ newStream: } } - // Only persist thoughts that are associated to a tool call. - for _, thought := range thoughtRecords { - if thought.ProviderToolCallID == "" { - continue - } - - _ = i.recorder.RecordModelThought(streamCtx, thought) - } - // Sync the raw payload with updated messages so that withBody() // sends the updated payload on the next iteration. if syncErr := i.syncPayloadMessages(messages.Messages); syncErr != nil { @@ -448,23 +436,13 @@ newStream: Tool: variant.Name, Args: variant.Input, Injected: false, + ModelThoughts: thoughtRecords, }) - - // Associate the model thoughts with this tool call. - for _, thought := range thoughtRecords { - thought.ProviderToolCallID = variant.ID - } + // Clear after first use to avoid duplicating across + // multiple tool calls in the same message. + thoughtRecords = nil } } - - // Only persist thoughts that are associated to a tool call. - for _, thought := range thoughtRecords { - if thought.ProviderToolCallID == "" { - continue - } - - _ = i.recorder.RecordModelThought(streamCtx, thought) - } } } diff --git a/internal/integrationtest/bridge_test.go b/internal/integrationtest/bridge_test.go index 4956e8df..4ebcc997 100644 --- a/internal/integrationtest/bridge_test.go +++ b/internal/integrationtest/bridge_test.go @@ -186,20 +186,17 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { assert.Contains(t, sp.AllEvents(), "message_stop") } - // Verify model thoughts were captured and associated with the tool call. - thoughts := recorderClient.RecordedModelThoughts() - require.Len(t, thoughts, 1) - assert.Contains(t, thoughts[0].Content, "The user wants me to read") - assert.Contains(t, thoughts[0].Content, tc.expectedThinkingSubstr) - assert.NotEmpty(t, thoughts[0].InterceptionID) - assert.Equal(t, tc.expectedToolCallID, thoughts[0].ProviderToolCallID) - - // Verify tool usage was also recorded. 
+ // Verify tool usage was recorded with associated model thoughts. toolUsages := recorderClient.RecordedToolUsages() require.Len(t, toolUsages, 1) assert.Equal(t, "Read", toolUsages[0].Tool) assert.Equal(t, tc.expectedToolCallID, toolUsages[0].ToolCallID) + // Model thoughts should be embedded in the tool usage record. + require.Len(t, toolUsages[0].ModelThoughts, 1) + assert.Contains(t, toolUsages[0].ModelThoughts[0].Content, "The user wants me to read") + assert.Contains(t, toolUsages[0].ModelThoughts[0].Content, tc.expectedThinkingSubstr) + recorderClient.VerifyAllInterceptionsEnded(t) }) } @@ -241,9 +238,10 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { sp := aibridge.NewSSEParser() require.NoError(t, sp.Parse(resp.Body)) - // No thoughts should be recorded when there are no tool calls. - thoughts := recorderClient.RecordedModelThoughts() - assert.Empty(t, thoughts) + // No tool usages (and therefore no thoughts) should be recorded + // when there are no tool calls. + toolUsages := recorderClient.RecordedToolUsages() + assert.Empty(t, toolUsages) recorderClient.VerifyAllInterceptionsEnded(t) }) diff --git a/internal/integrationtest/trace_test.go b/internal/integrationtest/trace_test.go index bdfb7f7f..88bec31c 100644 --- a/internal/integrationtest/trace_test.go +++ b/internal/integrationtest/trace_test.go @@ -51,7 +51,6 @@ func TestTraceAnthropic(t *testing.T) { {"Intercept.RecordPromptUsage", 1, codes.Unset}, {"Intercept.RecordTokenUsage", 1, codes.Unset}, {"Intercept.RecordToolUsage", 1, codes.Unset}, - {"Intercept.RecordModelThought", 1, codes.Unset}, {"Intercept.ProcessRequest.Upstream", 1, codes.Unset}, } @@ -64,7 +63,6 @@ func TestTraceAnthropic(t *testing.T) { {"Intercept.RecordPromptUsage", 1, codes.Unset}, {"Intercept.RecordTokenUsage", 2, codes.Unset}, {"Intercept.RecordToolUsage", 1, codes.Unset}, - {"Intercept.RecordModelThought", 1, codes.Unset}, {"Intercept.ProcessRequest.Upstream", 1, codes.Unset}, } diff --git 
a/internal/testutil/mock_recorder.go b/internal/testutil/mock_recorder.go index 55eafcd6..09bcac39 100644 --- a/internal/testutil/mock_recorder.go +++ b/internal/testutil/mock_recorder.go @@ -20,7 +20,6 @@ type MockRecorder struct { tokenUsages []*recorder.TokenUsageRecord userPrompts []*recorder.PromptUsageRecord toolUsages []*recorder.ToolUsageRecord - modelThoughts []*recorder.ModelThoughtRecord interceptionsEnd map[string]*recorder.InterceptionRecordEnded } @@ -65,13 +64,6 @@ func (m *MockRecorder) RecordToolUsage(ctx context.Context, req *recorder.ToolUs return nil } -func (m *MockRecorder) RecordModelThought(ctx context.Context, req *recorder.ModelThoughtRecord) error { - m.mu.Lock() - defer m.mu.Unlock() - m.modelThoughts = append(m.modelThoughts, req) - return nil -} - // RecordedTokenUsages returns a copy of recorded token usages in a thread-safe manner. // Note: This is a shallow clone - the slice is copied but the pointers reference the // same underlying records. This is sufficient for our test assertions which only read @@ -136,13 +128,6 @@ func (m *MockRecorder) ToolUsages() []*recorder.ToolUsageRecord { return m.toolUsages } -// RecordedModelThoughts returns a copy of recorded model thoughts in a thread-safe manner. -func (m *MockRecorder) RecordedModelThoughts() []*recorder.ModelThoughtRecord { - m.mu.Lock() - defer m.mu.Unlock() - return slices.Clone(m.modelThoughts) -} - // RecordedInterceptionEnd returns the stored InterceptionRecordEnded for the // given interception ID, or nil if not found. 
func (m *MockRecorder) RecordedInterceptionEnd(id string) *recorder.InterceptionRecordEnded { diff --git a/recorder/recorder.go b/recorder/recorder.go index 3bd657f9..6e37b632 100644 --- a/recorder/recorder.go +++ b/recorder/recorder.go @@ -116,24 +116,6 @@ func (r *RecorderWrapper) RecordToolUsage(ctx context.Context, req *ToolUsageRec return err } -func (r *RecorderWrapper) RecordModelThought(ctx context.Context, req *ModelThoughtRecord) (outErr error) { - ctx, span := r.tracer.Start(ctx, "Intercept.RecordModelThought", trace.WithAttributes(tracing.InterceptionAttributesFromContext(ctx)...)) - defer tracing.EndSpanErr(span, &outErr) - - client, err := r.clientFn() - if err != nil { - return fmt.Errorf("acquire client: %w", err) - } - - req.CreatedAt = time.Now() - if err = client.RecordModelThought(ctx, req); err == nil { - return nil - } - - r.logger.Warn(ctx, "failed to record model thought", slog.Error(err), slog.F("interception_id", req.InterceptionID)) - return err -} - func NewRecorder(logger slog.Logger, tracer trace.Tracer, clientFn func() (Recorder, error)) *RecorderWrapper { return &RecorderWrapper{ logger: logger, @@ -277,22 +259,6 @@ func (a *AsyncRecorder) RecordToolUsage(ctx context.Context, req *ToolUsageRecor return nil // Caller is not interested in error. } -func (a *AsyncRecorder) RecordModelThought(ctx context.Context, req *ModelThoughtRecord) error { - a.wg.Add(1) - go func() { - defer a.wg.Done() - timedCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), a.timeout) - defer cancel() - - err := a.wrapped.RecordModelThought(timedCtx, req) - if err != nil { - a.logger.Warn(timedCtx, "failed to record model thought", slog.Error(err), slog.F("payload", req)) - } - }() - - return nil // Caller is not interested in error. 
-} - func (a *AsyncRecorder) Wait() { a.wg.Wait() } diff --git a/recorder/types.go b/recorder/types.go index 609e7142..d3cbaf73 100644 --- a/recorder/types.go +++ b/recorder/types.go @@ -18,9 +18,8 @@ type Recorder interface { // RecordPromptUsage records the prompts used in an interception with an upstream AI provider. RecordPromptUsage(ctx context.Context, req *PromptUsageRecord) error // RecordToolUsage records the tools used in an interception with an upstream AI provider. + // Any associated model thoughts should be included in the ToolUsageRecord. RecordToolUsage(ctx context.Context, req *ToolUsageRecord) error - // RecordModelThought records the reasoning/thinking produced in an interception with an upstream AI provider. - RecordModelThought(ctx context.Context, req *ModelThoughtRecord) error } type ToolArgs any @@ -74,12 +73,11 @@ type ToolUsageRecord struct { InvocationError error Metadata Metadata CreatedAt time.Time + ModelThoughts []*ModelThoughtRecord } type ModelThoughtRecord struct { - InterceptionID string - ProviderToolCallID string - Content string - Metadata Metadata - CreatedAt time.Time + Content string + Metadata Metadata + CreatedAt time.Time } From daaec9fe2500ccf298d7558d47ad60159dde10d3 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 6 Mar 2026 14:18:23 +0200 Subject: [PATCH 03/14] feat: capture responses reasoning Signed-off-by: Danny Kopping --- fixtures/anthropic/simple.txtar | 69 ++++++++------- .../blocking/single_builtin_tool.txtar | 11 +++ .../responses/streaming/builtin_tool.txtar | 44 +++++++--- .../openai/responses/streaming/simple.txtar | 64 +++++++++----- intercept/responses/base.go | 35 ++++++++ intercept/responses/injected_tools.go | 11 ++- internal/integrationtest/bridge_test.go | 50 +++-------- internal/integrationtest/responses_test.go | 84 +++++++++++++++++++ 8 files changed, 262 insertions(+), 106 deletions(-) diff --git a/fixtures/anthropic/simple.txtar b/fixtures/anthropic/simple.txtar index f1300b7b..235138cc 
100644 --- a/fixtures/anthropic/simple.txtar +++ b/fixtures/anthropic/simple.txtar @@ -23,91 +23,100 @@ event: message_start data: {"type":"message_start","message":{"id":"msg_01Pvyf26bY17RcjmWfJsXGBn","type":"message","role":"assistant","model":"claude-sonnet-4-20250514","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":18,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"output_tokens":1,"service_tier":"standard"}} } event: content_block_start -data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""} } +data: {"type":"content_block_start","index":0,"content_block":{"type":"thinking","thinking":""}} + +event: content_block_delta +data: {"type":"content_block_delta","index":0,"delta":{"type":"thinking_delta","thinking":"This is a classic philosophical question about medieval scholasticism. I'll give a thoughtful answer."}} + +event: content_block_stop +data: {"type":"content_block_stop","index":0} + +event: content_block_start +data: {"type":"content_block_start","index":1,"content_block":{"type":"text","text":""} } event: ping data: {"type": "ping"} event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"This"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":"This"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" is a famous philosophical question often used to illustrate medieval"}} +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" is a famous philosophical question often used to illustrate medieval"}} event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" scholastic debates that seem pointless or ov"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" scholastic debates that seem pointless or ov"} } event: 
content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"erly abstract. The question \"How many angels can dance on the head of"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":"erly abstract. The question \"How many angels can dance on the head of"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" a pin?\" is typically cited as an example of us"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" a pin?\" is typically cited as an example of us"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"eless speculation.\n\nHistorically, medieval theolog"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":"eless speculation.\n\nHistorically, medieval theolog"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"ians did debate the nature of angels -"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":"ians did debate the nature of angels -"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" whether they were incorporeal beings, how"}} +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" whether they were incorporeal beings, how"}} event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" they occupied space, and whether multiple angels could exist"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" they occupied space, and whether multiple angels could exist"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" in the same location. 
However, there"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" in the same location. However, there"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"'s little evidence they literally"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":"'s little evidence they literally"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" debated dancing angels on pinheads.\n\nThe question has"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" debated dancing angels on pinheads.\n\nThe question has"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" no factual answer since it depends on assumptions about:"}} +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" no factual answer since it depends on assumptions about:"}} event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"\n- The existence and nature of angels\n- Whether"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":"\n- The existence and nature of angels\n- Whether"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" incorporeal beings occupy physical space\n- What"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" incorporeal beings occupy physical space\n- What"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" constitutes \"dancing\" for a spiritual"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" constitutes \"dancing\" for a spiritual"} } event: content_block_delta -data: 
{"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" entity\n- The size of both the"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" entity\n- The size of both the"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" pin and the angels\n\nIt's become a metaph"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" pin and the angels\n\nIt's become a metaph"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"or for overthinking trivial matters"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":"or for overthinking trivial matters"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" or getting lost in theoretical discussions disconnected from practical reality."} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" or getting lost in theoretical discussions disconnected from practical reality."} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" Some use it to critique certain types of academic"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" Some use it to critique certain types of academic"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" or theological debate, while others defen"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" or theological debate, while others defen"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"d the value of exploring fundamental questions about existence an"} } +data: 
{"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":"d the value of exploring fundamental questions about existence an"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"d metaphysics.\n\nSo while u"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":"d metaphysics.\n\nSo while u"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"nanswerable literally, it serves as an interesting lens"} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":"nanswerable literally, it serves as an interesting lens"} } event: content_block_delta -data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" for discussing the nature of philosophical inquiry itself."} } +data: {"type":"content_block_delta","index":1,"delta":{"type":"text_delta","text":" for discussing the nature of philosophical inquiry itself."} } event: content_block_stop -data: {"type":"content_block_stop","index":0 } +data: {"type":"content_block_stop","index":1 } event: message_delta data: {"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"output_tokens":240} } @@ -122,6 +131,10 @@ data: {"type":"message_stop" } "role": "assistant", "model": "claude-sonnet-4-20250514", "content": [ + { + "type": "thinking", + "thinking": "This is a classic philosophical question about medieval scholasticism. I'll give a thoughtful answer." + }, { "type": "text", "text": "This is a famous philosophical question, often called \"How many angels can dance on the head of a pin?\" It's typically used to represent pointless or overly abstract theological debates.\n\nThe question doesn't have a literal answer because:\n\n1. 
**Historical context**: It's often attributed to medieval scholastic philosophers, though there's little evidence they actually debated this exact question. It became a popular way to mock what some saw as useless academic arguments.\n\n2. **Philosophical purpose**: The question highlights the difficulty of discussing non-physical beings (angels) in physical terms (space on a pinhead).\n\n3. **Different interpretations**: \n - If angels are purely spiritual, they might not take up physical space at all\n - If they do occupy space, we'd need to know their \"size\"\n - The question might be asking about the nature of space, matter, and spirit\n\nSo the real answer is that it's not meant to be answered literally - it's a thought experiment about the limits of rational inquiry and the sometimes absurd directions theological speculation can take.\n\nWould you like to explore the philosophical implications behind this question, or were you thinking about it in a different context?" diff --git a/fixtures/openai/responses/blocking/single_builtin_tool.txtar b/fixtures/openai/responses/blocking/single_builtin_tool.txtar index f41bd3cc..14299ff3 100644 --- a/fixtures/openai/responses/blocking/single_builtin_tool.txtar +++ b/fixtures/openai/responses/blocking/single_builtin_tool.txtar @@ -50,6 +50,17 @@ "max_tool_calls": null, "model": "gpt-4.1-2025-04-14", "output": [ + { + "id": "rs_0da6045a8b68fa5200695fa23e100081a19bf68887d47ae93d", + "type": "reasoning", + "status": "completed", + "summary": [ + { + "type": "summary_text", + "text": "The user wants to add 3 and 5. Let me call the add function." 
+ } + ] + }, { "id": "fc_0da6045a8b68fa5200695fa23e198081a19bf68887d47ae93d", "type": "function_call", diff --git a/fixtures/openai/responses/streaming/builtin_tool.txtar b/fixtures/openai/responses/streaming/builtin_tool.txtar index b6a7a0a5..98793f3b 100644 --- a/fixtures/openai/responses/streaming/builtin_tool.txtar +++ b/fixtures/openai/responses/streaming/builtin_tool.txtar @@ -40,41 +40,59 @@ event: response.in_progress data: {"type":"response.in_progress","response":{"id":"resp_0c3fb28cfcf463a500695fa2f0239481a095ec6ce3dfe4d458","object":"response","created_at":1767875312,"status":"in_progress","background":false,"completed_at":null,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4.1-2025-04-14","output":[],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}},"sequence_number":1} event: response.output_item.added -data: {"type":"response.output_item.added","item":{"id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","type":"function_call","status":"in_progress","arguments":"","call_id":"call_7VaiUXZYuuuwWwviCrckxq6t","name":"add"},"output_index":0,"sequence_number":2} +data: {"type":"response.output_item.added","item":{"id":"rs_0c3fb28cfcf463a500695fa2f0a0a881a0890103ba88b0628e","type":"reasoning","status":"in_progress","summary":[]},"output_index":0,"sequence_number":2} + +event: 
response.reasoning_summary_part.added +data: {"type":"response.reasoning_summary_part.added","item_id":"rs_0c3fb28cfcf463a500695fa2f0a0a881a0890103ba88b0628e","output_index":0,"part":{"type":"summary_text","text":""},"summary_index":0,"sequence_number":3} + +event: response.reasoning_summary_text.delta +data: {"type":"response.reasoning_summary_text.delta","item_id":"rs_0c3fb28cfcf463a500695fa2f0a0a881a0890103ba88b0628e","output_index":0,"summary_index":0,"delta":"The user wants to add 3 and 5. Let me call the add function.","sequence_number":4} + +event: response.reasoning_summary_text.done +data: {"type":"response.reasoning_summary_text.done","item_id":"rs_0c3fb28cfcf463a500695fa2f0a0a881a0890103ba88b0628e","output_index":0,"summary_index":0,"text":"The user wants to add 3 and 5. Let me call the add function.","sequence_number":5} + +event: response.reasoning_summary_part.done +data: {"type":"response.reasoning_summary_part.done","item_id":"rs_0c3fb28cfcf463a500695fa2f0a0a881a0890103ba88b0628e","output_index":0,"part":{"type":"summary_text","text":"The user wants to add 3 and 5. Let me call the add function."},"summary_index":0,"sequence_number":6} + +event: response.output_item.done +data: {"type":"response.output_item.done","item":{"id":"rs_0c3fb28cfcf463a500695fa2f0a0a881a0890103ba88b0628e","type":"reasoning","status":"completed","summary":[{"type":"summary_text","text":"The user wants to add 3 and 5. 
Let me call the add function."}]},"output_index":0,"sequence_number":7} + +event: response.output_item.added +data: {"type":"response.output_item.added","item":{"id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","type":"function_call","status":"in_progress","arguments":"","call_id":"call_7VaiUXZYuuuwWwviCrckxq6t","name":"add"},"output_index":1,"sequence_number":8} event: response.function_call_arguments.delta -data: {"type":"response.function_call_arguments.delta","delta":"{\"","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"gWZHP8i4lSgQYT","output_index":0,"sequence_number":3} +data: {"type":"response.function_call_arguments.delta","delta":"{\"","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"gWZHP8i4lSgQYT","output_index":1,"sequence_number":9} event: response.function_call_arguments.delta -data: {"type":"response.function_call_arguments.delta","delta":"a","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"yC1iubuqc098ZSH","output_index":0,"sequence_number":4} +data: {"type":"response.function_call_arguments.delta","delta":"a","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"yC1iubuqc098ZSH","output_index":1,"sequence_number":10} event: response.function_call_arguments.delta -data: {"type":"response.function_call_arguments.delta","delta":"\":","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"G17nNbWUcJkqA2","output_index":0,"sequence_number":5} +data: {"type":"response.function_call_arguments.delta","delta":"\":","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"G17nNbWUcJkqA2","output_index":1,"sequence_number":11} event: response.function_call_arguments.delta -data: {"type":"response.function_call_arguments.delta","delta":"3","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"Mj71L4eeLZbIEFU","output_index":0,"sequence_number":6} 
+data: {"type":"response.function_call_arguments.delta","delta":"3","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"Mj71L4eeLZbIEFU","output_index":1,"sequence_number":12} event: response.function_call_arguments.delta -data: {"type":"response.function_call_arguments.delta","delta":",\"","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"ZchcCauvlPtVc7","output_index":0,"sequence_number":7} +data: {"type":"response.function_call_arguments.delta","delta":",\"","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"ZchcCauvlPtVc7","output_index":1,"sequence_number":13} event: response.function_call_arguments.delta -data: {"type":"response.function_call_arguments.delta","delta":"b","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"gWLYMrsBI3ZHKVP","output_index":0,"sequence_number":8} +data: {"type":"response.function_call_arguments.delta","delta":"b","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"gWLYMrsBI3ZHKVP","output_index":1,"sequence_number":14} event: response.function_call_arguments.delta -data: {"type":"response.function_call_arguments.delta","delta":"\":","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"n4iUzpnbPE4DnO","output_index":0,"sequence_number":9} +data: {"type":"response.function_call_arguments.delta","delta":"\":","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"n4iUzpnbPE4DnO","output_index":1,"sequence_number":15} event: response.function_call_arguments.delta -data: {"type":"response.function_call_arguments.delta","delta":"5","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"23mO3rxkXqDOi6g","output_index":0,"sequence_number":10} +data: 
{"type":"response.function_call_arguments.delta","delta":"5","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"23mO3rxkXqDOi6g","output_index":1,"sequence_number":16} event: response.function_call_arguments.delta -data: {"type":"response.function_call_arguments.delta","delta":"}","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"AQnBsNz7GqkdylH","output_index":0,"sequence_number":11} +data: {"type":"response.function_call_arguments.delta","delta":"}","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"AQnBsNz7GqkdylH","output_index":1,"sequence_number":17} event: response.function_call_arguments.done -data: {"type":"response.function_call_arguments.done","arguments":"{\"a\":3,\"b\":5}","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","output_index":0,"sequence_number":12} +data: {"type":"response.function_call_arguments.done","arguments":"{\"a\":3,\"b\":5}","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","output_index":1,"sequence_number":18} event: response.output_item.done -data: {"type":"response.output_item.done","item":{"id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","type":"function_call","status":"completed","arguments":"{\"a\":3,\"b\":5}","call_id":"call_7VaiUXZYuuuwWwviCrckxq6t","name":"add"},"output_index":0,"sequence_number":13} +data: {"type":"response.output_item.done","item":{"id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","type":"function_call","status":"completed","arguments":"{\"a\":3,\"b\":5}","call_id":"call_7VaiUXZYuuuwWwviCrckxq6t","name":"add"},"output_index":1,"sequence_number":19} event: response.completed -data: 
{"type":"response.completed","response":{"id":"resp_0c3fb28cfcf463a500695fa2f0239481a095ec6ce3dfe4d458","object":"response","created_at":1767875312,"status":"completed","background":false,"completed_at":1767875312,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4.1-2025-04-14","output":[{"id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","type":"function_call","status":"completed","arguments":"{\"a\":3,\"b\":5}","call_id":"call_7VaiUXZYuuuwWwviCrckxq6t","name":"add"}],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"default","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":{"input_tokens":58,"input_tokens_details":{"cached_tokens":0},"output_tokens":18,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":76},"user":null,"metadata":{}},"sequence_number":14} +data: {"type":"response.completed","response":{"id":"resp_0c3fb28cfcf463a500695fa2f0239481a095ec6ce3dfe4d458","object":"response","created_at":1767875312,"status":"completed","background":false,"completed_at":1767875312,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4.1-2025-04-14","output":[{"id":"rs_0c3fb28cfcf463a500695fa2f0a0a881a0890103ba88b0628e","type":"reasoning","status":"completed","summary":[{"type":"summary_text","text":"The user wants to add 3 and 5. 
Let me call the add function."}]},{"id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","type":"function_call","status":"completed","arguments":"{\"a\":3,\"b\":5}","call_id":"call_7VaiUXZYuuuwWwviCrckxq6t","name":"add"}],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"default","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":{"input_tokens":58,"input_tokens_details":{"cached_tokens":0},"output_tokens":18,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":76},"user":null,"metadata":{}},"sequence_number":20} diff --git a/fixtures/openai/responses/streaming/simple.txtar b/fixtures/openai/responses/streaming/simple.txtar index d86aa6e4..c8736f9d 100644 --- a/fixtures/openai/responses/streaming/simple.txtar +++ b/fixtures/openai/responses/streaming/simple.txtar @@ -13,71 +13,89 @@ event: response.in_progress data: 
{"type":"response.in_progress","response":{"id":"resp_0f9c4b2f224d858000695fa062bf048197a680f357bbb09000","object":"response","created_at":1767874658,"status":"in_progress","background":false,"completed_at":null,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4o-mini-2024-07-18","output":[],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}},"sequence_number":1} event: response.output_item.added -data: {"type":"response.output_item.added","item":{"id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","type":"message","status":"in_progress","content":[],"role":"assistant"},"output_index":0,"sequence_number":2} +data: {"type":"response.output_item.added","item":{"id":"rs_0f9c4b2f224d858000695fa063a0708197af73c2f37cb0b9d3","type":"reasoning","status":"in_progress","summary":[]},"output_index":0,"sequence_number":2} + +event: response.reasoning_summary_part.added +data: {"type":"response.reasoning_summary_part.added","item_id":"rs_0f9c4b2f224d858000695fa063a0708197af73c2f37cb0b9d3","output_index":0,"part":{"type":"summary_text","text":""},"summary_index":0,"sequence_number":3} + +event: response.reasoning_summary_text.delta +data: {"type":"response.reasoning_summary_text.delta","item_id":"rs_0f9c4b2f224d858000695fa063a0708197af73c2f37cb0b9d3","output_index":0,"summary_index":0,"delta":"The user wants a joke. 
I will tell a classic scarecrow joke.","sequence_number":4} + +event: response.reasoning_summary_text.done +data: {"type":"response.reasoning_summary_text.done","item_id":"rs_0f9c4b2f224d858000695fa063a0708197af73c2f37cb0b9d3","output_index":0,"summary_index":0,"text":"The user wants a joke. I will tell a classic scarecrow joke.","sequence_number":5} + +event: response.reasoning_summary_part.done +data: {"type":"response.reasoning_summary_part.done","item_id":"rs_0f9c4b2f224d858000695fa063a0708197af73c2f37cb0b9d3","output_index":0,"part":{"type":"summary_text","text":"The user wants a joke. I will tell a classic scarecrow joke."},"summary_index":0,"sequence_number":6} + +event: response.output_item.done +data: {"type":"response.output_item.done","item":{"id":"rs_0f9c4b2f224d858000695fa063a0708197af73c2f37cb0b9d3","type":"reasoning","status":"completed","summary":[{"type":"summary_text","text":"The user wants a joke. I will tell a classic scarecrow joke."}]},"output_index":0,"sequence_number":7} + +event: response.output_item.added +data: {"type":"response.output_item.added","item":{"id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","type":"message","status":"in_progress","content":[],"role":"assistant"},"output_index":1,"sequence_number":8} event: response.content_part.added -data: {"type":"response.content_part.added","content_index":0,"item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","output_index":0,"part":{"type":"output_text","annotations":[],"logprobs":[],"text":""},"sequence_number":3} +data: {"type":"response.content_part.added","content_index":0,"item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","output_index":1,"part":{"type":"output_text","annotations":[],"logprobs":[],"text":""},"sequence_number":9} event: response.output_text.delta -data: 
{"type":"response.output_text.delta","content_index":0,"delta":"Why","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"N16SG5UiLncOU","output_index":0,"sequence_number":4} +data: {"type":"response.output_text.delta","content_index":0,"delta":"Why","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"N16SG5UiLncOU","output_index":1,"sequence_number":10} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" did","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"OpojJ3pv0h55","output_index":0,"sequence_number":5} +data: {"type":"response.output_text.delta","content_index":0,"delta":" did","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"OpojJ3pv0h55","output_index":1,"sequence_number":11} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" the","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"11RCrnBxLo5x","output_index":0,"sequence_number":6} +data: {"type":"response.output_text.delta","content_index":0,"delta":" the","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"11RCrnBxLo5x","output_index":1,"sequence_number":12} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" scare","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"QZrRBlk6BV","output_index":0,"sequence_number":7} +data: {"type":"response.output_text.delta","content_index":0,"delta":" scare","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"QZrRBlk6BV","output_index":1,"sequence_number":13} event: response.output_text.delta -data: 
{"type":"response.output_text.delta","content_index":0,"delta":"crow","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"gp7F8IVupiHG","output_index":0,"sequence_number":8} +data: {"type":"response.output_text.delta","content_index":0,"delta":"crow","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"gp7F8IVupiHG","output_index":1,"sequence_number":14} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" win","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"uKq4X8mT1jl9","output_index":0,"sequence_number":9} +data: {"type":"response.output_text.delta","content_index":0,"delta":" win","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"uKq4X8mT1jl9","output_index":1,"sequence_number":15} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" an","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"2Ox5JzaAsJHuT","output_index":0,"sequence_number":10} +data: {"type":"response.output_text.delta","content_index":0,"delta":" an","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"2Ox5JzaAsJHuT","output_index":1,"sequence_number":16} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" award","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"ZOQbZabNAQ","output_index":0,"sequence_number":11} +data: {"type":"response.output_text.delta","content_index":0,"delta":" award","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"ZOQbZabNAQ","output_index":1,"sequence_number":17} event: response.output_text.delta -data: 
{"type":"response.output_text.delta","content_index":0,"delta":"?\n\n","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"N2dSd0FHBxooR","output_index":0,"sequence_number":12} +data: {"type":"response.output_text.delta","content_index":0,"delta":"?\n\n","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"N2dSd0FHBxooR","output_index":1,"sequence_number":18} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":"Because","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"LZ1O4laHt","output_index":0,"sequence_number":13} +data: {"type":"response.output_text.delta","content_index":0,"delta":"Because","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"LZ1O4laHt","output_index":1,"sequence_number":19} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" he","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"dqcS6ePaMvxMD","output_index":0,"sequence_number":14} +data: {"type":"response.output_text.delta","content_index":0,"delta":" he","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"dqcS6ePaMvxMD","output_index":1,"sequence_number":20} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" was","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"nR6CtC7MUsWW","output_index":0,"sequence_number":15} +data: {"type":"response.output_text.delta","content_index":0,"delta":" was","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"nR6CtC7MUsWW","output_index":1,"sequence_number":21} event: response.output_text.delta -data: 
{"type":"response.output_text.delta","content_index":0,"delta":" outstanding","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"dNVG","output_index":0,"sequence_number":16} +data: {"type":"response.output_text.delta","content_index":0,"delta":" outstanding","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"dNVG","output_index":1,"sequence_number":22} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" in","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"P7w4jjOcdVOla","output_index":0,"sequence_number":17} +data: {"type":"response.output_text.delta","content_index":0,"delta":" in","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"P7w4jjOcdVOla","output_index":1,"sequence_number":23} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" his","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"u9dg4RLIld4e","output_index":0,"sequence_number":18} +data: {"type":"response.output_text.delta","content_index":0,"delta":" his","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"u9dg4RLIld4e","output_index":1,"sequence_number":24} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" field","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"qefuqzOCOy","output_index":0,"sequence_number":19} +data: {"type":"response.output_text.delta","content_index":0,"delta":" field","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"qefuqzOCOy","output_index":1,"sequence_number":25} event: response.output_text.delta -data: 
{"type":"response.output_text.delta","content_index":0,"delta":"!","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"DT9j4dSh0xyJdxU","output_index":0,"sequence_number":20} +data: {"type":"response.output_text.delta","content_index":0,"delta":"!","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"DT9j4dSh0xyJdxU","output_index":1,"sequence_number":26} event: response.output_text.done -data: {"type":"response.output_text.done","content_index":0,"item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"output_index":0,"sequence_number":21,"text":"Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!"} +data: {"type":"response.output_text.done","content_index":0,"item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"output_index":1,"sequence_number":27,"text":"Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!"} event: response.content_part.done -data: {"type":"response.content_part.done","content_index":0,"item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","output_index":0,"part":{"type":"output_text","annotations":[],"logprobs":[],"text":"Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!"},"sequence_number":22} +data: {"type":"response.content_part.done","content_index":0,"item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","output_index":1,"part":{"type":"output_text","annotations":[],"logprobs":[],"text":"Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!"},"sequence_number":28} event: response.output_item.done -data: {"type":"response.output_item.done","item":{"id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"logprobs":[],"text":"Why did the scarecrow win an award?\n\nBecause he was 
outstanding in his field!"}],"role":"assistant"},"output_index":0,"sequence_number":23} +data: {"type":"response.output_item.done","item":{"id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"logprobs":[],"text":"Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!"}],"role":"assistant"},"output_index":1,"sequence_number":29} event: response.completed -data: {"type":"response.completed","response":{"id":"resp_0f9c4b2f224d858000695fa062bf048197a680f357bbb09000","object":"response","created_at":1767874658,"status":"completed","background":false,"completed_at":1767874660,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4o-mini-2024-07-18","output":[{"id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"logprobs":[],"text":"Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!"}],"role":"assistant"}],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"default","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":{"input_tokens":11,"input_tokens_details":{"cached_tokens":0},"output_tokens":18,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":29},"user":null,"metadata":{}},"sequence_number":24} +data: 
{"type":"response.completed","response":{"id":"resp_0f9c4b2f224d858000695fa062bf048197a680f357bbb09000","object":"response","created_at":1767874658,"status":"completed","background":false,"completed_at":1767874660,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4o-mini-2024-07-18","output":[{"id":"rs_0f9c4b2f224d858000695fa063a0708197af73c2f37cb0b9d3","type":"reasoning","status":"completed","summary":[{"type":"summary_text","text":"The user wants a joke. I will tell a classic scarecrow joke."}]},{"id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"logprobs":[],"text":"Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!"}],"role":"assistant"}],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"default","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":{"input_tokens":11,"input_tokens_details":{"cached_tokens":0},"output_tokens":18,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":29},"user":null,"metadata":{}},"sequence_number":30} diff --git a/intercept/responses/base.go b/intercept/responses/base.go index 69db3878..0a889bb5 100644 --- a/intercept/responses/base.go +++ b/intercept/responses/base.go @@ -260,6 +260,9 @@ func (i *responsesInterceptionBase) recordNonInjectedToolUsage(ctx context.Conte return } + // Capture any reasoning items from the response output as model thoughts. 
+ thoughtRecords := i.extractModelThoughts(response) + for _, item := range response.Output { var args recorder.ToolArgs @@ -280,9 +283,13 @@ func (i *responsesInterceptionBase) recordNonInjectedToolUsage(ctx context.Conte Tool: item.Name, Args: args, Injected: false, + ModelThoughts: thoughtRecords, }); err != nil { i.logger.Warn(ctx, "failed to record tool usage", slog.Error(err), slog.F("tool", item.Name)) } + // Clear after first use to avoid duplicating across + // multiple tool calls in the same message. + thoughtRecords = nil } } @@ -326,6 +333,34 @@ func (i *responsesInterceptionBase) recordTokenUsage(ctx context.Context, respon } } +// extractModelThoughts extracts reasoning summary items from response output +// and converts them to ModelThoughtRecords for association with tool usage. +func (i *responsesInterceptionBase) extractModelThoughts(response *responses.Response) []*recorder.ModelThoughtRecord { + if response == nil { + return nil + } + + var thoughts []*recorder.ModelThoughtRecord + for _, item := range response.Output { + if item.Type != string(constant.ValueOf[constant.Reasoning]()) { + continue + } + + reasoning := item.AsReasoning() + for _, summary := range reasoning.Summary { + if summary.Text == "" { + continue + } + thoughts = append(thoughts, &recorder.ModelThoughtRecord{ + Content: summary.Text, + CreatedAt: time.Now(), + }) + } + } + + return thoughts +} + func (i *responsesInterceptionBase) hasInjectableTools() bool { return i.mcpProxy != nil && len(i.mcpProxy.ListTools()) > 0 } diff --git a/intercept/responses/injected_tools.go b/intercept/responses/injected_tools.go index c3934fa3..db81941f 100644 --- a/intercept/responses/injected_tools.go +++ b/intercept/responses/injected_tools.go @@ -109,9 +109,15 @@ func (i *responsesInterceptionBase) handleInjectedToolCalls(ctx context.Context, return nil, nil } + // Capture any reasoning items from the response output as model thoughts. 
+ thoughtRecords := i.extractModelThoughts(response) + var results []responses.ResponseInputItemUnionParam for _, fc := range pending { - results = append(results, i.invokeInjectedTool(ctx, response.ID, fc)) + results = append(results, i.invokeInjectedTool(ctx, response.ID, fc, thoughtRecords)) + // Clear after first use to avoid duplicating across + // multiple tool calls in the same message. + thoughtRecords = nil } return results, nil @@ -196,7 +202,7 @@ func (i *responsesInterceptionBase) getPendingInjectedToolCalls(response *respon return calls } -func (i *responsesInterceptionBase) invokeInjectedTool(ctx context.Context, responseID string, fc responses.ResponseFunctionToolCall) responses.ResponseInputItemUnionParam { +func (i *responsesInterceptionBase) invokeInjectedTool(ctx context.Context, responseID string, fc responses.ResponseFunctionToolCall, thoughtRecords []*recorder.ModelThoughtRecord) responses.ResponseInputItemUnionParam { tool := i.mcpProxy.GetTool(fc.Name) if tool == nil { return responses.ResponseInputItemParamOfFunctionCallOutput(fc.CallID, fmt.Sprintf("error: unknown injected function %q", fc.ID)) @@ -213,6 +219,7 @@ func (i *responsesInterceptionBase) invokeInjectedTool(ctx context.Context, resp Args: args, Injected: true, InvocationError: err, + ModelThoughts: thoughtRecords, }) var output string diff --git a/internal/integrationtest/bridge_test.go b/internal/integrationtest/bridge_test.go index 4ebcc997..918cfdee 100644 --- a/internal/integrationtest/bridge_test.go +++ b/internal/integrationtest/bridge_test.go @@ -155,29 +155,14 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { t.Cleanup(cancel) fix := fixtures.Parse(t, fixtures.AntSingleBuiltinTool) - upstream := testutil.NewMockUpstream(t, ctx, testutil.NewFixtureResponse(fix)) - - recorderClient := &testutil.MockRecorder{} - logger := slogtest.Make(t, &slogtest.Options{}).Leveled(slog.LevelDebug) - providers := 
[]aibridge.Provider{provider.NewAnthropic(anthropicCfg(upstream.URL, apiKey), nil)} - b, err := aibridge.NewRequestBridge(ctx, providers, recorderClient, mcp.NewServerProxyManager(nil, testTracer), logger, nil, testTracer) - require.NoError(t, err) + upstream := newMockUpstream(t, ctx, newFixtureResponse(fix)) - mockSrv := httptest.NewUnstartedServer(b) - t.Cleanup(mockSrv.Close) - mockSrv.Config.BaseContext = func(_ net.Listener) context.Context { - return aibcontext.AsActor(ctx, userID, nil) - } - mockSrv.Start() + bridgeServer := newBridgeTestServer(t, ctx, upstream.URL) reqBody, err := sjson.SetBytes(fix.Request(), "stream", tc.streaming) require.NoError(t, err) - req := createAnthropicMessagesReq(t, mockSrv.URL, reqBody) - client := &http.Client{} - resp, err := client.Do(req) - require.NoError(t, err) + resp := bridgeServer.makeRequest(t, http.MethodPost, pathAnthropicMessages, reqBody) require.Equal(t, http.StatusOK, resp.StatusCode) - defer resp.Body.Close() if tc.streaming { sp := aibridge.NewSSEParser() @@ -187,7 +172,7 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { } // Verify tool usage was recorded with associated model thoughts. 
- toolUsages := recorderClient.RecordedToolUsages() + toolUsages := bridgeServer.Recorder.RecordedToolUsages() require.Len(t, toolUsages, 1) assert.Equal(t, "Read", toolUsages[0].Tool) assert.Equal(t, tc.expectedToolCallID, toolUsages[0].ToolCallID) @@ -197,7 +182,7 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { assert.Contains(t, toolUsages[0].ModelThoughts[0].Content, "The user wants me to read") assert.Contains(t, toolUsages[0].ModelThoughts[0].Content, tc.expectedThinkingSubstr) - recorderClient.VerifyAllInterceptionsEnded(t) + bridgeServer.Recorder.VerifyAllInterceptionsEnded(t) }) } }) @@ -211,39 +196,24 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { // Use the simple fixture which has no tool calls — any thinking blocks // should not be persisted since they can't be associated with a tool call. fix := fixtures.Parse(t, fixtures.AntSimple) - upstream := testutil.NewMockUpstream(t, ctx, testutil.NewFixtureResponse(fix)) - - recorderClient := &testutil.MockRecorder{} - logger := slogtest.Make(t, &slogtest.Options{}).Leveled(slog.LevelDebug) - providers := []aibridge.Provider{provider.NewAnthropic(anthropicCfg(upstream.URL, apiKey), nil)} - b, err := aibridge.NewRequestBridge(ctx, providers, recorderClient, mcp.NewServerProxyManager(nil, testTracer), logger, nil, testTracer) - require.NoError(t, err) + upstream := newMockUpstream(t, ctx, newFixtureResponse(fix)) - mockSrv := httptest.NewUnstartedServer(b) - t.Cleanup(mockSrv.Close) - mockSrv.Config.BaseContext = func(_ net.Listener) context.Context { - return aibcontext.AsActor(ctx, userID, nil) - } - mockSrv.Start() + bridgeServer := newBridgeTestServer(t, ctx, upstream.URL) reqBody, err := sjson.SetBytes(fix.Request(), "stream", true) require.NoError(t, err) - req := createAnthropicMessagesReq(t, mockSrv.URL, reqBody) - client := &http.Client{} - resp, err := client.Do(req) - require.NoError(t, err) + resp := bridgeServer.makeRequest(t, http.MethodPost, pathAnthropicMessages, reqBody) 
require.Equal(t, http.StatusOK, resp.StatusCode) - defer resp.Body.Close() sp := aibridge.NewSSEParser() require.NoError(t, sp.Parse(resp.Body)) // No tool usages (and therefore no thoughts) should be recorded // when there are no tool calls. - toolUsages := recorderClient.RecordedToolUsages() + toolUsages := bridgeServer.Recorder.RecordedToolUsages() assert.Empty(t, toolUsages) - recorderClient.VerifyAllInterceptionsEnded(t) + bridgeServer.Recorder.VerifyAllInterceptionsEnded(t) }) } diff --git a/internal/integrationtest/responses_test.go b/internal/integrationtest/responses_test.go index eee1235f..0573e7dd 100644 --- a/internal/integrationtest/responses_test.go +++ b/internal/integrationtest/responses_test.go @@ -374,6 +374,7 @@ func TestResponsesOutputMatchesUpstream(t *testing.T) { require.Len(t, recordedTools, 1) recordedTools[0].InterceptionID = tc.expectToolRecorded.InterceptionID // ignore interception id (interception id is not constant and response doesn't contain it) recordedTools[0].CreatedAt = tc.expectToolRecorded.CreatedAt // ignore time + recordedTools[0].ModelThoughts = tc.expectToolRecorded.ModelThoughts // ignore model thoughts (tested separately) require.Equal(t, tc.expectToolRecorded, recordedTools[0]) } else { require.Empty(t, recordedTools) @@ -936,6 +937,89 @@ func TestResponsesInjectedTool(t *testing.T) { } } +func TestResponsesModelThoughts(t *testing.T) { + t.Parallel() + + t.Run("reasoning captured with builtin tool", func(t *testing.T) { + t.Parallel() + + cases := []struct { + streaming bool + expectedToolCallID string + }{ + { + streaming: false, + expectedToolCallID: "call_CJSaa2u51JG996575oVljuNq", + }, + { + streaming: true, + expectedToolCallID: "call_7VaiUXZYuuuwWwviCrckxq6t", + }, + } + + for _, tc := range cases { + t.Run(fmt.Sprintf("streaming=%v", tc.streaming), func(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), time.Second*30) + t.Cleanup(cancel) + + var fix fixtures.Fixture + if 
tc.streaming { + fix = fixtures.Parse(t, fixtures.OaiResponsesStreamingBuiltinTool) + } else { + fix = fixtures.Parse(t, fixtures.OaiResponsesBlockingSingleBuiltinTool) + } + upstream := newMockUpstream(t, ctx, newFixtureResponse(fix)) + + bridgeServer := newBridgeTestServer(t, ctx, upstream.URL) + + resp := bridgeServer.makeRequest(t, http.MethodPost, pathOpenAIResponses, fix.Request()) + require.Equal(t, http.StatusOK, resp.StatusCode) + + _, err := io.ReadAll(resp.Body) + require.NoError(t, err) + + // Verify tool usage was recorded with associated model thoughts. + toolUsages := bridgeServer.Recorder.RecordedToolUsages() + require.Len(t, toolUsages, 1) + require.Equal(t, "add", toolUsages[0].Tool) + require.Equal(t, tc.expectedToolCallID, toolUsages[0].ToolCallID) + + // Model thoughts should be embedded in the tool usage record. + require.Len(t, toolUsages[0].ModelThoughts, 1) + require.Contains(t, toolUsages[0].ModelThoughts[0].Content, "The user wants to add 3 and 5") + require.Contains(t, toolUsages[0].ModelThoughts[0].Content, "Let me call the add function") + }) + } + }) + + t.Run("no thoughts without tool calls", func(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), time.Second*30) + t.Cleanup(cancel) + + // Use the simple fixture which has no tool calls — any reasoning + // should not be persisted since it can't be associated with a tool call. + fix := fixtures.Parse(t, fixtures.OaiResponsesStreamingSimple) + upstream := newMockUpstream(t, ctx, newFixtureResponse(fix)) + + bridgeServer := newBridgeTestServer(t, ctx, upstream.URL) + + resp := bridgeServer.makeRequest(t, http.MethodPost, pathOpenAIResponses, fix.Request()) + require.Equal(t, http.StatusOK, resp.StatusCode) + + _, err := io.ReadAll(resp.Body) + require.NoError(t, err) + + // No tool usages (and therefore no thoughts) should be recorded + // when there are no tool calls. 
+ toolUsages := bridgeServer.Recorder.RecordedToolUsages() + require.Empty(t, toolUsages) + }) +} + func requireResponsesError(t *testing.T, code int, message string, body []byte) { var respErr responses.Error err := json.Unmarshal(body, &respErr) From 1c9a526c150db92976bdf2c0ca2507b0238b4a22 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 6 Mar 2026 15:54:11 +0200 Subject: [PATCH 04/14] chore: refactor tests Signed-off-by: Danny Kopping --- .../openai/responses/streaming/simple.txtar | 64 +++++++------------ internal/integrationtest/bridge_test.go | 16 ++--- internal/integrationtest/responses_test.go | 2 +- 3 files changed, 30 insertions(+), 52 deletions(-) diff --git a/fixtures/openai/responses/streaming/simple.txtar b/fixtures/openai/responses/streaming/simple.txtar index c8736f9d..d86aa6e4 100644 --- a/fixtures/openai/responses/streaming/simple.txtar +++ b/fixtures/openai/responses/streaming/simple.txtar @@ -13,89 +13,71 @@ event: response.in_progress data: {"type":"response.in_progress","response":{"id":"resp_0f9c4b2f224d858000695fa062bf048197a680f357bbb09000","object":"response","created_at":1767874658,"status":"in_progress","background":false,"completed_at":null,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4o-mini-2024-07-18","output":[],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}},"sequence_number":1} event: response.output_item.added -data: 
{"type":"response.output_item.added","item":{"id":"rs_0f9c4b2f224d858000695fa063a0708197af73c2f37cb0b9d3","type":"reasoning","status":"in_progress","summary":[]},"output_index":0,"sequence_number":2} - -event: response.reasoning_summary_part.added -data: {"type":"response.reasoning_summary_part.added","item_id":"rs_0f9c4b2f224d858000695fa063a0708197af73c2f37cb0b9d3","output_index":0,"part":{"type":"summary_text","text":""},"summary_index":0,"sequence_number":3} - -event: response.reasoning_summary_text.delta -data: {"type":"response.reasoning_summary_text.delta","item_id":"rs_0f9c4b2f224d858000695fa063a0708197af73c2f37cb0b9d3","output_index":0,"summary_index":0,"delta":"The user wants a joke. I will tell a classic scarecrow joke.","sequence_number":4} - -event: response.reasoning_summary_text.done -data: {"type":"response.reasoning_summary_text.done","item_id":"rs_0f9c4b2f224d858000695fa063a0708197af73c2f37cb0b9d3","output_index":0,"summary_index":0,"text":"The user wants a joke. I will tell a classic scarecrow joke.","sequence_number":5} - -event: response.reasoning_summary_part.done -data: {"type":"response.reasoning_summary_part.done","item_id":"rs_0f9c4b2f224d858000695fa063a0708197af73c2f37cb0b9d3","output_index":0,"part":{"type":"summary_text","text":"The user wants a joke. I will tell a classic scarecrow joke."},"summary_index":0,"sequence_number":6} - -event: response.output_item.done -data: {"type":"response.output_item.done","item":{"id":"rs_0f9c4b2f224d858000695fa063a0708197af73c2f37cb0b9d3","type":"reasoning","status":"completed","summary":[{"type":"summary_text","text":"The user wants a joke. 
I will tell a classic scarecrow joke."}]},"output_index":0,"sequence_number":7} - -event: response.output_item.added -data: {"type":"response.output_item.added","item":{"id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","type":"message","status":"in_progress","content":[],"role":"assistant"},"output_index":1,"sequence_number":8} +data: {"type":"response.output_item.added","item":{"id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","type":"message","status":"in_progress","content":[],"role":"assistant"},"output_index":0,"sequence_number":2} event: response.content_part.added -data: {"type":"response.content_part.added","content_index":0,"item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","output_index":1,"part":{"type":"output_text","annotations":[],"logprobs":[],"text":""},"sequence_number":9} +data: {"type":"response.content_part.added","content_index":0,"item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","output_index":0,"part":{"type":"output_text","annotations":[],"logprobs":[],"text":""},"sequence_number":3} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":"Why","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"N16SG5UiLncOU","output_index":1,"sequence_number":10} +data: {"type":"response.output_text.delta","content_index":0,"delta":"Why","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"N16SG5UiLncOU","output_index":0,"sequence_number":4} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" did","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"OpojJ3pv0h55","output_index":1,"sequence_number":11} +data: {"type":"response.output_text.delta","content_index":0,"delta":" 
did","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"OpojJ3pv0h55","output_index":0,"sequence_number":5} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" the","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"11RCrnBxLo5x","output_index":1,"sequence_number":12} +data: {"type":"response.output_text.delta","content_index":0,"delta":" the","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"11RCrnBxLo5x","output_index":0,"sequence_number":6} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" scare","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"QZrRBlk6BV","output_index":1,"sequence_number":13} +data: {"type":"response.output_text.delta","content_index":0,"delta":" scare","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"QZrRBlk6BV","output_index":0,"sequence_number":7} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":"crow","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"gp7F8IVupiHG","output_index":1,"sequence_number":14} +data: {"type":"response.output_text.delta","content_index":0,"delta":"crow","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"gp7F8IVupiHG","output_index":0,"sequence_number":8} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" win","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"uKq4X8mT1jl9","output_index":1,"sequence_number":15} +data: {"type":"response.output_text.delta","content_index":0,"delta":" 
win","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"uKq4X8mT1jl9","output_index":0,"sequence_number":9} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" an","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"2Ox5JzaAsJHuT","output_index":1,"sequence_number":16} +data: {"type":"response.output_text.delta","content_index":0,"delta":" an","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"2Ox5JzaAsJHuT","output_index":0,"sequence_number":10} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" award","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"ZOQbZabNAQ","output_index":1,"sequence_number":17} +data: {"type":"response.output_text.delta","content_index":0,"delta":" award","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"ZOQbZabNAQ","output_index":0,"sequence_number":11} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":"?\n\n","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"N2dSd0FHBxooR","output_index":1,"sequence_number":18} +data: {"type":"response.output_text.delta","content_index":0,"delta":"?\n\n","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"N2dSd0FHBxooR","output_index":0,"sequence_number":12} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":"Because","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"LZ1O4laHt","output_index":1,"sequence_number":19} +data: 
{"type":"response.output_text.delta","content_index":0,"delta":"Because","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"LZ1O4laHt","output_index":0,"sequence_number":13} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" he","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"dqcS6ePaMvxMD","output_index":1,"sequence_number":20} +data: {"type":"response.output_text.delta","content_index":0,"delta":" he","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"dqcS6ePaMvxMD","output_index":0,"sequence_number":14} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" was","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"nR6CtC7MUsWW","output_index":1,"sequence_number":21} +data: {"type":"response.output_text.delta","content_index":0,"delta":" was","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"nR6CtC7MUsWW","output_index":0,"sequence_number":15} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" outstanding","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"dNVG","output_index":1,"sequence_number":22} +data: {"type":"response.output_text.delta","content_index":0,"delta":" outstanding","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"dNVG","output_index":0,"sequence_number":16} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" in","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"P7w4jjOcdVOla","output_index":1,"sequence_number":23} +data: 
{"type":"response.output_text.delta","content_index":0,"delta":" in","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"P7w4jjOcdVOla","output_index":0,"sequence_number":17} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" his","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"u9dg4RLIld4e","output_index":1,"sequence_number":24} +data: {"type":"response.output_text.delta","content_index":0,"delta":" his","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"u9dg4RLIld4e","output_index":0,"sequence_number":18} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":" field","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"qefuqzOCOy","output_index":1,"sequence_number":25} +data: {"type":"response.output_text.delta","content_index":0,"delta":" field","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"qefuqzOCOy","output_index":0,"sequence_number":19} event: response.output_text.delta -data: {"type":"response.output_text.delta","content_index":0,"delta":"!","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"DT9j4dSh0xyJdxU","output_index":1,"sequence_number":26} +data: {"type":"response.output_text.delta","content_index":0,"delta":"!","item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"obfuscation":"DT9j4dSh0xyJdxU","output_index":0,"sequence_number":20} event: response.output_text.done -data: {"type":"response.output_text.done","content_index":0,"item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"output_index":1,"sequence_number":27,"text":"Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!"} +data: 
{"type":"response.output_text.done","content_index":0,"item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","logprobs":[],"output_index":0,"sequence_number":21,"text":"Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!"} event: response.content_part.done -data: {"type":"response.content_part.done","content_index":0,"item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","output_index":1,"part":{"type":"output_text","annotations":[],"logprobs":[],"text":"Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!"},"sequence_number":28} +data: {"type":"response.content_part.done","content_index":0,"item_id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","output_index":0,"part":{"type":"output_text","annotations":[],"logprobs":[],"text":"Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!"},"sequence_number":22} event: response.output_item.done -data: {"type":"response.output_item.done","item":{"id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"logprobs":[],"text":"Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!"}],"role":"assistant"},"output_index":1,"sequence_number":29} +data: {"type":"response.output_item.done","item":{"id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"logprobs":[],"text":"Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!"}],"role":"assistant"},"output_index":0,"sequence_number":23} event: response.completed -data: 
{"type":"response.completed","response":{"id":"resp_0f9c4b2f224d858000695fa062bf048197a680f357bbb09000","object":"response","created_at":1767874658,"status":"completed","background":false,"completed_at":1767874660,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4o-mini-2024-07-18","output":[{"id":"rs_0f9c4b2f224d858000695fa063a0708197af73c2f37cb0b9d3","type":"reasoning","status":"completed","summary":[{"type":"summary_text","text":"The user wants a joke. I will tell a classic scarecrow joke."}]},{"id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"logprobs":[],"text":"Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!"}],"role":"assistant"}],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"default","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":{"input_tokens":11,"input_tokens_details":{"cached_tokens":0},"output_tokens":18,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":29},"user":null,"metadata":{}},"sequence_number":30} +data: {"type":"response.completed","response":{"id":"resp_0f9c4b2f224d858000695fa062bf048197a680f357bbb09000","object":"response","created_at":1767874658,"status":"completed","background":false,"completed_at":1767874660,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4o-mini-2024-07-18","output":[{"id":"msg_0f9c4b2f224d858000695fa063d4708197af73c2f37cb0b9d3","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"logprobs":[],"text":"Why did the scarecrow win an 
award?\n\nBecause he was outstanding in his field!"}],"role":"assistant"}],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"default","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":{"input_tokens":11,"input_tokens_details":{"cached_tokens":0},"output_tokens":18,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":29},"user":null,"metadata":{}},"sequence_number":24} diff --git a/internal/integrationtest/bridge_test.go b/internal/integrationtest/bridge_test.go index 918cfdee..9cdf7ac1 100644 --- a/internal/integrationtest/bridge_test.go +++ b/internal/integrationtest/bridge_test.go @@ -131,19 +131,16 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { t.Parallel() cases := []struct { - streaming bool - expectedToolCallID string - expectedThinkingSubstr string + streaming bool + expectedToolCallID string }{ { - streaming: true, - expectedToolCallID: "toolu_01RX68weRSquLx6HUTj65iBo", - expectedThinkingSubstr: "Let me find and read it.", + streaming: true, + expectedToolCallID: "toolu_01RX68weRSquLx6HUTj65iBo", }, { - streaming: false, - expectedToolCallID: "toolu_01AusGgY5aKFhzWrFBv9JfHq", - expectedThinkingSubstr: "Let me find and read it.", + streaming: false, + expectedToolCallID: "toolu_01AusGgY5aKFhzWrFBv9JfHq", }, } @@ -180,7 +177,6 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { // Model thoughts should be embedded in the tool usage record. 
require.Len(t, toolUsages[0].ModelThoughts, 1) assert.Contains(t, toolUsages[0].ModelThoughts[0].Content, "The user wants me to read") - assert.Contains(t, toolUsages[0].ModelThoughts[0].Content, tc.expectedThinkingSubstr) bridgeServer.Recorder.VerifyAllInterceptionsEnded(t) }) diff --git a/internal/integrationtest/responses_test.go b/internal/integrationtest/responses_test.go index 0573e7dd..900ab783 100644 --- a/internal/integrationtest/responses_test.go +++ b/internal/integrationtest/responses_test.go @@ -1002,7 +1002,7 @@ func TestResponsesModelThoughts(t *testing.T) { // Use the simple fixture which has no tool calls — any reasoning // should not be persisted since it can't be associated with a tool call. - fix := fixtures.Parse(t, fixtures.OaiResponsesStreamingSimple) + fix := fixtures.Parse(t, fixtures.OaiResponsesStreamingCodex) upstream := newMockUpstream(t, ctx, newFixtureResponse(fix)) bridgeServer := newBridgeTestServer(t, ctx, upstream.URL) From 7e4a15a418570fe4d3494c64cc518a549a60d5f2 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 6 Mar 2026 16:07:06 +0200 Subject: [PATCH 05/14] chore: test multiple thoughts Signed-off-by: Danny Kopping --- .../multi_thinking_builtin_tool.txtar | 136 +++++++++++++++++ fixtures/fixtures.go | 9 ++ .../multi_reasoning_builtin_tool.txtar | 142 ++++++++++++++++++ .../multi_reasoning_builtin_tool.txtar | 94 ++++++++++++ internal/integrationtest/bridge_test.go | 34 ++++- internal/integrationtest/responses_test.go | 42 ++++-- 6 files changed, 437 insertions(+), 20 deletions(-) create mode 100644 fixtures/anthropic/multi_thinking_builtin_tool.txtar create mode 100644 fixtures/openai/responses/blocking/multi_reasoning_builtin_tool.txtar create mode 100644 fixtures/openai/responses/streaming/multi_reasoning_builtin_tool.txtar diff --git a/fixtures/anthropic/multi_thinking_builtin_tool.txtar b/fixtures/anthropic/multi_thinking_builtin_tool.txtar new file mode 100644 index 00000000..633d11d9 --- /dev/null +++ 
b/fixtures/anthropic/multi_thinking_builtin_tool.txtar @@ -0,0 +1,136 @@ +Claude Code has builtin tools to (e.g.) explore the filesystem. +This fixture has two thinking blocks before the tool_use block. + +-- request -- +{ + "model": "claude-sonnet-4-20250514", + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": "read the foo file" + } + ] +} + +-- streaming -- +event: message_start +data: {"type":"message_start","message":{"id":"msg_015SQewixvT9s4cABCVvUE6g","type":"message","role":"assistant","model":"claude-sonnet-4-20250514","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":2,"cache_creation_input_tokens":22,"cache_read_input_tokens":13993,"output_tokens":5,"service_tier":"standard"}} } + +event: content_block_start +data: {"type":"content_block_start","index":0,"content_block":{"type":"thinking","thinking":""}} + +event: content_block_delta +data: {"type":"content_block_delta","index":0,"delta":{"type":"thinking_delta","thinking":"The user wants me to read a file called \"foo\". 
Let me find and read it."}} + +event: content_block_delta +data: {"type":"content_block_delta","index":0,"delta":{"type":"signature_delta","signature":"Eu8BCkYICxgCKkBR++kFr7Za2JhF/9OCpjEc46/EcipL75RK+MEbxJ/VBJPWQTWrNGfwb5khWYJtKEpjjkH07cR/MQvThfb7t7CkEgwU4pKwL7NuZXd1/wgaDILyd0bYMqQovWo3dyIw95Ny7yZPljNBDLsvMBdBr7w+RtbU+AlSftjBuBZHp0VzI54/W+9u6f7qfx0JXsVBKldqqOjFvewT8Xm6Qp/77g6/j0zBiuAQABj/6vS1qATjd8KSIFDg9G/tCtzwmV/T/egmzswWd5CBiAhW6lgJgEDRr+gRUrFSOB7o3hypW8FUnUrr1JtzzwMYAQ=="}} + +event: content_block_stop +data: {"type":"content_block_stop","index":0} + +event: content_block_start +data: {"type":"content_block_start","index":1,"content_block":{"type":"thinking","thinking":""}} + +event: content_block_delta +data: {"type":"content_block_delta","index":1,"delta":{"type":"thinking_delta","thinking":"I should use the Read tool to access the file contents."}} + +event: content_block_delta +data: {"type":"content_block_delta","index":1,"delta":{"type":"signature_delta","signature":"Aa1BCkYICxgCKkBR++kFr7Za2JhF/9OCpjEc46/EcipL75RK+MEbxJ/VBJPWQTWrNGfwb5khWYJtKEpjjkH07cR/MQvThfb7t7CkEgwU4pKwL7NuZXd1/wgaDILyd0bYMqQovWo3dyIw95Ny7yZPljNBDLsvMBdBr7w+RtbU+AlSftjBuBZHp0VzI54/W+9u6f7qfx0JXsVBKldqqOjFvewT8Xm6Qp/77g6/j0zBiuAQABj/6vS1qATjd8KSIFDg9G/tCtzwmV/T/egmzswWd5CBiAhW6lgJgEDRr+gRUrFSOB7o3hypW8FUnUrr1JtzzwMYAQ=="}} + +event: content_block_stop +data: {"type":"content_block_stop","index":1} + +event: content_block_start +data: {"type":"content_block_start","index":2,"content_block":{"type":"tool_use","id":"toolu_01RX68weRSquLx6HUTj65iBo","name":"Read","input":{}}} + +event: ping +data: {"type": "ping"} + +event: content_block_delta +data: {"type":"content_block_delta","index":2,"delta":{"type":"input_json_delta","partial_json":""} } + +event: content_block_delta +data: {"type":"content_block_delta","index":2,"delta":{"type":"input_json_delta","partial_json":"{\"file_path\": \"/tmp/blah/foo"} } + +event: content_block_delta +data: 
{"type":"content_block_delta","index":2,"delta":{"type":"input_json_delta","partial_json":"\"}"} } + +event: content_block_stop +data: {"type":"content_block_stop","index":2 } + +event: message_delta +data: {"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"output_tokens":61} } + +event: message_stop +data: {"type":"message_stop" } + + +-- non-streaming -- +{ + "id": "msg_01JHKqEmh7wYuPXqUWUvusfL", + "container": { + "id": "", + "expires_at": "0001-01-01T00:00:00Z" + }, + "content": [ + { + "type": "thinking", + "thinking": "The user wants me to read a file called \"foo\". Let me find and read it.", + "signature": "Eu8BCkYICxgCKkBR++kFr7Za2JhF/9OCpjEc46/EcipL75RK+MEbxJ/VBJPWQTWrNGfwb5khWYJtKEpjjkH07cR/MQvThfb7t7CkEgwU4pKwL7NuZXd1/wgaDILyd0bYMqQovWo3dyIw95Ny7yZPljNBDLsvMBdBr7w+RtbU+AlSftjBuBZHp0VzI54/W+9u6f7qfx0JXsVBKldqqOjFvewT8Xm6Qp/77g6/j0zBiuAQABj/6vS1qATjd8KSIFDg9G/tCtzwmV/T/egmzswWd5CBiAhW6lgJgEDRr+gRUrFSOB7o3hypW8FUnUrr1JtzzwMYAQ==" + }, + { + "type": "thinking", + "thinking": "I should use the Read tool to access the file contents.", + "signature": "Aa1BCkYICxgCKkBR++kFr7Za2JhF/9OCpjEc46/EcipL75RK+MEbxJ/VBJPWQTWrNGfwb5khWYJtKEpjjkH07cR/MQvThfb7t7CkEgwU4pKwL7NuZXd1/wgaDILyd0bYMqQovWo3dyIw95Ny7yZPljNBDLsvMBdBr7w+RtbU+AlSftjBuBZHp0VzI54/W+9u6f7qfx0JXsVBKldqqOjFvewT8Xm6Qp/77g6/j0zBiuAQABj/6vS1qATjd8KSIFDg9G/tCtzwmV/T/egmzswWd5CBiAhW6lgJgEDRr+gRUrFSOB7o3hypW8FUnUrr1JtzzwMYAQ==" + }, + { + "citations": null, + "text": "", + "type": "tool_use", + "id": "toolu_01AusGgY5aKFhzWrFBv9JfHq", + "input": { + "file_path": "/tmp/blah/foo" + }, + "name": "Read", + "content": { + "OfWebSearchResultBlockArray": null, + "OfString": "", + "OfMCPToolResultBlockContent": null, + "error_code": "", + "type": "", + "content": null, + "return_code": 0, + "stderr": "", + "stdout": "" + }, + "tool_use_id": "", + "server_name": "", + "is_error": false, + "file_id": "", + "signature": "", + "thinking": "", + "data": "" + } + ], + "model": 
"claude-sonnet-4-20250514", + "role": "assistant", + "stop_reason": "tool_use", + "stop_sequence": "", + "type": "message", + "usage": { + "cache_creation": { + "ephemeral_1h_input_tokens": 0, + "ephemeral_5m_input_tokens": 0 + }, + "cache_creation_input_tokens": 0, + "cache_read_input_tokens": 23490, + "input_tokens": 5, + "output_tokens": 84, + "server_tool_use": { + "web_search_requests": 0 + }, + "service_tier": "standard" + } +} + diff --git a/fixtures/fixtures.go b/fixtures/fixtures.go index 3c150471..cacf657c 100644 --- a/fixtures/fixtures.go +++ b/fixtures/fixtures.go @@ -15,6 +15,9 @@ var ( //go:embed anthropic/single_builtin_tool.txtar AntSingleBuiltinTool []byte + //go:embed anthropic/multi_thinking_builtin_tool.txtar + AntMultiThinkingBuiltinTool []byte + //go:embed anthropic/single_injected_tool.txtar AntSingleInjectedTool []byte @@ -61,6 +64,9 @@ var ( //go:embed openai/responses/blocking/single_builtin_tool.txtar OaiResponsesBlockingSingleBuiltinTool []byte + //go:embed openai/responses/blocking/multi_reasoning_builtin_tool.txtar + OaiResponsesBlockingMultiReasoningBuiltinTool []byte + //go:embed openai/responses/blocking/cached_input_tokens.txtar OaiResponsesBlockingCachedInputTokens []byte @@ -96,6 +102,9 @@ var ( //go:embed openai/responses/streaming/builtin_tool.txtar OaiResponsesStreamingBuiltinTool []byte + //go:embed openai/responses/streaming/multi_reasoning_builtin_tool.txtar + OaiResponsesStreamingMultiReasoningBuiltinTool []byte + //go:embed openai/responses/streaming/cached_input_tokens.txtar OaiResponsesStreamingCachedInputTokens []byte diff --git a/fixtures/openai/responses/blocking/multi_reasoning_builtin_tool.txtar b/fixtures/openai/responses/blocking/multi_reasoning_builtin_tool.txtar new file mode 100644 index 00000000..022b433e --- /dev/null +++ b/fixtures/openai/responses/blocking/multi_reasoning_builtin_tool.txtar @@ -0,0 +1,142 @@ +Two reasoning output items before a function_call. 
+ +-- request -- +{ + "input": [ + { + "role": "user", + "content": "Is 3 + 5 a prime number? Use the add function to calculate the sum." + } + ], + "model": "gpt-4.1", + "stream": false, + "tools": [ + { + "type": "function", + "name": "add", + "description": "Add two numbers together.", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ] + } + } + ] +} + +-- non-streaming -- +{ + "id": "resp_0da6045a8b68fa5200695fa23dcc2c81a19c849f627abf8a31", + "object": "response", + "created_at": 1767875133, + "status": "completed", + "background": false, + "completed_at": 1767875134, + "error": null, + "incomplete_details": null, + "instructions": null, + "max_output_tokens": null, + "max_tool_calls": null, + "model": "gpt-4.1-2025-04-14", + "output": [ + { + "id": "rs_0da6045a8b68fa5200695fa23e100081a19bf68887d47ae93d", + "type": "reasoning", + "status": "completed", + "summary": [ + { + "type": "summary_text", + "text": "The user wants to add 3 and 5. Let me call the add function." + } + ] + }, + { + "id": "rs_1aa7045a8b68fa5200695fa23e200082b29cf79998e58bf94e", + "type": "reasoning", + "status": "completed", + "summary": [ + { + "type": "summary_text", + "text": "After adding, I will check if the result is prime." 
+ } + ] + }, + { + "id": "fc_0da6045a8b68fa5200695fa23e198081a19bf68887d47ae93d", + "type": "function_call", + "status": "completed", + "arguments": "{\"a\":3,\"b\":5}", + "call_id": "call_CJSaa2u51JG996575oVljuNq", + "name": "add" + } + ], + "parallel_tool_calls": true, + "previous_response_id": null, + "prompt_cache_key": null, + "prompt_cache_retention": null, + "reasoning": { + "effort": null, + "summary": null + }, + "safety_identifier": null, + "service_tier": "default", + "store": true, + "temperature": 1.0, + "text": { + "format": { + "type": "text" + }, + "verbosity": "medium" + }, + "tool_choice": "auto", + "tools": [ + { + "type": "function", + "description": "Add two numbers together.", + "name": "add", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ], + "additionalProperties": false + }, + "strict": true + } + ], + "top_logprobs": 0, + "top_p": 1.0, + "truncation": "disabled", + "usage": { + "input_tokens": 58, + "input_tokens_details": { + "cached_tokens": 0 + }, + "output_tokens": 18, + "output_tokens_details": { + "reasoning_tokens": 0 + }, + "total_tokens": 76 + }, + "user": null, + "metadata": {} +} diff --git a/fixtures/openai/responses/streaming/multi_reasoning_builtin_tool.txtar b/fixtures/openai/responses/streaming/multi_reasoning_builtin_tool.txtar new file mode 100644 index 00000000..b54ebc7a --- /dev/null +++ b/fixtures/openai/responses/streaming/multi_reasoning_builtin_tool.txtar @@ -0,0 +1,94 @@ +Two reasoning output items before a function_call. + +-- request -- +{ + "input": [ + { + "role": "user", + "content": "Is 3 + 5 a prime number? Use the add function to calculate the sum." 
+ } + ], + "model": "gpt-4.1", + "stream": true, + "tools": [ + { + "type": "function", + "name": "add", + "description": "Add two numbers together.", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ] + } + } + ] +} + +-- streaming -- +event: response.created +data: {"type":"response.created","response":{"id":"resp_0c3fb28cfcf463a500695fa2f0239481a095ec6ce3dfe4d458","object":"response","created_at":1767875312,"status":"in_progress","background":false,"completed_at":null,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4.1-2025-04-14","output":[],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}},"sequence_number":0} + +event: response.in_progress +data: 
{"type":"response.in_progress","response":{"id":"resp_0c3fb28cfcf463a500695fa2f0239481a095ec6ce3dfe4d458","object":"response","created_at":1767875312,"status":"in_progress","background":false,"completed_at":null,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4.1-2025-04-14","output":[],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}},"sequence_number":1} + +event: response.output_item.added +data: {"type":"response.output_item.added","item":{"id":"rs_0c3fb28cfcf463a500695fa2f0a0a881a0890103ba88b0628e","type":"reasoning","status":"in_progress","summary":[]},"output_index":0,"sequence_number":2} + +event: response.reasoning_summary_part.added +data: {"type":"response.reasoning_summary_part.added","item_id":"rs_0c3fb28cfcf463a500695fa2f0a0a881a0890103ba88b0628e","output_index":0,"part":{"type":"summary_text","text":""},"summary_index":0,"sequence_number":3} + +event: response.reasoning_summary_text.delta +data: {"type":"response.reasoning_summary_text.delta","item_id":"rs_0c3fb28cfcf463a500695fa2f0a0a881a0890103ba88b0628e","output_index":0,"summary_index":0,"delta":"The user wants to add 3 and 5. 
Let me call the add function.","sequence_number":4} + +event: response.reasoning_summary_text.done +data: {"type":"response.reasoning_summary_text.done","item_id":"rs_0c3fb28cfcf463a500695fa2f0a0a881a0890103ba88b0628e","output_index":0,"summary_index":0,"text":"The user wants to add 3 and 5. Let me call the add function.","sequence_number":5} + +event: response.reasoning_summary_part.done +data: {"type":"response.reasoning_summary_part.done","item_id":"rs_0c3fb28cfcf463a500695fa2f0a0a881a0890103ba88b0628e","output_index":0,"part":{"type":"summary_text","text":"The user wants to add 3 and 5. Let me call the add function."},"summary_index":0,"sequence_number":6} + +event: response.output_item.done +data: {"type":"response.output_item.done","item":{"id":"rs_0c3fb28cfcf463a500695fa2f0a0a881a0890103ba88b0628e","type":"reasoning","status":"completed","summary":[{"type":"summary_text","text":"The user wants to add 3 and 5. Let me call the add function."}]},"output_index":0,"sequence_number":7} + +event: response.output_item.added +data: {"type":"response.output_item.added","item":{"id":"rs_1aa7045a8b68fa5200695fa23e200082b29cf79998e58bf94e","type":"reasoning","status":"in_progress","summary":[]},"output_index":1,"sequence_number":8} + +event: response.reasoning_summary_part.added +data: {"type":"response.reasoning_summary_part.added","item_id":"rs_1aa7045a8b68fa5200695fa23e200082b29cf79998e58bf94e","output_index":1,"part":{"type":"summary_text","text":""},"summary_index":0,"sequence_number":9} + +event: response.reasoning_summary_text.delta +data: {"type":"response.reasoning_summary_text.delta","item_id":"rs_1aa7045a8b68fa5200695fa23e200082b29cf79998e58bf94e","output_index":1,"summary_index":0,"delta":"After adding, I will check if the result is prime.","sequence_number":10} + +event: response.reasoning_summary_text.done +data: 
{"type":"response.reasoning_summary_text.done","item_id":"rs_1aa7045a8b68fa5200695fa23e200082b29cf79998e58bf94e","output_index":1,"summary_index":0,"text":"After adding, I will check if the result is prime.","sequence_number":11} + +event: response.reasoning_summary_part.done +data: {"type":"response.reasoning_summary_part.done","item_id":"rs_1aa7045a8b68fa5200695fa23e200082b29cf79998e58bf94e","output_index":1,"part":{"type":"summary_text","text":"After adding, I will check if the result is prime."},"summary_index":0,"sequence_number":12} + +event: response.output_item.done +data: {"type":"response.output_item.done","item":{"id":"rs_1aa7045a8b68fa5200695fa23e200082b29cf79998e58bf94e","type":"reasoning","status":"completed","summary":[{"type":"summary_text","text":"After adding, I will check if the result is prime."}]},"output_index":1,"sequence_number":13} + +event: response.output_item.added +data: {"type":"response.output_item.added","item":{"id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","type":"function_call","status":"in_progress","arguments":"","call_id":"call_7VaiUXZYuuuwWwviCrckxq6t","name":"add"},"output_index":2,"sequence_number":14} + +event: response.function_call_arguments.delta +data: {"type":"response.function_call_arguments.delta","delta":"{\"a\":3,\"b\":5}","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","obfuscation":"gWZHP8i4lSgQYT","output_index":2,"sequence_number":15} + +event: response.function_call_arguments.done +data: {"type":"response.function_call_arguments.done","arguments":"{\"a\":3,\"b\":5}","item_id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","output_index":2,"sequence_number":16} + +event: response.output_item.done +data: 
{"type":"response.output_item.done","item":{"id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","type":"function_call","status":"completed","arguments":"{\"a\":3,\"b\":5}","call_id":"call_7VaiUXZYuuuwWwviCrckxq6t","name":"add"},"output_index":2,"sequence_number":17} + +event: response.completed +data: {"type":"response.completed","response":{"id":"resp_0c3fb28cfcf463a500695fa2f0239481a095ec6ce3dfe4d458","object":"response","created_at":1767875312,"status":"completed","background":false,"completed_at":1767875312,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4.1-2025-04-14","output":[{"id":"rs_0c3fb28cfcf463a500695fa2f0a0a881a0890103ba88b0628e","type":"reasoning","status":"completed","summary":[{"type":"summary_text","text":"The user wants to add 3 and 5. Let me call the add function."}]},{"id":"rs_1aa7045a8b68fa5200695fa23e200082b29cf79998e58bf94e","type":"reasoning","status":"completed","summary":[{"type":"summary_text","text":"After adding, I will check if the result is prime."}]},{"id":"fc_0c3fb28cfcf463a500695fa2f0b0a881a0890103ba88b0628e","type":"function_call","status":"completed","arguments":"{\"a\":3,\"b\":5}","call_id":"call_7VaiUXZYuuuwWwviCrckxq6t","name":"add"}],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"default","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers 
together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":{"input_tokens":58,"input_tokens_details":{"cached_tokens":0},"output_tokens":18,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":76},"user":null,"metadata":{}},"sequence_number":18} + diff --git a/internal/integrationtest/bridge_test.go b/internal/integrationtest/bridge_test.go index 9cdf7ac1..88678f95 100644 --- a/internal/integrationtest/bridge_test.go +++ b/internal/integrationtest/bridge_test.go @@ -131,27 +131,50 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { t.Parallel() cases := []struct { + name string streaming bool + fixture []byte expectedToolCallID string + expectedThoughts []string }{ { + name: "single thinking block/streaming", streaming: true, + fixture: fixtures.AntSingleBuiltinTool, expectedToolCallID: "toolu_01RX68weRSquLx6HUTj65iBo", + expectedThoughts: []string{"The user wants me to read"}, }, { + name: "single thinking block/blocking", streaming: false, + fixture: fixtures.AntSingleBuiltinTool, expectedToolCallID: "toolu_01AusGgY5aKFhzWrFBv9JfHq", + expectedThoughts: []string{"The user wants me to read"}, + }, + { + name: "multiple thinking blocks/streaming", + streaming: true, + fixture: fixtures.AntMultiThinkingBuiltinTool, + expectedToolCallID: "toolu_01RX68weRSquLx6HUTj65iBo", + expectedThoughts: []string{"The user wants me to read", "I should use the Read tool"}, + }, + { + name: "multiple thinking blocks/blocking", + streaming: false, + fixture: fixtures.AntMultiThinkingBuiltinTool, + expectedToolCallID: "toolu_01AusGgY5aKFhzWrFBv9JfHq", + expectedThoughts: []string{"The user wants me to read", "I should use the Read tool"}, }, } for _, tc := range cases { - t.Run(fmt.Sprintf("streaming=%v", tc.streaming), func(t *testing.T) { + t.Run(tc.name, func(t *testing.T) { t.Parallel() 
ctx, cancel := context.WithTimeout(t.Context(), time.Second*30) t.Cleanup(cancel) - fix := fixtures.Parse(t, fixtures.AntSingleBuiltinTool) + fix := fixtures.Parse(t, tc.fixture) upstream := newMockUpstream(t, ctx, newFixtureResponse(fix)) bridgeServer := newBridgeTestServer(t, ctx, upstream.URL) @@ -174,9 +197,10 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { assert.Equal(t, "Read", toolUsages[0].Tool) assert.Equal(t, tc.expectedToolCallID, toolUsages[0].ToolCallID) - // Model thoughts should be embedded in the tool usage record. - require.Len(t, toolUsages[0].ModelThoughts, 1) - assert.Contains(t, toolUsages[0].ModelThoughts[0].Content, "The user wants me to read") + require.Len(t, toolUsages[0].ModelThoughts, len(tc.expectedThoughts)) + for i, expected := range tc.expectedThoughts { + assert.Contains(t, toolUsages[0].ModelThoughts[i].Content, expected) + } bridgeServer.Recorder.VerifyAllInterceptionsEnded(t) }) diff --git a/internal/integrationtest/responses_test.go b/internal/integrationtest/responses_test.go index 900ab783..3a1519db 100644 --- a/internal/integrationtest/responses_test.go +++ b/internal/integrationtest/responses_test.go @@ -3,7 +3,6 @@ package integrationtest import ( "context" "encoding/json" - "fmt" "io" "net" "net/http" @@ -944,32 +943,45 @@ func TestResponsesModelThoughts(t *testing.T) { t.Parallel() cases := []struct { - streaming bool + name string + fixture []byte expectedToolCallID string + expectedThoughts []string }{ { - streaming: false, + name: "single reasoning/blocking", + fixture: fixtures.OaiResponsesBlockingSingleBuiltinTool, expectedToolCallID: "call_CJSaa2u51JG996575oVljuNq", + expectedThoughts: []string{"The user wants to add 3 and 5"}, }, { - streaming: true, + name: "single reasoning/streaming", + fixture: fixtures.OaiResponsesStreamingBuiltinTool, expectedToolCallID: "call_7VaiUXZYuuuwWwviCrckxq6t", + expectedThoughts: []string{"The user wants to add 3 and 5"}, + }, + { + name: "multiple reasoning 
items/blocking", + fixture: fixtures.OaiResponsesBlockingMultiReasoningBuiltinTool, + expectedToolCallID: "call_CJSaa2u51JG996575oVljuNq", + expectedThoughts: []string{"The user wants to add 3 and 5", "After adding, I will check if the result is prime"}, + }, + { + name: "multiple reasoning items/streaming", + fixture: fixtures.OaiResponsesStreamingMultiReasoningBuiltinTool, + expectedToolCallID: "call_7VaiUXZYuuuwWwviCrckxq6t", + expectedThoughts: []string{"The user wants to add 3 and 5", "After adding, I will check if the result is prime"}, }, } for _, tc := range cases { - t.Run(fmt.Sprintf("streaming=%v", tc.streaming), func(t *testing.T) { + t.Run(tc.name, func(t *testing.T) { t.Parallel() ctx, cancel := context.WithTimeout(t.Context(), time.Second*30) t.Cleanup(cancel) - var fix fixtures.Fixture - if tc.streaming { - fix = fixtures.Parse(t, fixtures.OaiResponsesStreamingBuiltinTool) - } else { - fix = fixtures.Parse(t, fixtures.OaiResponsesBlockingSingleBuiltinTool) - } + fix := fixtures.Parse(t, tc.fixture) upstream := newMockUpstream(t, ctx, newFixtureResponse(fix)) bridgeServer := newBridgeTestServer(t, ctx, upstream.URL) @@ -986,10 +998,10 @@ func TestResponsesModelThoughts(t *testing.T) { require.Equal(t, "add", toolUsages[0].Tool) require.Equal(t, tc.expectedToolCallID, toolUsages[0].ToolCallID) - // Model thoughts should be embedded in the tool usage record. 
- require.Len(t, toolUsages[0].ModelThoughts, 1) - require.Contains(t, toolUsages[0].ModelThoughts[0].Content, "The user wants to add 3 and 5") - require.Contains(t, toolUsages[0].ModelThoughts[0].Content, "Let me call the add function") + require.Len(t, toolUsages[0].ModelThoughts, len(tc.expectedThoughts)) + for i, expected := range tc.expectedThoughts { + require.Contains(t, toolUsages[0].ModelThoughts[i].Content, expected) + } }) } }) From abfc67953193d6b645f2c3dc91a582bba8f217e3 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 6 Mar 2026 16:13:51 +0200 Subject: [PATCH 06/14] chore: refactor tests Signed-off-by: Danny Kopping --- internal/integrationtest/bridge_test.go | 159 +++++++++------------ internal/integrationtest/responses_test.go | 131 +++++++---------- 2 files changed, 122 insertions(+), 168 deletions(-) diff --git a/internal/integrationtest/bridge_test.go b/internal/integrationtest/bridge_test.go index 88678f95..6150173a 100644 --- a/internal/integrationtest/bridge_test.go +++ b/internal/integrationtest/bridge_test.go @@ -127,72 +127,76 @@ func TestAnthropicMessages(t *testing.T) { func TestAnthropicMessagesModelThoughts(t *testing.T) { t.Parallel() - t.Run("thinking captured with builtin tool", func(t *testing.T) { - t.Parallel() - - cases := []struct { - name string - streaming bool - fixture []byte - expectedToolCallID string - expectedThoughts []string - }{ - { - name: "single thinking block/streaming", - streaming: true, - fixture: fixtures.AntSingleBuiltinTool, - expectedToolCallID: "toolu_01RX68weRSquLx6HUTj65iBo", - expectedThoughts: []string{"The user wants me to read"}, - }, - { - name: "single thinking block/blocking", - streaming: false, - fixture: fixtures.AntSingleBuiltinTool, - expectedToolCallID: "toolu_01AusGgY5aKFhzWrFBv9JfHq", - expectedThoughts: []string{"The user wants me to read"}, - }, - { - name: "multiple thinking blocks/streaming", - streaming: true, - fixture: fixtures.AntMultiThinkingBuiltinTool, - 
expectedToolCallID: "toolu_01RX68weRSquLx6HUTj65iBo", - expectedThoughts: []string{"The user wants me to read", "I should use the Read tool"}, - }, - { - name: "multiple thinking blocks/blocking", - streaming: false, - fixture: fixtures.AntMultiThinkingBuiltinTool, - expectedToolCallID: "toolu_01AusGgY5aKFhzWrFBv9JfHq", - expectedThoughts: []string{"The user wants me to read", "I should use the Read tool"}, - }, - } + cases := []struct { + name string + streaming bool + fixture []byte + expectedToolCallID string + expectedThoughts []string // nil means no tool usages expected at all + }{ + { + name: "single thinking block/streaming", + streaming: true, + fixture: fixtures.AntSingleBuiltinTool, + expectedToolCallID: "toolu_01RX68weRSquLx6HUTj65iBo", + expectedThoughts: []string{"The user wants me to read"}, + }, + { + name: "single thinking block/blocking", + streaming: false, + fixture: fixtures.AntSingleBuiltinTool, + expectedToolCallID: "toolu_01AusGgY5aKFhzWrFBv9JfHq", + expectedThoughts: []string{"The user wants me to read"}, + }, + { + name: "multiple thinking blocks/streaming", + streaming: true, + fixture: fixtures.AntMultiThinkingBuiltinTool, + expectedToolCallID: "toolu_01RX68weRSquLx6HUTj65iBo", + expectedThoughts: []string{"The user wants me to read", "I should use the Read tool"}, + }, + { + name: "multiple thinking blocks/blocking", + streaming: false, + fixture: fixtures.AntMultiThinkingBuiltinTool, + expectedToolCallID: "toolu_01AusGgY5aKFhzWrFBv9JfHq", + expectedThoughts: []string{"The user wants me to read", "I should use the Read tool"}, + }, + { + name: "no thoughts without tool calls", + streaming: true, + fixture: fixtures.AntSimple, // This fixture contains thoughts, but they're not associated with tool calls. 
+ }, + } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - t.Parallel() + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() - ctx, cancel := context.WithTimeout(t.Context(), time.Second*30) - t.Cleanup(cancel) + ctx, cancel := context.WithTimeout(t.Context(), time.Second*30) + t.Cleanup(cancel) - fix := fixtures.Parse(t, tc.fixture) - upstream := newMockUpstream(t, ctx, newFixtureResponse(fix)) + fix := fixtures.Parse(t, tc.fixture) + upstream := newMockUpstream(t, ctx, newFixtureResponse(fix)) - bridgeServer := newBridgeTestServer(t, ctx, upstream.URL) + bridgeServer := newBridgeTestServer(t, ctx, upstream.URL) - reqBody, err := sjson.SetBytes(fix.Request(), "stream", tc.streaming) - require.NoError(t, err) - resp := bridgeServer.makeRequest(t, http.MethodPost, pathAnthropicMessages, reqBody) - require.Equal(t, http.StatusOK, resp.StatusCode) + reqBody, err := sjson.SetBytes(fix.Request(), "stream", tc.streaming) + require.NoError(t, err) + resp := bridgeServer.makeRequest(t, http.MethodPost, pathAnthropicMessages, reqBody) + require.Equal(t, http.StatusOK, resp.StatusCode) - if tc.streaming { - sp := aibridge.NewSSEParser() - require.NoError(t, sp.Parse(resp.Body)) - assert.Contains(t, sp.AllEvents(), "message_start") - assert.Contains(t, sp.AllEvents(), "message_stop") - } + if tc.streaming { + sp := aibridge.NewSSEParser() + require.NoError(t, sp.Parse(resp.Body)) + assert.Contains(t, sp.AllEvents(), "message_start") + assert.Contains(t, sp.AllEvents(), "message_stop") + } - // Verify tool usage was recorded with associated model thoughts. 
- toolUsages := bridgeServer.Recorder.RecordedToolUsages() + toolUsages := bridgeServer.Recorder.RecordedToolUsages() + if tc.expectedThoughts == nil { + assert.Empty(t, toolUsages) + } else { require.Len(t, toolUsages, 1) assert.Equal(t, "Read", toolUsages[0].Tool) assert.Equal(t, tc.expectedToolCallID, toolUsages[0].ToolCallID) @@ -201,40 +205,11 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { for i, expected := range tc.expectedThoughts { assert.Contains(t, toolUsages[0].ModelThoughts[i].Content, expected) } + } - bridgeServer.Recorder.VerifyAllInterceptionsEnded(t) - }) - } - }) - - t.Run("no thoughts without tool calls", func(t *testing.T) { - t.Parallel() - - ctx, cancel := context.WithTimeout(t.Context(), time.Second*30) - t.Cleanup(cancel) - - // Use the simple fixture which has no tool calls — any thinking blocks - // should not be persisted since they can't be associated with a tool call. - fix := fixtures.Parse(t, fixtures.AntSimple) - upstream := newMockUpstream(t, ctx, newFixtureResponse(fix)) - - bridgeServer := newBridgeTestServer(t, ctx, upstream.URL) - - reqBody, err := sjson.SetBytes(fix.Request(), "stream", true) - require.NoError(t, err) - resp := bridgeServer.makeRequest(t, http.MethodPost, pathAnthropicMessages, reqBody) - require.Equal(t, http.StatusOK, resp.StatusCode) - - sp := aibridge.NewSSEParser() - require.NoError(t, sp.Parse(resp.Body)) - - // No tool usages (and therefore no thoughts) should be recorded - // when there are no tool calls. 
- toolUsages := bridgeServer.Recorder.RecordedToolUsages() - assert.Empty(t, toolUsages) - - bridgeServer.Recorder.VerifyAllInterceptionsEnded(t) - }) + bridgeServer.Recorder.VerifyAllInterceptionsEnded(t) + }) + } } func TestAWSBedrockIntegration(t *testing.T) { diff --git a/internal/integrationtest/responses_test.go b/internal/integrationtest/responses_test.go index 3a1519db..2cee005a 100644 --- a/internal/integrationtest/responses_test.go +++ b/internal/integrationtest/responses_test.go @@ -3,6 +3,7 @@ package integrationtest import ( "context" "encoding/json" + "fmt" "io" "net" "net/http" @@ -939,61 +940,64 @@ func TestResponsesInjectedTool(t *testing.T) { func TestResponsesModelThoughts(t *testing.T) { t.Parallel() - t.Run("reasoning captured with builtin tool", func(t *testing.T) { - t.Parallel() - - cases := []struct { - name string - fixture []byte - expectedToolCallID string - expectedThoughts []string - }{ - { - name: "single reasoning/blocking", - fixture: fixtures.OaiResponsesBlockingSingleBuiltinTool, - expectedToolCallID: "call_CJSaa2u51JG996575oVljuNq", - expectedThoughts: []string{"The user wants to add 3 and 5"}, - }, - { - name: "single reasoning/streaming", - fixture: fixtures.OaiResponsesStreamingBuiltinTool, - expectedToolCallID: "call_7VaiUXZYuuuwWwviCrckxq6t", - expectedThoughts: []string{"The user wants to add 3 and 5"}, - }, - { - name: "multiple reasoning items/blocking", - fixture: fixtures.OaiResponsesBlockingMultiReasoningBuiltinTool, - expectedToolCallID: "call_CJSaa2u51JG996575oVljuNq", - expectedThoughts: []string{"The user wants to add 3 and 5", "After adding, I will check if the result is prime"}, - }, - { - name: "multiple reasoning items/streaming", - fixture: fixtures.OaiResponsesStreamingMultiReasoningBuiltinTool, - expectedToolCallID: "call_7VaiUXZYuuuwWwviCrckxq6t", - expectedThoughts: []string{"The user wants to add 3 and 5", "After adding, I will check if the result is prime"}, - }, - } + cases := []struct { + name string + 
fixture []byte + expectedToolCallID string + expectedThoughts []string // nil means no tool usages expected at all + }{ + { + name: "single reasoning/blocking", + fixture: fixtures.OaiResponsesBlockingSingleBuiltinTool, + expectedToolCallID: "call_CJSaa2u51JG996575oVljuNq", + expectedThoughts: []string{"The user wants to add 3 and 5"}, + }, + { + name: "single reasoning/streaming", + fixture: fixtures.OaiResponsesStreamingBuiltinTool, + expectedToolCallID: "call_7VaiUXZYuuuwWwviCrckxq6t", + expectedThoughts: []string{"The user wants to add 3 and 5"}, + }, + { + name: "multiple reasoning items/blocking", + fixture: fixtures.OaiResponsesBlockingMultiReasoningBuiltinTool, + expectedToolCallID: "call_CJSaa2u51JG996575oVljuNq", + expectedThoughts: []string{"The user wants to add 3 and 5", "After adding, I will check if the result is prime"}, + }, + { + name: "multiple reasoning items/streaming", + fixture: fixtures.OaiResponsesStreamingMultiReasoningBuiltinTool, + expectedToolCallID: "call_7VaiUXZYuuuwWwviCrckxq6t", + expectedThoughts: []string{"The user wants to add 3 and 5", "After adding, I will check if the result is prime"}, + }, + { + name: "no thoughts without tool calls", + fixture: fixtures.OaiResponsesStreamingCodex, // This fixture contains reasoning, but it's not associated with tool calls. 
+ }, + } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - t.Parallel() + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() - ctx, cancel := context.WithTimeout(t.Context(), time.Second*30) - t.Cleanup(cancel) + ctx, cancel := context.WithTimeout(t.Context(), time.Second*30) + t.Cleanup(cancel) - fix := fixtures.Parse(t, tc.fixture) - upstream := newMockUpstream(t, ctx, newFixtureResponse(fix)) + fix := fixtures.Parse(t, tc.fixture) + upstream := newMockUpstream(t, ctx, newFixtureResponse(fix)) - bridgeServer := newBridgeTestServer(t, ctx, upstream.URL) + bridgeServer := newBridgeTestServer(t, ctx, upstream.URL) - resp := bridgeServer.makeRequest(t, http.MethodPost, pathOpenAIResponses, fix.Request()) - require.Equal(t, http.StatusOK, resp.StatusCode) + resp := bridgeServer.makeRequest(t, http.MethodPost, pathOpenAIResponses, fix.Request()) + require.Equal(t, http.StatusOK, resp.StatusCode) - _, err := io.ReadAll(resp.Body) - require.NoError(t, err) + _, err := io.ReadAll(resp.Body) + require.NoError(t, err) - // Verify tool usage was recorded with associated model thoughts. - toolUsages := bridgeServer.Recorder.RecordedToolUsages() + toolUsages := bridgeServer.Recorder.RecordedToolUsages() + if tc.expectedThoughts == nil { + require.Empty(t, toolUsages) + } else { require.Len(t, toolUsages, 1) require.Equal(t, "add", toolUsages[0].Tool) require.Equal(t, tc.expectedToolCallID, toolUsages[0].ToolCallID) @@ -1002,34 +1006,9 @@ func TestResponsesModelThoughts(t *testing.T) { for i, expected := range tc.expectedThoughts { require.Contains(t, toolUsages[0].ModelThoughts[i].Content, expected) } - }) - } - }) - - t.Run("no thoughts without tool calls", func(t *testing.T) { - t.Parallel() - - ctx, cancel := context.WithTimeout(t.Context(), time.Second*30) - t.Cleanup(cancel) - - // Use the simple fixture which has no tool calls — any reasoning - // should not be persisted since it can't be associated with a tool call. 
- fix := fixtures.Parse(t, fixtures.OaiResponsesStreamingCodex) - upstream := newMockUpstream(t, ctx, newFixtureResponse(fix)) - - bridgeServer := newBridgeTestServer(t, ctx, upstream.URL) - - resp := bridgeServer.makeRequest(t, http.MethodPost, pathOpenAIResponses, fix.Request()) - require.Equal(t, http.StatusOK, resp.StatusCode) - - _, err := io.ReadAll(resp.Body) - require.NoError(t, err) - - // No tool usages (and therefore no thoughts) should be recorded - // when there are no tool calls. - toolUsages := bridgeServer.Recorder.RecordedToolUsages() - require.Empty(t, toolUsages) - }) + } + }) + } } func requireResponsesError(t *testing.T, code int, message string, body []byte) { From 464670a5233d8c9c817683f785392cdf62ec86a8 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Thu, 12 Mar 2026 12:07:15 +0200 Subject: [PATCH 07/14] feat: capture commentary as model thoughts Signed-off-by: Danny Kopping --- fixtures/fixtures.go | 12 ++ .../blocking/commentary_builtin_tool.txtar | 139 +++++++++++++++++ .../summary_and_commentary_builtin_tool.txtar | 146 ++++++++++++++++++ .../streaming/commentary_builtin_tool.txtar | 80 ++++++++++ .../summary_and_commentary_builtin_tool.txtar | 94 +++++++++++ intercept/responses/base.go | 47 ++++-- internal/integrationtest/responses_test.go | 24 +++ 7 files changed, 530 insertions(+), 12 deletions(-) create mode 100644 fixtures/openai/responses/blocking/commentary_builtin_tool.txtar create mode 100644 fixtures/openai/responses/blocking/summary_and_commentary_builtin_tool.txtar create mode 100644 fixtures/openai/responses/streaming/commentary_builtin_tool.txtar create mode 100644 fixtures/openai/responses/streaming/summary_and_commentary_builtin_tool.txtar diff --git a/fixtures/fixtures.go b/fixtures/fixtures.go index cacf657c..06447a67 100644 --- a/fixtures/fixtures.go +++ b/fixtures/fixtures.go @@ -67,6 +67,12 @@ var ( //go:embed openai/responses/blocking/multi_reasoning_builtin_tool.txtar 
OaiResponsesBlockingMultiReasoningBuiltinTool []byte + //go:embed openai/responses/blocking/commentary_builtin_tool.txtar + OaiResponsesBlockingCommentaryBuiltinTool []byte + + //go:embed openai/responses/blocking/summary_and_commentary_builtin_tool.txtar + OaiResponsesBlockingSummaryAndCommentaryBuiltinTool []byte + //go:embed openai/responses/blocking/cached_input_tokens.txtar OaiResponsesBlockingCachedInputTokens []byte @@ -105,6 +111,12 @@ var ( //go:embed openai/responses/streaming/multi_reasoning_builtin_tool.txtar OaiResponsesStreamingMultiReasoningBuiltinTool []byte + //go:embed openai/responses/streaming/commentary_builtin_tool.txtar + OaiResponsesStreamingCommentaryBuiltinTool []byte + + //go:embed openai/responses/streaming/summary_and_commentary_builtin_tool.txtar + OaiResponsesStreamingSummaryAndCommentaryBuiltinTool []byte + //go:embed openai/responses/streaming/cached_input_tokens.txtar OaiResponsesStreamingCachedInputTokens []byte diff --git a/fixtures/openai/responses/blocking/commentary_builtin_tool.txtar b/fixtures/openai/responses/blocking/commentary_builtin_tool.txtar new file mode 100644 index 00000000..d0e83dd7 --- /dev/null +++ b/fixtures/openai/responses/blocking/commentary_builtin_tool.txtar @@ -0,0 +1,139 @@ +-- request -- +{ + "input": [ + { + "role": "user", + "content": "Is 3 + 5 a prime number? Use the add function to calculate the sum." 
+ } + ], + "model": "gpt-5.4", + "stream": false, + "tools": [ + { + "type": "function", + "name": "add", + "description": "Add two numbers together.", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ] + } + } + ] +} + +-- non-streaming -- +{ + "id": "resp_0aba2ac43dc240b30169b15720243c819ebb64977365d42cf5", + "object": "response", + "created_at": 1773229856, + "status": "completed", + "background": false, + "completed_at": 1773229861, + "error": null, + "incomplete_details": null, + "instructions": null, + "max_output_tokens": null, + "max_tool_calls": null, + "model": "gpt-5.4-2026-03-05", + "output": [ + { + "id": "rs_0aba2ac43dc240b30169b157208c88819e8238a91b5f7a919b", + "type": "reasoning", + "status": "completed", + "encrypted_content": "gAAAAA==", + "summary": [] + }, + { + "id": "msg_0aba2ac43dc240b30169b1572286d0819eb24b1d0f84c8fb3f", + "type": "message", + "status": "completed", + "content": [ + { + "type": "output_text", + "annotations": [], + "text": "Checking whether 3 + 5 is prime by calling the add function first." 
+ } + ], + "phase": "commentary", + "role": "assistant" + }, + { + "id": "fc_0aba2ac43dc240b30169b157255604819e8a108124efc1635c", + "type": "function_call", + "status": "completed", + "arguments": "{\"a\":3,\"b\":5}", + "call_id": "call_A8TkZmIcKtw2Zw952Wc5QVe7", + "name": "add" + } + ], + "parallel_tool_calls": true, + "previous_response_id": null, + "prompt_cache_key": null, + "prompt_cache_retention": null, + "reasoning": { + "effort": "xhigh", + "summary": null + }, + "safety_identifier": null, + "service_tier": "default", + "store": false, + "temperature": 1.0, + "text": { + "format": { + "type": "text" + }, + "verbosity": "low" + }, + "tool_choice": "auto", + "tools": [ + { + "type": "function", + "description": "Add two numbers together.", + "name": "add", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ], + "additionalProperties": false + }, + "strict": true + } + ], + "top_logprobs": 0, + "top_p": 0.98, + "truncation": "disabled", + "usage": { + "input_tokens": 58, + "input_tokens_details": { + "cached_tokens": 0 + }, + "output_tokens": 30, + "output_tokens_details": { + "reasoning_tokens": 10 + }, + "total_tokens": 88 + }, + "user": null, + "metadata": {} +} diff --git a/fixtures/openai/responses/blocking/summary_and_commentary_builtin_tool.txtar b/fixtures/openai/responses/blocking/summary_and_commentary_builtin_tool.txtar new file mode 100644 index 00000000..15082c36 --- /dev/null +++ b/fixtures/openai/responses/blocking/summary_and_commentary_builtin_tool.txtar @@ -0,0 +1,146 @@ +Both a reasoning summary and a commentary message before a function_call. + +-- request -- +{ + "input": [ + { + "role": "user", + "content": "Is 3 + 5 a prime number? Use the add function to calculate the sum." 
+ } + ], + "model": "gpt-5.4", + "stream": false, + "tools": [ + { + "type": "function", + "name": "add", + "description": "Add two numbers together.", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ] + } + } + ] +} + +-- non-streaming -- +{ + "id": "resp_1bba3bc54ed351c41270c26831354d920fcc75088476e53de6", + "object": "response", + "created_at": 1773229900, + "status": "completed", + "background": false, + "completed_at": 1773229905, + "error": null, + "incomplete_details": null, + "instructions": null, + "max_output_tokens": null, + "max_tool_calls": null, + "model": "gpt-5.4-2026-03-05", + "output": [ + { + "id": "rs_1bba3bc54ed351c41270c26831908d920fcc75088476e53de6", + "type": "reasoning", + "status": "completed", + "encrypted_content": "gAAAAA==", + "summary": [ + { + "type": "summary_text", + "text": "I need to add 3 and 5 to check primality." + } + ] + }, + { + "id": "msg_1bba3bc54ed351c41270c26831a09d920fdd86199587f64ef7", + "type": "message", + "status": "completed", + "content": [ + { + "type": "output_text", + "annotations": [], + "text": "Let me calculate the sum first using the add function." 
+ } + ], + "phase": "commentary", + "role": "assistant" + }, + { + "id": "fc_1bba3bc54ed351c41270c26831b0ad920fee97200698074f08", + "type": "function_call", + "status": "completed", + "arguments": "{\"a\":3,\"b\":5}", + "call_id": "call_B9UjYX01Lvvv1XwjDsdmRW3f", + "name": "add" + } + ], + "parallel_tool_calls": true, + "previous_response_id": null, + "prompt_cache_key": null, + "prompt_cache_retention": null, + "reasoning": { + "effort": "xhigh", + "summary": null + }, + "safety_identifier": null, + "service_tier": "default", + "store": false, + "temperature": 1.0, + "text": { + "format": { + "type": "text" + }, + "verbosity": "low" + }, + "tool_choice": "auto", + "tools": [ + { + "type": "function", + "description": "Add two numbers together.", + "name": "add", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ], + "additionalProperties": false + }, + "strict": true + } + ], + "top_logprobs": 0, + "top_p": 0.98, + "truncation": "disabled", + "usage": { + "input_tokens": 58, + "input_tokens_details": { + "cached_tokens": 0 + }, + "output_tokens": 35, + "output_tokens_details": { + "reasoning_tokens": 10 + }, + "total_tokens": 93 + }, + "user": null, + "metadata": {} +} diff --git a/fixtures/openai/responses/streaming/commentary_builtin_tool.txtar b/fixtures/openai/responses/streaming/commentary_builtin_tool.txtar new file mode 100644 index 00000000..2f090f62 --- /dev/null +++ b/fixtures/openai/responses/streaming/commentary_builtin_tool.txtar @@ -0,0 +1,80 @@ +-- request -- +{ + "input": [ + { + "role": "user", + "content": "Is 3 + 5 a prime number? Use the add function to calculate the sum." 
+ } + ], + "model": "gpt-5.4", + "stream": true, + "tools": [ + { + "type": "function", + "name": "add", + "description": "Add two numbers together.", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ] + } + } + ] +} + +-- streaming -- +event: response.created +data: {"type":"response.created","response":{"id":"resp_0aba2ac43dc240b30169b15720243c819ebb64977365d42cf5","object":"response","created_at":1773229856,"status":"in_progress","background":false,"completed_at":null,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-5.4-2026-03-05","output":[],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":"xhigh","summary":null},"safety_identifier":null,"service_tier":"default","store":false,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"low"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":0.98,"truncation":"disabled","usage":null,"user":null,"metadata":{}},"sequence_number":0} + +event: response.in_progress +data: 
{"type":"response.in_progress","response":{"id":"resp_0aba2ac43dc240b30169b15720243c819ebb64977365d42cf5","object":"response","created_at":1773229856,"status":"in_progress","background":false,"completed_at":null,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-5.4-2026-03-05","output":[],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":"xhigh","summary":null},"safety_identifier":null,"service_tier":"default","store":false,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"low"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":0.98,"truncation":"disabled","usage":null,"user":null,"metadata":{}},"sequence_number":1} + +event: response.output_item.added +data: {"type":"response.output_item.added","item":{"id":"rs_0aba2ac43dc240b30169b157208c88819e8238a91b5f7a919b","type":"reasoning","status":"in_progress","summary":[]},"output_index":0,"sequence_number":2} + +event: response.output_item.done +data: {"type":"response.output_item.done","item":{"id":"rs_0aba2ac43dc240b30169b157208c88819e8238a91b5f7a919b","type":"reasoning","status":"completed","encrypted_content":"gAAAAA==","summary":[]},"output_index":0,"sequence_number":3} + +event: response.output_item.added +data: {"type":"response.output_item.added","item":{"id":"msg_0aba2ac43dc240b30169b1572286d0819eb24b1d0f84c8fb3f","type":"message","status":"in_progress","content":[],"phase":"commentary","role":"assistant"},"output_index":1,"sequence_number":4} + +event: response.content_part.added +data: 
{"type":"response.content_part.added","item_id":"msg_0aba2ac43dc240b30169b1572286d0819eb24b1d0f84c8fb3f","output_index":1,"content_index":0,"part":{"type":"output_text","text":"","annotations":[]},"sequence_number":5} + +event: response.output_text.delta +data: {"type":"response.output_text.delta","item_id":"msg_0aba2ac43dc240b30169b1572286d0819eb24b1d0f84c8fb3f","output_index":1,"content_index":0,"delta":"Checking whether 3 + 5 is prime by calling the add function first.","sequence_number":6} + +event: response.output_text.done +data: {"type":"response.output_text.done","item_id":"msg_0aba2ac43dc240b30169b1572286d0819eb24b1d0f84c8fb3f","output_index":1,"content_index":0,"text":"Checking whether 3 + 5 is prime by calling the add function first.","sequence_number":7} + +event: response.content_part.done +data: {"type":"response.content_part.done","item_id":"msg_0aba2ac43dc240b30169b1572286d0819eb24b1d0f84c8fb3f","output_index":1,"content_index":0,"part":{"type":"output_text","text":"Checking whether 3 + 5 is prime by calling the add function first.","annotations":[]},"sequence_number":8} + +event: response.output_item.done +data: {"type":"response.output_item.done","item":{"id":"msg_0aba2ac43dc240b30169b1572286d0819eb24b1d0f84c8fb3f","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"text":"Checking whether 3 + 5 is prime by calling the add function first."}],"phase":"commentary","role":"assistant"},"output_index":1,"sequence_number":9} + +event: response.output_item.added +data: {"type":"response.output_item.added","item":{"id":"fc_0aba2ac43dc240b30169b157255604819e8a108124efc1635c","type":"function_call","status":"in_progress","arguments":"","call_id":"call_A8TkZmIcKtw2Zw952Wc5QVe7","name":"add"},"output_index":2,"sequence_number":10} + +event: response.function_call_arguments.delta +data: 
{"type":"response.function_call_arguments.delta","delta":"{\"a\":3,\"b\":5}","item_id":"fc_0aba2ac43dc240b30169b157255604819e8a108124efc1635c","output_index":2,"sequence_number":11} + +event: response.function_call_arguments.done +data: {"type":"response.function_call_arguments.done","arguments":"{\"a\":3,\"b\":5}","item_id":"fc_0aba2ac43dc240b30169b157255604819e8a108124efc1635c","output_index":2,"sequence_number":12} + +event: response.output_item.done +data: {"type":"response.output_item.done","item":{"id":"fc_0aba2ac43dc240b30169b157255604819e8a108124efc1635c","type":"function_call","status":"completed","arguments":"{\"a\":3,\"b\":5}","call_id":"call_A8TkZmIcKtw2Zw952Wc5QVe7","name":"add"},"output_index":2,"sequence_number":13} + +event: response.completed +data: {"type":"response.completed","response":{"id":"resp_0aba2ac43dc240b30169b15720243c819ebb64977365d42cf5","object":"response","created_at":1773229856,"status":"completed","background":false,"completed_at":1773229861,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-5.4-2026-03-05","output":[{"id":"rs_0aba2ac43dc240b30169b157208c88819e8238a91b5f7a919b","type":"reasoning","status":"completed","encrypted_content":"gAAAAA==","summary":[]},{"id":"msg_0aba2ac43dc240b30169b1572286d0819eb24b1d0f84c8fb3f","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"text":"Checking whether 3 + 5 is prime by calling the add function 
first."}],"phase":"commentary","role":"assistant"},{"id":"fc_0aba2ac43dc240b30169b157255604819e8a108124efc1635c","type":"function_call","status":"completed","arguments":"{\"a\":3,\"b\":5}","call_id":"call_A8TkZmIcKtw2Zw952Wc5QVe7","name":"add"}],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":"xhigh","summary":null},"safety_identifier":null,"service_tier":"default","store":false,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"low"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":0.98,"truncation":"disabled","usage":{"input_tokens":58,"input_tokens_details":{"cached_tokens":0},"output_tokens":30,"output_tokens_details":{"reasoning_tokens":10},"total_tokens":88},"user":null,"metadata":{}},"sequence_number":14} + diff --git a/fixtures/openai/responses/streaming/summary_and_commentary_builtin_tool.txtar b/fixtures/openai/responses/streaming/summary_and_commentary_builtin_tool.txtar new file mode 100644 index 00000000..172b0065 --- /dev/null +++ b/fixtures/openai/responses/streaming/summary_and_commentary_builtin_tool.txtar @@ -0,0 +1,94 @@ +Both a reasoning summary and a commentary message before a function_call. + +-- request -- +{ + "input": [ + { + "role": "user", + "content": "Is 3 + 5 a prime number? Use the add function to calculate the sum." 
+ } + ], + "model": "gpt-5.4", + "stream": true, + "tools": [ + { + "type": "function", + "name": "add", + "description": "Add two numbers together.", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ] + } + } + ] +} + +-- streaming -- +event: response.created +data: {"type":"response.created","response":{"id":"resp_1bba3bc54ed351c41270c26831354d920fcc75088476e53de6","object":"response","created_at":1773229900,"status":"in_progress","background":false,"completed_at":null,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-5.4-2026-03-05","output":[],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":"xhigh","summary":null},"safety_identifier":null,"service_tier":"default","store":false,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"low"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":0.98,"truncation":"disabled","usage":null,"user":null,"metadata":{}},"sequence_number":0} + +event: response.in_progress +data: 
{"type":"response.in_progress","response":{"id":"resp_1bba3bc54ed351c41270c26831354d920fcc75088476e53de6","object":"response","created_at":1773229900,"status":"in_progress","background":false,"completed_at":null,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-5.4-2026-03-05","output":[],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":"xhigh","summary":null},"safety_identifier":null,"service_tier":"default","store":false,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"low"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":0.98,"truncation":"disabled","usage":null,"user":null,"metadata":{}},"sequence_number":1} + +event: response.output_item.added +data: {"type":"response.output_item.added","item":{"id":"rs_1bba3bc54ed351c41270c26831908d920fcc75088476e53de6","type":"reasoning","status":"in_progress","summary":[]},"output_index":0,"sequence_number":2} + +event: response.reasoning_summary_part.added +data: {"type":"response.reasoning_summary_part.added","item_id":"rs_1bba3bc54ed351c41270c26831908d920fcc75088476e53de6","output_index":0,"part":{"type":"summary_text","text":""},"summary_index":0,"sequence_number":3} + +event: response.reasoning_summary_text.delta +data: {"type":"response.reasoning_summary_text.delta","item_id":"rs_1bba3bc54ed351c41270c26831908d920fcc75088476e53de6","output_index":0,"summary_index":0,"delta":"I need to add 3 and 5 to check primality.","sequence_number":4} + +event: response.reasoning_summary_text.done +data: 
{"type":"response.reasoning_summary_text.done","item_id":"rs_1bba3bc54ed351c41270c26831908d920fcc75088476e53de6","output_index":0,"summary_index":0,"text":"I need to add 3 and 5 to check primality.","sequence_number":5} + +event: response.reasoning_summary_part.done +data: {"type":"response.reasoning_summary_part.done","item_id":"rs_1bba3bc54ed351c41270c26831908d920fcc75088476e53de6","output_index":0,"part":{"type":"summary_text","text":"I need to add 3 and 5 to check primality."},"summary_index":0,"sequence_number":6} + +event: response.output_item.done +data: {"type":"response.output_item.done","item":{"id":"rs_1bba3bc54ed351c41270c26831908d920fcc75088476e53de6","type":"reasoning","status":"completed","encrypted_content":"gAAAAA==","summary":[{"type":"summary_text","text":"I need to add 3 and 5 to check primality."}]},"output_index":0,"sequence_number":7} + +event: response.output_item.added +data: {"type":"response.output_item.added","item":{"id":"msg_1bba3bc54ed351c41270c26831a09d920fdd86199587f64ef7","type":"message","status":"in_progress","content":[],"phase":"commentary","role":"assistant"},"output_index":1,"sequence_number":8} + +event: response.content_part.added +data: {"type":"response.content_part.added","item_id":"msg_1bba3bc54ed351c41270c26831a09d920fdd86199587f64ef7","output_index":1,"content_index":0,"part":{"type":"output_text","text":"","annotations":[]},"sequence_number":9} + +event: response.output_text.delta +data: {"type":"response.output_text.delta","item_id":"msg_1bba3bc54ed351c41270c26831a09d920fdd86199587f64ef7","output_index":1,"content_index":0,"delta":"Let me calculate the sum first using the add function.","sequence_number":10} + +event: response.output_text.done +data: {"type":"response.output_text.done","item_id":"msg_1bba3bc54ed351c41270c26831a09d920fdd86199587f64ef7","output_index":1,"content_index":0,"text":"Let me calculate the sum first using the add function.","sequence_number":11} + +event: response.content_part.done +data: 
{"type":"response.content_part.done","item_id":"msg_1bba3bc54ed351c41270c26831a09d920fdd86199587f64ef7","output_index":1,"content_index":0,"part":{"type":"output_text","text":"Let me calculate the sum first using the add function.","annotations":[]},"sequence_number":12} + +event: response.output_item.done +data: {"type":"response.output_item.done","item":{"id":"msg_1bba3bc54ed351c41270c26831a09d920fdd86199587f64ef7","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"text":"Let me calculate the sum first using the add function."}],"phase":"commentary","role":"assistant"},"output_index":1,"sequence_number":13} + +event: response.output_item.added +data: {"type":"response.output_item.added","item":{"id":"fc_1bba3bc54ed351c41270c26831b0ad920fee97200698074f08","type":"function_call","status":"in_progress","arguments":"","call_id":"call_B9UjYX01Lvvv1XwjDsdmRW3f","name":"add"},"output_index":2,"sequence_number":14} + +event: response.function_call_arguments.delta +data: {"type":"response.function_call_arguments.delta","delta":"{\"a\":3,\"b\":5}","item_id":"fc_1bba3bc54ed351c41270c26831b0ad920fee97200698074f08","output_index":2,"sequence_number":15} + +event: response.function_call_arguments.done +data: {"type":"response.function_call_arguments.done","arguments":"{\"a\":3,\"b\":5}","item_id":"fc_1bba3bc54ed351c41270c26831b0ad920fee97200698074f08","output_index":2,"sequence_number":16} + +event: response.output_item.done +data: {"type":"response.output_item.done","item":{"id":"fc_1bba3bc54ed351c41270c26831b0ad920fee97200698074f08","type":"function_call","status":"completed","arguments":"{\"a\":3,\"b\":5}","call_id":"call_B9UjYX01Lvvv1XwjDsdmRW3f","name":"add"},"output_index":2,"sequence_number":17} + +event: response.completed +data: 
{"type":"response.completed","response":{"id":"resp_1bba3bc54ed351c41270c26831354d920fcc75088476e53de6","object":"response","created_at":1773229900,"status":"completed","background":false,"completed_at":1773229905,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-5.4-2026-03-05","output":[{"id":"rs_1bba3bc54ed351c41270c26831908d920fcc75088476e53de6","type":"reasoning","status":"completed","encrypted_content":"gAAAAA==","summary":[{"type":"summary_text","text":"I need to add 3 and 5 to check primality."}]},{"id":"msg_1bba3bc54ed351c41270c26831a09d920fdd86199587f64ef7","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"text":"Let me calculate the sum first using the add function."}],"phase":"commentary","role":"assistant"},{"id":"fc_1bba3bc54ed351c41270c26831b0ad920fee97200698074f08","type":"function_call","status":"completed","arguments":"{\"a\":3,\"b\":5}","call_id":"call_B9UjYX01Lvvv1XwjDsdmRW3f","name":"add"}],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":"xhigh","summary":null},"safety_identifier":null,"service_tier":"default","store":false,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"low"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":0.98,"truncation":"disabled","usage":{"input_tokens":58,"input_tokens_details":{"cached_tokens":0},"output_tokens":35,"output_tokens_details":{"reasoning_tokens":10},"total_tokens":93},"user":null,"metadata":{}},"sequence_number":18} + diff --git a/intercept/responses/base.go b/intercept/responses/base.go index 0a889bb5..c3550f30 100644 --- a/intercept/responses/base.go +++ 
b/intercept/responses/base.go @@ -333,8 +333,9 @@ func (i *responsesInterceptionBase) recordTokenUsage(ctx context.Context, respon } } -// extractModelThoughts extracts reasoning summary items from response output -// and converts them to ModelThoughtRecords for association with tool usage. +// extractModelThoughts extracts model thoughts from response output items. +// It captures both reasoning summary items and commentary messages (message +// output items with "phase": "commentary") as model thoughts. func (i *responsesInterceptionBase) extractModelThoughts(response *responses.Response) []*recorder.ModelThoughtRecord { if response == nil { return nil @@ -342,19 +343,41 @@ func (i *responsesInterceptionBase) extractModelThoughts(response *responses.Res var thoughts []*recorder.ModelThoughtRecord for _, item := range response.Output { - if item.Type != string(constant.ValueOf[constant.Reasoning]()) { - continue - } + switch item.Type { + case string(constant.ValueOf[constant.Reasoning]()): + reasoning := item.AsReasoning() + for _, summary := range reasoning.Summary { + if summary.Text == "" { + continue + } + thoughts = append(thoughts, &recorder.ModelThoughtRecord{ + Content: summary.Text, + CreatedAt: time.Now(), + }) + } - reasoning := item.AsReasoning() - for _, summary := range reasoning.Summary { - if summary.Text == "" { + case string(constant.ValueOf[constant.Message]()): + // The API sometimes returns commentary messages instead of reasoning + // summaries. These are assistant message output items with "phase": "commentary". + // The SDK doesn't expose a Phase field, so we extract it from raw JSON. 
+ raw := item.RawJSON() + if gjson.Get(raw, "role").String() != string(constant.ValueOf[constant.Assistant]()) || + gjson.Get(raw, "phase").String() != "commentary" { continue } - thoughts = append(thoughts, &recorder.ModelThoughtRecord{ - Content: summary.Text, - CreatedAt: time.Now(), - }) + msg := item.AsMessage() + for _, part := range msg.Content { + if part.Type != string(constant.ValueOf[constant.OutputText]()) { + continue + } + if part.Text == "" { + continue + } + thoughts = append(thoughts, &recorder.ModelThoughtRecord{ + Content: part.Text, + CreatedAt: time.Now(), + }) + } } } diff --git a/internal/integrationtest/responses_test.go b/internal/integrationtest/responses_test.go index 2cee005a..1aceaacf 100644 --- a/internal/integrationtest/responses_test.go +++ b/internal/integrationtest/responses_test.go @@ -970,6 +970,30 @@ func TestResponsesModelThoughts(t *testing.T) { expectedToolCallID: "call_7VaiUXZYuuuwWwviCrckxq6t", expectedThoughts: []string{"The user wants to add 3 and 5", "After adding, I will check if the result is prime"}, }, + { + name: "commentary/blocking", + fixture: fixtures.OaiResponsesBlockingCommentaryBuiltinTool, + expectedToolCallID: "call_A8TkZmIcKtw2Zw952Wc5QVe7", + expectedThoughts: []string{"Checking whether 3 + 5 is prime by calling the add function first."}, + }, + { + name: "commentary/streaming", + fixture: fixtures.OaiResponsesStreamingCommentaryBuiltinTool, + expectedToolCallID: "call_A8TkZmIcKtw2Zw952Wc5QVe7", + expectedThoughts: []string{"Checking whether 3 + 5 is prime by calling the add function first."}, + }, + { + name: "summary and commentary/blocking", + fixture: fixtures.OaiResponsesBlockingSummaryAndCommentaryBuiltinTool, + expectedToolCallID: "call_B9UjYX01Lvvv1XwjDsdmRW3f", + expectedThoughts: []string{"I need to add 3 and 5 to check primality.", "Let me calculate the sum first using the add function."}, + }, + { + name: "summary and commentary/streaming", + fixture: 
fixtures.OaiResponsesStreamingSummaryAndCommentaryBuiltinTool, + expectedToolCallID: "call_B9UjYX01Lvvv1XwjDsdmRW3f", + expectedThoughts: []string{"I need to add 3 and 5 to check primality.", "Let me calculate the sum first using the add function."}, + }, { name: "no thoughts without tool calls", fixture: fixtures.OaiResponsesStreamingCodex, // This fixture contains reasoning, but it's not associated with tool calls. From bf7d2bc5a91cef2b20ebc9e5784a13173b7f2b5b Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Thu, 12 Mar 2026 12:20:35 +0200 Subject: [PATCH 08/14] chore: cleaning up /v1/messages impl Signed-off-by: Danny Kopping --- intercept/messages/base.go | 20 ++++++++++++++++++++ intercept/messages/blocking.go | 14 +------------- intercept/messages/streaming.go | 23 ++++++++++------------- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/intercept/messages/base.go b/intercept/messages/base.go index 37522380..58ac23a9 100644 --- a/intercept/messages/base.go +++ b/intercept/messages/base.go @@ -168,6 +168,26 @@ func (i *interceptionBase) disableParallelToolCalls() { } } +// extractModelThoughts returns any thinking blocks that were returned in the response. +func (i *interceptionBase) extractModelThoughts(msg *anthropic.Message) []*recorder.ModelThoughtRecord { + if msg == nil { + return nil + } + + var thoughtRecords []*recorder.ModelThoughtRecord + for _, block := range msg.Content { + switch variant := block.AsAny().(type) { + case anthropic.ThinkingBlock: + thoughtRecords = append(thoughtRecords, &recorder.ModelThoughtRecord{ + Content: variant.Thinking, + CreatedAt: time.Now(), + }) + } + // anthropic.RedactedThinkingBlock also exists, but there's nothing useful we can capture. + } + return thoughtRecords +} + // IsSmallFastModel checks if the model is a small/fast model (Haiku 3.5). // These models are optimized for tasks like code autocomplete and other small, quick operations. 
// See `ANTHROPIC_SMALL_FAST_MODEL`: https://docs.anthropic.com/en/docs/claude-code/settings#environment-variables diff --git a/intercept/messages/blocking.go b/intercept/messages/blocking.go index 4ba71874..e2bed379 100644 --- a/intercept/messages/blocking.go +++ b/intercept/messages/blocking.go @@ -136,19 +136,7 @@ func (i *BlockingInterception) ProcessRequest(w http.ResponseWriter, r *http.Req accumulateUsage(&cumulativeUsage, resp.Usage) // Capture any thinking blocks that were returned. - var thoughtRecords []*recorder.ModelThoughtRecord - for _, block := range resp.Content { - switch variant := block.AsAny().(type) { - case anthropic.ThinkingBlock: - thoughtRecords = append(thoughtRecords, &recorder.ModelThoughtRecord{ - Content: variant.Thinking, - CreatedAt: time.Now(), - }) - case anthropic.RedactedThinkingBlock: - // For redacted thinking, there's nothing useful we can capture. - continue - } - } + thoughtRecords := i.extractModelThoughts(resp) // Handle tool calls for non-streaming. var pendingToolCalls []anthropic.ToolUseBlock diff --git a/intercept/messages/streaming.go b/intercept/messages/streaming.go index 949401f9..6c2545c9 100644 --- a/intercept/messages/streaming.go +++ b/intercept/messages/streaming.go @@ -254,20 +254,9 @@ newStream: case string(constant.ValueOf[constant.MessageStop]()): // Capture any thinking blocks that were returned. - var thoughtRecords []*recorder.ModelThoughtRecord - for _, block := range message.Content { - switch variant := block.AsAny().(type) { - case anthropic.ThinkingBlock: - thoughtRecords = append(thoughtRecords, &recorder.ModelThoughtRecord{ - Content: variant.Thinking, - CreatedAt: time.Now(), - }) - case anthropic.RedactedThinkingBlock: - // For redacted thinking, there's nothing useful we can capture. 
- continue
- }
- }
+ thoughtRecords := i.extractModelThoughts(&message)
+
+ // Process injected tool
 if len(pendingToolCalls) > 0 {
 // Append the whole message from this stream as context since we'll be sending a new request with the tool results.
 messages.Messages = append(messages.Messages, message.ToParam())
@@ -322,8 +311,12 @@ newStream:
 InvocationError: err,
 ModelThoughts: thoughtRecords,
 })
+
 // Clear after first use to avoid duplicating across
 // multiple tool calls in the same message.
+ //
+ // This is not strictly needed for injected tools since we disable parallel tool calls,
+ // but just adding this here for defensiveness.
 thoughtRecords = nil

 if err != nil {
@@ -438,8 +431,12 @@ newStream:
 Injected: false,
 ModelThoughts: thoughtRecords,
 })
+
 // Clear after first use to avoid duplicating across
 // multiple tool calls in the same message.
+ //
+ // This effectively means that in the case of parallel tool calls
+ // the thoughts will only be associated to the first tool use which is fine.
 thoughtRecords = nil
 }
 }

From 424382d620816146e631d72eda0bb396b942d107 Mon Sep 17 00:00:00 2001
From: Danny Kopping
Date: Thu, 12 Mar 2026 12:39:31 +0200
Subject: [PATCH 09/14] chore: update comments

Signed-off-by: Danny Kopping
---
 intercept/messages/blocking.go        | 4 ++++
 intercept/responses/base.go           | 4 ++++
 intercept/responses/injected_tools.go | 4 ++++
 3 files changed, 12 insertions(+)

diff --git a/intercept/messages/blocking.go b/intercept/messages/blocking.go
index e2bed379..b32f9e8d 100644
--- a/intercept/messages/blocking.go
+++ b/intercept/messages/blocking.go
@@ -161,8 +161,12 @@ func (i *BlockingInterception) ProcessRequest(w http.ResponseWriter, r *http.Req
 Injected: false,
 ModelThoughts: thoughtRecords,
 })
+
 // Clear after first use to avoid duplicating across
 // multiple tool calls in the same message.
+ //
+ // This effectively means that in the case of parallel tool calls
+ // the thoughts will only be associated to the first tool use which is fine.
 thoughtRecords = nil
 }

diff --git a/intercept/responses/base.go b/intercept/responses/base.go
index c3550f30..e61bc86c 100644
--- a/intercept/responses/base.go
+++ b/intercept/responses/base.go
@@ -287,8 +287,12 @@ func (i *responsesInterceptionBase) recordNonInjectedToolUsage(ctx context.Conte
 }); err != nil {
 i.logger.Warn(ctx, "failed to record tool usage", slog.Error(err), slog.F("tool", item.Name))
 }
+
 // Clear after first use to avoid duplicating across
 // multiple tool calls in the same message.
+ //
+ // This effectively means that in the case of parallel tool calls
+ // the thoughts will only be associated to the first tool use which is fine.
 thoughtRecords = nil
 }
 }
diff --git a/intercept/responses/injected_tools.go b/intercept/responses/injected_tools.go
index db81941f..7d95db8f 100644
--- a/intercept/responses/injected_tools.go
+++ b/intercept/responses/injected_tools.go
@@ -115,8 +115,12 @@ func (i *responsesInterceptionBase) handleInjectedToolCalls(ctx context.Context,
 var results []responses.ResponseInputItemUnionParam
 for _, fc := range pending {
 results = append(results, i.invokeInjectedTool(ctx, response.ID, fc, thoughtRecords))
+
 // Clear after first use to avoid duplicating across
 // multiple tool calls in the same message.
+ //
+ // This is not strictly needed for injected tools since we disable parallel tool calls,
+ // but just adding this here for defensiveness.
thoughtRecords = nil } From 7511cc42e2dddd927a24ee9619b1909b4280f659 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Thu, 12 Mar 2026 14:09:13 +0200 Subject: [PATCH 10/14] chore: add tests for parallel tool calls Signed-off-by: Danny Kopping --- .../multi_thinking_builtin_tool.txtar | 16 ++ .../single_builtin_tool_parallel.txtar | 175 ++++++++++++++++++ fixtures/fixtures.go | 9 + .../single_builtin_tool_parallel.txtar | 140 ++++++++++++++ .../single_builtin_tool_parallel.txtar | 86 +++++++++ intercept/messages/blocking.go | 5 + intercept/messages/streaming.go | 7 +- intercept/responses/injected_tools.go | 5 +- internal/integrationtest/bridge_test.go | 79 ++++---- internal/integrationtest/responses_test.go | 95 +++++----- 10 files changed, 537 insertions(+), 80 deletions(-) create mode 100644 fixtures/anthropic/single_builtin_tool_parallel.txtar create mode 100644 fixtures/openai/responses/blocking/single_builtin_tool_parallel.txtar create mode 100644 fixtures/openai/responses/streaming/single_builtin_tool_parallel.txtar diff --git a/fixtures/anthropic/multi_thinking_builtin_tool.txtar b/fixtures/anthropic/multi_thinking_builtin_tool.txtar index 633d11d9..d27ad63f 100644 --- a/fixtures/anthropic/multi_thinking_builtin_tool.txtar +++ b/fixtures/anthropic/multi_thinking_builtin_tool.txtar @@ -5,6 +5,22 @@ This fixture has two thinking blocks before the tool_use block. 
{ "model": "claude-sonnet-4-20250514", "max_tokens": 1024, + "tools": [ + { + "name": "Read", + "description": "Read the contents of a file at the given path.", + "input_schema": { + "type": "object", + "properties": { + "file_path": { + "type": "string", + "description": "The absolute path to the file to read" + } + }, + "required": ["file_path"] + } + } + ], "messages": [ { "role": "user", diff --git a/fixtures/anthropic/single_builtin_tool_parallel.txtar b/fixtures/anthropic/single_builtin_tool_parallel.txtar new file mode 100644 index 00000000..9c53ed2c --- /dev/null +++ b/fixtures/anthropic/single_builtin_tool_parallel.txtar @@ -0,0 +1,175 @@ +Claude Code has builtin tools to (e.g.) explore the filesystem. +This fixture has a single thinking block followed by two parallel tool_use blocks. +The thinking should only be attributed to the first tool_use. + +-- request -- +{ + "model": "claude-sonnet-4-20250514", + "max_tokens": 1024, + "tools": [ + { + "name": "Read", + "description": "Read the contents of a file at the given path.", + "input_schema": { + "type": "object", + "properties": { + "file_path": { + "type": "string", + "description": "The absolute path to the file to read" + } + }, + "required": ["file_path"] + } + } + ], + "messages": [ + { + "role": "user", + "content": "read the foo and bar files" + } + ] +} + +-- streaming -- +event: message_start +data: {"type":"message_start","message":{"id":"msg_01ParallelToolStream","type":"message","role":"assistant","model":"claude-sonnet-4-20250514","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":2,"cache_creation_input_tokens":22,"cache_read_input_tokens":13993,"output_tokens":5,"service_tier":"standard"}} } + +event: content_block_start +data: {"type":"content_block_start","index":0,"content_block":{"type":"thinking","thinking":""}} + +event: content_block_delta +data: {"type":"content_block_delta","index":0,"delta":{"type":"thinking_delta","thinking":"The user wants me to read 
two files: \"foo\" and \"bar\". I'll read both of them."}} + +event: content_block_delta +data: {"type":"content_block_delta","index":0,"delta":{"type":"signature_delta","signature":"Eu8BCkYICxgCKkBR++kFr7Za2JhF/9OCpjEc46/EcipL75RK+MEbxJ/VBJPWQTWrNGfwb5khWYJtKEpjjkH07cR/MQvThfb7t7CkEgwU4pKwL7NuZXd1/wgaDILyd0bYMqQovWo3dyIw95Ny7yZPljNBDLsvMBdBr7w+RtbU+AlSftjBuBZHp0VzI54/W+9u6f7qfx0JXsVBKldqqOjFvewT8Xm6Qp/77g6/j0zBiuAQABj/6vS1qATjd8KSIFDg9G/tCtzwmV/T/egmzswWd5CBiAhW6lgJgEDRr+gRUrFSOB7o3hypW8FUnUrr1JtzzwMYAQ=="}} + +event: content_block_stop +data: {"type":"content_block_stop","index":0} + +event: content_block_start +data: {"type":"content_block_start","index":1,"content_block":{"type":"tool_use","id":"toolu_01ParallelFirst000000000","name":"Read","input":{}}} + +event: content_block_delta +data: {"type":"content_block_delta","index":1,"delta":{"type":"input_json_delta","partial_json":""} } + +event: content_block_delta +data: {"type":"content_block_delta","index":1,"delta":{"type":"input_json_delta","partial_json":"{\"file_path\": \"/tmp/blah/foo"} } + +event: content_block_delta +data: {"type":"content_block_delta","index":1,"delta":{"type":"input_json_delta","partial_json":"\"}"} } + +event: content_block_stop +data: {"type":"content_block_stop","index":1 } + +event: content_block_start +data: {"type":"content_block_start","index":2,"content_block":{"type":"tool_use","id":"toolu_01ParallelSecond00000000","name":"Read","input":{}}} + +event: content_block_delta +data: {"type":"content_block_delta","index":2,"delta":{"type":"input_json_delta","partial_json":""} } + +event: content_block_delta +data: {"type":"content_block_delta","index":2,"delta":{"type":"input_json_delta","partial_json":"{\"file_path\": \"/tmp/blah/bar"} } + +event: content_block_delta +data: {"type":"content_block_delta","index":2,"delta":{"type":"input_json_delta","partial_json":"\"}"} } + +event: content_block_stop +data: {"type":"content_block_stop","index":2 } + +event: message_delta +data: 
{"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"output_tokens":72} } + +event: message_stop +data: {"type":"message_stop" } + + +-- non-streaming -- +{ + "id": "msg_01ParallelToolBlocking", + "container": { + "id": "", + "expires_at": "0001-01-01T00:00:00Z" + }, + "content": [ + { + "type": "thinking", + "thinking": "The user wants me to read two files: \"foo\" and \"bar\". I'll read both of them.", + "signature": "Eu8BCkYICxgCKkBR++kFr7Za2JhF/9OCpjEc46/EcipL75RK+MEbxJ/VBJPWQTWrNGfwb5khWYJtKEpjjkH07cR/MQvThfb7t7CkEgwU4pKwL7NuZXd1/wgaDILyd0bYMqQovWo3dyIw95Ny7yZPljNBDLsvMBdBr7w+RtbU+AlSftjBuBZHp0VzI54/W+9u6f7qfx0JXsVBKldqqOjFvewT8Xm6Qp/77g6/j0zBiuAQABj/6vS1qATjd8KSIFDg9G/tCtzwmV/T/egmzswWd5CBiAhW6lgJgEDRr+gRUrFSOB7o3hypW8FUnUrr1JtzzwMYAQ==" + }, + { + "citations": null, + "text": "", + "type": "tool_use", + "id": "toolu_01ParallelBlockFirst0000", + "input": { + "file_path": "/tmp/blah/foo" + }, + "name": "Read", + "content": { + "OfWebSearchResultBlockArray": null, + "OfString": "", + "OfMCPToolResultBlockContent": null, + "error_code": "", + "type": "", + "content": null, + "return_code": 0, + "stderr": "", + "stdout": "" + }, + "tool_use_id": "", + "server_name": "", + "is_error": false, + "file_id": "", + "signature": "", + "thinking": "", + "data": "" + }, + { + "citations": null, + "text": "", + "type": "tool_use", + "id": "toolu_01ParallelBlockSecond000", + "input": { + "file_path": "/tmp/blah/bar" + }, + "name": "Read", + "content": { + "OfWebSearchResultBlockArray": null, + "OfString": "", + "OfMCPToolResultBlockContent": null, + "error_code": "", + "type": "", + "content": null, + "return_code": 0, + "stderr": "", + "stdout": "" + }, + "tool_use_id": "", + "server_name": "", + "is_error": false, + "file_id": "", + "signature": "", + "thinking": "", + "data": "" + } + ], + "model": "claude-sonnet-4-20250514", + "role": "assistant", + "stop_reason": "tool_use", + "stop_sequence": "", + "type": "message", + "usage": { + 
"cache_creation": { + "ephemeral_1h_input_tokens": 0, + "ephemeral_5m_input_tokens": 0 + }, + "cache_creation_input_tokens": 0, + "cache_read_input_tokens": 23490, + "input_tokens": 5, + "output_tokens": 95, + "server_tool_use": { + "web_search_requests": 0 + }, + "service_tier": "standard" + } +} diff --git a/fixtures/fixtures.go b/fixtures/fixtures.go index 06447a67..8aaeef15 100644 --- a/fixtures/fixtures.go +++ b/fixtures/fixtures.go @@ -18,6 +18,9 @@ var ( //go:embed anthropic/multi_thinking_builtin_tool.txtar AntMultiThinkingBuiltinTool []byte + //go:embed anthropic/single_builtin_tool_parallel.txtar + AntSingleBuiltinToolParallel []byte + //go:embed anthropic/single_injected_tool.txtar AntSingleInjectedTool []byte @@ -88,6 +91,9 @@ var ( //go:embed openai/responses/blocking/prev_response_id.txtar OaiResponsesBlockingPrevResponseID []byte + //go:embed openai/responses/blocking/single_builtin_tool_parallel.txtar + OaiResponsesBlockingSingleBuiltinToolParallel []byte + //go:embed openai/responses/blocking/single_injected_tool.txtar OaiResponsesBlockingSingleInjectedTool []byte @@ -132,6 +138,9 @@ var ( //go:embed openai/responses/streaming/prev_response_id.txtar OaiResponsesStreamingPrevResponseID []byte + //go:embed openai/responses/streaming/single_builtin_tool_parallel.txtar + OaiResponsesStreamingSingleBuiltinToolParallel []byte + //go:embed openai/responses/streaming/single_injected_tool.txtar OaiResponsesStreamingSingleInjectedTool []byte diff --git a/fixtures/openai/responses/blocking/single_builtin_tool_parallel.txtar b/fixtures/openai/responses/blocking/single_builtin_tool_parallel.txtar new file mode 100644 index 00000000..4be0d240 --- /dev/null +++ b/fixtures/openai/responses/blocking/single_builtin_tool_parallel.txtar @@ -0,0 +1,140 @@ +-- request -- +{ + "input": [ + { + "role": "user", + "content": "Is 3 + 5 a prime number? Also add 10 + 20. Use the add function for both." 
+ } + ], + "model": "gpt-4.1", + "stream": false, + "tools": [ + { + "type": "function", + "name": "add", + "description": "Add two numbers together.", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ] + } + } + ] +} + +-- non-streaming -- +{ + "id": "resp_parallel_blocking_001", + "object": "response", + "created_at": 1767875133, + "status": "completed", + "background": false, + "billing": { + "payer": "developer" + }, + "completed_at": 1767875134, + "error": null, + "incomplete_details": null, + "instructions": null, + "max_output_tokens": null, + "max_tool_calls": null, + "model": "gpt-4.1-2025-04-14", + "output": [ + { + "id": "rs_parallel_blocking_reasoning_001", + "type": "reasoning", + "status": "completed", + "summary": [ + { + "type": "summary_text", + "text": "The user wants two additions: 3+5 and 10+20. I'll call add for both." + } + ] + }, + { + "id": "fc_parallel_blocking_first_001", + "type": "function_call", + "status": "completed", + "arguments": "{\"a\":3,\"b\":5}", + "call_id": "call_ParallelBlockingFirst001", + "name": "add" + }, + { + "id": "fc_parallel_blocking_second_001", + "type": "function_call", + "status": "completed", + "arguments": "{\"a\":10,\"b\":20}", + "call_id": "call_ParallelBlockingSecond01", + "name": "add" + } + ], + "parallel_tool_calls": true, + "previous_response_id": null, + "prompt_cache_key": null, + "prompt_cache_retention": null, + "reasoning": { + "effort": null, + "summary": null + }, + "safety_identifier": null, + "service_tier": "default", + "store": true, + "temperature": 1.0, + "text": { + "format": { + "type": "text" + }, + "verbosity": "medium" + }, + "tool_choice": "auto", + "tools": [ + { + "type": "function", + "description": "Add two numbers together.", + "name": "add", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + 
"required": [ + "a", + "b" + ], + "additionalProperties": false + }, + "strict": true + } + ], + "top_logprobs": 0, + "top_p": 1.0, + "truncation": "disabled", + "usage": { + "input_tokens": 65, + "input_tokens_details": { + "cached_tokens": 0 + }, + "output_tokens": 30, + "output_tokens_details": { + "reasoning_tokens": 0 + }, + "total_tokens": 95 + }, + "user": null, + "metadata": {} +} diff --git a/fixtures/openai/responses/streaming/single_builtin_tool_parallel.txtar b/fixtures/openai/responses/streaming/single_builtin_tool_parallel.txtar new file mode 100644 index 00000000..0319cab0 --- /dev/null +++ b/fixtures/openai/responses/streaming/single_builtin_tool_parallel.txtar @@ -0,0 +1,86 @@ +-- request -- +{ + "input": [ + { + "role": "user", + "content": "Is 3 + 5 a prime number? Also add 10 + 20. Use the add function for both." + } + ], + "model": "gpt-4.1", + "stream": true, + "tools": [ + { + "type": "function", + "name": "add", + "description": "Add two numbers together.", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number" + }, + "b": { + "type": "number" + } + }, + "required": [ + "a", + "b" + ] + } + } + ] +} + +-- streaming -- +event: response.created +data: {"type":"response.created","response":{"id":"resp_parallel_streaming_001","object":"response","created_at":1767875312,"status":"in_progress","background":false,"completed_at":null,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4.1-2025-04-14","output":[],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers 
together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}},"sequence_number":0} + +event: response.in_progress +data: {"type":"response.in_progress","response":{"id":"resp_parallel_streaming_001","object":"response","created_at":1767875312,"status":"in_progress","background":false,"completed_at":null,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4.1-2025-04-14","output":[],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}},"sequence_number":1} + +event: response.output_item.added +data: {"type":"response.output_item.added","item":{"id":"rs_parallel_streaming_reasoning_001","type":"reasoning","status":"in_progress","summary":[]},"output_index":0,"sequence_number":2} + +event: response.reasoning_summary_part.added +data: {"type":"response.reasoning_summary_part.added","item_id":"rs_parallel_streaming_reasoning_001","output_index":0,"part":{"type":"summary_text","text":""},"summary_index":0,"sequence_number":3} + +event: response.reasoning_summary_text.delta +data: {"type":"response.reasoning_summary_text.delta","item_id":"rs_parallel_streaming_reasoning_001","output_index":0,"summary_index":0,"delta":"The user 
wants two additions: 3+5 and 10+20. I'll call add for both.","sequence_number":4} + +event: response.reasoning_summary_text.done +data: {"type":"response.reasoning_summary_text.done","item_id":"rs_parallel_streaming_reasoning_001","output_index":0,"summary_index":0,"text":"The user wants two additions: 3+5 and 10+20. I'll call add for both.","sequence_number":5} + +event: response.reasoning_summary_part.done +data: {"type":"response.reasoning_summary_part.done","item_id":"rs_parallel_streaming_reasoning_001","output_index":0,"part":{"type":"summary_text","text":"The user wants two additions: 3+5 and 10+20. I'll call add for both."},"summary_index":0,"sequence_number":6} + +event: response.output_item.done +data: {"type":"response.output_item.done","item":{"id":"rs_parallel_streaming_reasoning_001","type":"reasoning","status":"completed","summary":[{"type":"summary_text","text":"The user wants two additions: 3+5 and 10+20. I'll call add for both."}]},"output_index":0,"sequence_number":7} + +event: response.output_item.added +data: {"type":"response.output_item.added","item":{"id":"fc_parallel_streaming_first_001","type":"function_call","status":"in_progress","arguments":"","call_id":"call_ParallelStreamFirst001","name":"add"},"output_index":1,"sequence_number":8} + +event: response.function_call_arguments.delta +data: {"type":"response.function_call_arguments.delta","delta":"{\"a\":3,\"b\":5}","item_id":"fc_parallel_streaming_first_001","output_index":1,"sequence_number":9} + +event: response.function_call_arguments.done +data: {"type":"response.function_call_arguments.done","arguments":"{\"a\":3,\"b\":5}","item_id":"fc_parallel_streaming_first_001","output_index":1,"sequence_number":10} + +event: response.output_item.done +data: 
{"type":"response.output_item.done","item":{"id":"fc_parallel_streaming_first_001","type":"function_call","status":"completed","arguments":"{\"a\":3,\"b\":5}","call_id":"call_ParallelStreamFirst001","name":"add"},"output_index":1,"sequence_number":11} + +event: response.output_item.added +data: {"type":"response.output_item.added","item":{"id":"fc_parallel_streaming_second_001","type":"function_call","status":"in_progress","arguments":"","call_id":"call_ParallelStreamSecond01","name":"add"},"output_index":2,"sequence_number":12} + +event: response.function_call_arguments.delta +data: {"type":"response.function_call_arguments.delta","delta":"{\"a\":10,\"b\":20}","item_id":"fc_parallel_streaming_second_001","output_index":2,"sequence_number":13} + +event: response.function_call_arguments.done +data: {"type":"response.function_call_arguments.done","arguments":"{\"a\":10,\"b\":20}","item_id":"fc_parallel_streaming_second_001","output_index":2,"sequence_number":14} + +event: response.output_item.done +data: {"type":"response.output_item.done","item":{"id":"fc_parallel_streaming_second_001","type":"function_call","status":"completed","arguments":"{\"a\":10,\"b\":20}","call_id":"call_ParallelStreamSecond01","name":"add"},"output_index":2,"sequence_number":15} + +event: response.completed +data: {"type":"response.completed","response":{"id":"resp_parallel_streaming_001","object":"response","created_at":1767875312,"status":"completed","background":false,"completed_at":1767875312,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4.1-2025-04-14","output":[{"id":"rs_parallel_streaming_reasoning_001","type":"reasoning","status":"completed","summary":[{"type":"summary_text","text":"The user wants two additions: 3+5 and 10+20. 
I'll call add for both."}]},{"id":"fc_parallel_streaming_first_001","type":"function_call","status":"completed","arguments":"{\"a\":3,\"b\":5}","call_id":"call_ParallelStreamFirst001","name":"add"},{"id":"fc_parallel_streaming_second_001","type":"function_call","status":"completed","arguments":"{\"a\":10,\"b\":20}","call_id":"call_ParallelStreamSecond01","name":"add"}],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"prompt_cache_retention":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"default","store":true,"temperature":1.0,"text":{"format":{"type":"text"},"verbosity":"medium"},"tool_choice":"auto","tools":[{"type":"function","description":"Add two numbers together.","name":"add","parameters":{"type":"object","properties":{"a":{"type":"number"},"b":{"type":"number"}},"required":["a","b"],"additionalProperties":false},"strict":true}],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":{"input_tokens":65,"input_tokens_details":{"cached_tokens":0},"output_tokens":30,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":95},"user":null,"metadata":{}},"sequence_number":16} + diff --git a/intercept/messages/blocking.go b/intercept/messages/blocking.go index b32f9e8d..0c888d0a 100644 --- a/intercept/messages/blocking.go +++ b/intercept/messages/blocking.go @@ -208,8 +208,13 @@ func (i *BlockingInterception) ProcessRequest(w http.ResponseWriter, r *http.Req InvocationError: err, ModelThoughts: thoughtRecords, }) + // Clear after first use to avoid duplicating across // multiple tool calls in the same message. + // + // This is not strictly needed for injected tools since we + // disable parallel tool calls, but just adding this here + // for defensiveness. 
thoughtRecords = nil if err != nil { diff --git a/intercept/messages/streaming.go b/intercept/messages/streaming.go index 6c2545c9..878565c3 100644 --- a/intercept/messages/streaming.go +++ b/intercept/messages/streaming.go @@ -256,7 +256,7 @@ newStream: // Capture any thinking blocks that were returned. thoughtRecords := i.extractModelThoughts(&message) - // Process injected tool + // Process injected tools. if len(pendingToolCalls) > 0 { // Append the whole message from this stream as context since we'll be sending a new request with the tool results. messages.Messages = append(messages.Messages, message.ToParam()) @@ -315,8 +315,9 @@ newStream: // Clear after first use to avoid duplicating across // multiple tool calls in the same message. // - // This is not strictly need for injected tools since we disable parallel tool calls, - // but just adding this here for defensiveness. + // This is not strictly needed for injected tools since we + // disable parallel tool calls, but just adding this here + // for defensiveness. thoughtRecords = nil if err != nil { diff --git a/intercept/responses/injected_tools.go b/intercept/responses/injected_tools.go index 7d95db8f..9f813720 100644 --- a/intercept/responses/injected_tools.go +++ b/intercept/responses/injected_tools.go @@ -119,8 +119,9 @@ func (i *responsesInterceptionBase) handleInjectedToolCalls(ctx context.Context, // Clear after first use to avoid duplicating across // multiple tool calls in the same message. // - // This is not strictly need for injected tools since we disable parallel tool calls, - // but just adding this here for defensiveness. + // This is not strictly needed for injected tools since we + // disable parallel tool calls, but just adding this here + // for defensiveness. 
thoughtRecords = nil } diff --git a/internal/integrationtest/bridge_test.go b/internal/integrationtest/bridge_test.go index 6150173a..feb03c58 100644 --- a/internal/integrationtest/bridge_test.go +++ b/internal/integrationtest/bridge_test.go @@ -128,39 +128,46 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { t.Parallel() cases := []struct { - name string - streaming bool - fixture []byte - expectedToolCallID string - expectedThoughts []string // nil means no tool usages expected at all + name string + streaming bool + fixture []byte + expectedThoughts []string // nil means no tool usages expected at all }{ { - name: "single thinking block/streaming", - streaming: true, - fixture: fixtures.AntSingleBuiltinTool, - expectedToolCallID: "toolu_01RX68weRSquLx6HUTj65iBo", - expectedThoughts: []string{"The user wants me to read"}, + name: "single thinking block/streaming", + streaming: true, + fixture: fixtures.AntSingleBuiltinTool, + expectedThoughts: []string{"The user wants me to read"}, + }, + { + name: "single thinking block/blocking", + streaming: false, + fixture: fixtures.AntSingleBuiltinTool, + expectedThoughts: []string{"The user wants me to read"}, + }, + { + name: "multiple thinking blocks/streaming", + streaming: true, + fixture: fixtures.AntMultiThinkingBuiltinTool, + expectedThoughts: []string{"The user wants me to read", "I should use the Read tool"}, }, { - name: "single thinking block/blocking", - streaming: false, - fixture: fixtures.AntSingleBuiltinTool, - expectedToolCallID: "toolu_01AusGgY5aKFhzWrFBv9JfHq", - expectedThoughts: []string{"The user wants me to read"}, + name: "multiple thinking blocks/blocking", + streaming: false, + fixture: fixtures.AntMultiThinkingBuiltinTool, + expectedThoughts: []string{"The user wants me to read", "I should use the Read tool"}, }, { - name: "multiple thinking blocks/streaming", - streaming: true, - fixture: fixtures.AntMultiThinkingBuiltinTool, - expectedToolCallID: "toolu_01RX68weRSquLx6HUTj65iBo", - 
expectedThoughts: []string{"The user wants me to read", "I should use the Read tool"}, + name: "parallel tool calls/streaming", + streaming: true, + fixture: fixtures.AntSingleBuiltinToolParallel, + expectedThoughts: []string{"The user wants me to read two files"}, }, { - name: "multiple thinking blocks/blocking", - streaming: false, - fixture: fixtures.AntMultiThinkingBuiltinTool, - expectedToolCallID: "toolu_01AusGgY5aKFhzWrFBv9JfHq", - expectedThoughts: []string{"The user wants me to read", "I should use the Read tool"}, + name: "parallel tool calls/blocking", + streaming: false, + fixture: fixtures.AntSingleBuiltinToolParallel, + expectedThoughts: []string{"The user wants me to read two files"}, }, { name: "no thoughts without tool calls", @@ -197,14 +204,22 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { if tc.expectedThoughts == nil { assert.Empty(t, toolUsages) } else { - require.Len(t, toolUsages, 1) - assert.Equal(t, "Read", toolUsages[0].Tool) - assert.Equal(t, tc.expectedToolCallID, toolUsages[0].ToolCallID) - - require.Len(t, toolUsages[0].ModelThoughts, len(tc.expectedThoughts)) - for i, expected := range tc.expectedThoughts { - assert.Contains(t, toolUsages[0].ModelThoughts[i].Content, expected) + require.NotEmpty(t, toolUsages) + + // Exactly one tool usage should have the expected thoughts; + // all others should have none. 
+ var withThoughts int + for _, tu := range toolUsages { + assert.Equal(t, "Read", tu.Tool) + if len(tu.ModelThoughts) > 0 { + withThoughts++ + require.Len(t, tu.ModelThoughts, len(tc.expectedThoughts)) + for i, expected := range tc.expectedThoughts { + assert.Contains(t, tu.ModelThoughts[i].Content, expected) + } + } } + assert.Equal(t, 1, withThoughts, "expected exactly one tool usage with model thoughts") } bridgeServer.Recorder.VerifyAllInterceptionsEnded(t) diff --git a/internal/integrationtest/responses_test.go b/internal/integrationtest/responses_test.go index 1aceaacf..d120c3bb 100644 --- a/internal/integrationtest/responses_test.go +++ b/internal/integrationtest/responses_test.go @@ -941,58 +941,59 @@ func TestResponsesModelThoughts(t *testing.T) { t.Parallel() cases := []struct { - name string - fixture []byte - expectedToolCallID string - expectedThoughts []string // nil means no tool usages expected at all + name string + fixture []byte + expectedThoughts []string // nil means no tool usages expected at all }{ { - name: "single reasoning/blocking", - fixture: fixtures.OaiResponsesBlockingSingleBuiltinTool, - expectedToolCallID: "call_CJSaa2u51JG996575oVljuNq", - expectedThoughts: []string{"The user wants to add 3 and 5"}, + name: "single reasoning/blocking", + fixture: fixtures.OaiResponsesBlockingSingleBuiltinTool, + expectedThoughts: []string{"The user wants to add 3 and 5"}, }, { - name: "single reasoning/streaming", - fixture: fixtures.OaiResponsesStreamingBuiltinTool, - expectedToolCallID: "call_7VaiUXZYuuuwWwviCrckxq6t", - expectedThoughts: []string{"The user wants to add 3 and 5"}, + name: "single reasoning/streaming", + fixture: fixtures.OaiResponsesStreamingBuiltinTool, + expectedThoughts: []string{"The user wants to add 3 and 5"}, }, { - name: "multiple reasoning items/blocking", - fixture: fixtures.OaiResponsesBlockingMultiReasoningBuiltinTool, - expectedToolCallID: "call_CJSaa2u51JG996575oVljuNq", - expectedThoughts: []string{"The user wants 
to add 3 and 5", "After adding, I will check if the result is prime"}, + name: "multiple reasoning items/blocking", + fixture: fixtures.OaiResponsesBlockingMultiReasoningBuiltinTool, + expectedThoughts: []string{"The user wants to add 3 and 5", "After adding, I will check if the result is prime"}, }, { - name: "multiple reasoning items/streaming", - fixture: fixtures.OaiResponsesStreamingMultiReasoningBuiltinTool, - expectedToolCallID: "call_7VaiUXZYuuuwWwviCrckxq6t", - expectedThoughts: []string{"The user wants to add 3 and 5", "After adding, I will check if the result is prime"}, + name: "multiple reasoning items/streaming", + fixture: fixtures.OaiResponsesStreamingMultiReasoningBuiltinTool, + expectedThoughts: []string{"The user wants to add 3 and 5", "After adding, I will check if the result is prime"}, }, { - name: "commentary/blocking", - fixture: fixtures.OaiResponsesBlockingCommentaryBuiltinTool, - expectedToolCallID: "call_A8TkZmIcKtw2Zw952Wc5QVe7", - expectedThoughts: []string{"Checking whether 3 + 5 is prime by calling the add function first."}, + name: "commentary/blocking", + fixture: fixtures.OaiResponsesBlockingCommentaryBuiltinTool, + expectedThoughts: []string{"Checking whether 3 + 5 is prime by calling the add function first."}, }, { - name: "commentary/streaming", - fixture: fixtures.OaiResponsesStreamingCommentaryBuiltinTool, - expectedToolCallID: "call_A8TkZmIcKtw2Zw952Wc5QVe7", - expectedThoughts: []string{"Checking whether 3 + 5 is prime by calling the add function first."}, + name: "commentary/streaming", + fixture: fixtures.OaiResponsesStreamingCommentaryBuiltinTool, + expectedThoughts: []string{"Checking whether 3 + 5 is prime by calling the add function first."}, }, { - name: "summary and commentary/blocking", - fixture: fixtures.OaiResponsesBlockingSummaryAndCommentaryBuiltinTool, - expectedToolCallID: "call_B9UjYX01Lvvv1XwjDsdmRW3f", - expectedThoughts: []string{"I need to add 3 and 5 to check primality.", "Let me calculate the sum 
first using the add function."}, + name: "summary and commentary/blocking", + fixture: fixtures.OaiResponsesBlockingSummaryAndCommentaryBuiltinTool, + expectedThoughts: []string{"I need to add 3 and 5 to check primality.", "Let me calculate the sum first using the add function."}, }, { - name: "summary and commentary/streaming", - fixture: fixtures.OaiResponsesStreamingSummaryAndCommentaryBuiltinTool, - expectedToolCallID: "call_B9UjYX01Lvvv1XwjDsdmRW3f", - expectedThoughts: []string{"I need to add 3 and 5 to check primality.", "Let me calculate the sum first using the add function."}, + name: "summary and commentary/streaming", + fixture: fixtures.OaiResponsesStreamingSummaryAndCommentaryBuiltinTool, + expectedThoughts: []string{"I need to add 3 and 5 to check primality.", "Let me calculate the sum first using the add function."}, + }, + { + name: "parallel tool calls/blocking", + fixture: fixtures.OaiResponsesBlockingSingleBuiltinToolParallel, + expectedThoughts: []string{"The user wants two additions"}, + }, + { + name: "parallel tool calls/streaming", + fixture: fixtures.OaiResponsesStreamingSingleBuiltinToolParallel, + expectedThoughts: []string{"The user wants two additions"}, }, { name: "no thoughts without tool calls", @@ -1022,14 +1023,22 @@ func TestResponsesModelThoughts(t *testing.T) { if tc.expectedThoughts == nil { require.Empty(t, toolUsages) } else { - require.Len(t, toolUsages, 1) - require.Equal(t, "add", toolUsages[0].Tool) - require.Equal(t, tc.expectedToolCallID, toolUsages[0].ToolCallID) - - require.Len(t, toolUsages[0].ModelThoughts, len(tc.expectedThoughts)) - for i, expected := range tc.expectedThoughts { - require.Contains(t, toolUsages[0].ModelThoughts[i].Content, expected) + require.NotEmpty(t, toolUsages) + + // Exactly one tool usage should have the expected thoughts; + // all others should have none. 
+ var withThoughts int + for _, tu := range toolUsages { + require.Equal(t, "add", tu.Tool) + if len(tu.ModelThoughts) > 0 { + withThoughts++ + require.Len(t, tu.ModelThoughts, len(tc.expectedThoughts)) + for i, expected := range tc.expectedThoughts { + require.Contains(t, tu.ModelThoughts[i].Content, expected) + } + } } + require.Equal(t, 1, withThoughts, "expected exactly one tool usage with model thoughts") } }) } From befe9d94061533350ea45392b490d9c7cee8c3e0 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Thu, 12 Mar 2026 14:34:43 +0200 Subject: [PATCH 11/14] chore: capture source of thinking/reasoning Signed-off-by: Danny Kopping --- intercept/messages/base.go | 1 + intercept/responses/base.go | 2 + internal/integrationtest/bridge_test.go | 2 + internal/integrationtest/responses_test.go | 59 ++++++++++++++-------- 4 files changed, 44 insertions(+), 20 deletions(-) diff --git a/intercept/messages/base.go b/intercept/messages/base.go index 58ac23a9..b50de147 100644 --- a/intercept/messages/base.go +++ b/intercept/messages/base.go @@ -180,6 +180,7 @@ func (i *interceptionBase) extractModelThoughts(msg *anthropic.Message) []*recor case anthropic.ThinkingBlock: thoughtRecords = append(thoughtRecords, &recorder.ModelThoughtRecord{ Content: variant.Thinking, + Metadata: recorder.Metadata{"source": "thinking"}, CreatedAt: time.Now(), }) } diff --git a/intercept/responses/base.go b/intercept/responses/base.go index e61bc86c..12cddebf 100644 --- a/intercept/responses/base.go +++ b/intercept/responses/base.go @@ -356,6 +356,7 @@ func (i *responsesInterceptionBase) extractModelThoughts(response *responses.Res } thoughts = append(thoughts, &recorder.ModelThoughtRecord{ Content: summary.Text, + Metadata: recorder.Metadata{"source": "reasoning_summary"}, CreatedAt: time.Now(), }) } @@ -379,6 +380,7 @@ func (i *responsesInterceptionBase) extractModelThoughts(response *responses.Res } thoughts = append(thoughts, &recorder.ModelThoughtRecord{ Content: part.Text, + Metadata: 
recorder.Metadata{"source": "commentary"}, CreatedAt: time.Now(), }) } diff --git a/internal/integrationtest/bridge_test.go b/internal/integrationtest/bridge_test.go index feb03c58..4a424f32 100644 --- a/internal/integrationtest/bridge_test.go +++ b/internal/integrationtest/bridge_test.go @@ -216,6 +216,8 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { require.Len(t, tu.ModelThoughts, len(tc.expectedThoughts)) for i, expected := range tc.expectedThoughts { assert.Contains(t, tu.ModelThoughts[i].Content, expected) + assert.Equal(t, "thinking", tu.ModelThoughts[i].Metadata["source"], + "thought %d should have source \"thinking\"", i) } } } diff --git a/internal/integrationtest/responses_test.go b/internal/integrationtest/responses_test.go index d120c3bb..2cff3e63 100644 --- a/internal/integrationtest/responses_test.go +++ b/internal/integrationtest/responses_test.go @@ -940,60 +940,77 @@ func TestResponsesInjectedTool(t *testing.T) { func TestResponsesModelThoughts(t *testing.T) { t.Parallel() + type expectedThought struct { + content string + source string // "reasoning_summary" or "commentary" + } + cases := []struct { name string fixture []byte - expectedThoughts []string // nil means no tool usages expected at all + expectedThoughts []expectedThought // nil means no tool usages expected at all }{ { name: "single reasoning/blocking", fixture: fixtures.OaiResponsesBlockingSingleBuiltinTool, - expectedThoughts: []string{"The user wants to add 3 and 5"}, + expectedThoughts: []expectedThought{{content: "The user wants to add 3 and 5", source: "reasoning_summary"}}, }, { name: "single reasoning/streaming", fixture: fixtures.OaiResponsesStreamingBuiltinTool, - expectedThoughts: []string{"The user wants to add 3 and 5"}, + expectedThoughts: []expectedThought{{content: "The user wants to add 3 and 5", source: "reasoning_summary"}}, }, { - name: "multiple reasoning items/blocking", - fixture: fixtures.OaiResponsesBlockingMultiReasoningBuiltinTool, - 
expectedThoughts: []string{"The user wants to add 3 and 5", "After adding, I will check if the result is prime"}, + name: "multiple reasoning items/blocking", + fixture: fixtures.OaiResponsesBlockingMultiReasoningBuiltinTool, + expectedThoughts: []expectedThought{ + {content: "The user wants to add 3 and 5", source: "reasoning_summary"}, + {content: "After adding, I will check if the result is prime", source: "reasoning_summary"}, + }, }, { - name: "multiple reasoning items/streaming", - fixture: fixtures.OaiResponsesStreamingMultiReasoningBuiltinTool, - expectedThoughts: []string{"The user wants to add 3 and 5", "After adding, I will check if the result is prime"}, + name: "multiple reasoning items/streaming", + fixture: fixtures.OaiResponsesStreamingMultiReasoningBuiltinTool, + expectedThoughts: []expectedThought{ + {content: "The user wants to add 3 and 5", source: "reasoning_summary"}, + {content: "After adding, I will check if the result is prime", source: "reasoning_summary"}, + }, }, { name: "commentary/blocking", fixture: fixtures.OaiResponsesBlockingCommentaryBuiltinTool, - expectedThoughts: []string{"Checking whether 3 + 5 is prime by calling the add function first."}, + expectedThoughts: []expectedThought{{content: "Checking whether 3 + 5 is prime by calling the add function first.", source: "commentary"}}, }, { name: "commentary/streaming", fixture: fixtures.OaiResponsesStreamingCommentaryBuiltinTool, - expectedThoughts: []string{"Checking whether 3 + 5 is prime by calling the add function first."}, + expectedThoughts: []expectedThought{{content: "Checking whether 3 + 5 is prime by calling the add function first.", source: "commentary"}}, }, { - name: "summary and commentary/blocking", - fixture: fixtures.OaiResponsesBlockingSummaryAndCommentaryBuiltinTool, - expectedThoughts: []string{"I need to add 3 and 5 to check primality.", "Let me calculate the sum first using the add function."}, + name: "summary and commentary/blocking", + fixture: 
fixtures.OaiResponsesBlockingSummaryAndCommentaryBuiltinTool, + expectedThoughts: []expectedThought{ + {content: "I need to add 3 and 5 to check primality.", source: "reasoning_summary"}, + {content: "Let me calculate the sum first using the add function.", source: "commentary"}, + }, }, { - name: "summary and commentary/streaming", - fixture: fixtures.OaiResponsesStreamingSummaryAndCommentaryBuiltinTool, - expectedThoughts: []string{"I need to add 3 and 5 to check primality.", "Let me calculate the sum first using the add function."}, + name: "summary and commentary/streaming", + fixture: fixtures.OaiResponsesStreamingSummaryAndCommentaryBuiltinTool, + expectedThoughts: []expectedThought{ + {content: "I need to add 3 and 5 to check primality.", source: "reasoning_summary"}, + {content: "Let me calculate the sum first using the add function.", source: "commentary"}, + }, }, { name: "parallel tool calls/blocking", fixture: fixtures.OaiResponsesBlockingSingleBuiltinToolParallel, - expectedThoughts: []string{"The user wants two additions"}, + expectedThoughts: []expectedThought{{content: "The user wants two additions", source: "reasoning_summary"}}, }, { name: "parallel tool calls/streaming", fixture: fixtures.OaiResponsesStreamingSingleBuiltinToolParallel, - expectedThoughts: []string{"The user wants two additions"}, + expectedThoughts: []expectedThought{{content: "The user wants two additions", source: "reasoning_summary"}}, }, { name: "no thoughts without tool calls", @@ -1034,7 +1051,9 @@ func TestResponsesModelThoughts(t *testing.T) { withThoughts++ require.Len(t, tu.ModelThoughts, len(tc.expectedThoughts)) for i, expected := range tc.expectedThoughts { - require.Contains(t, tu.ModelThoughts[i].Content, expected) + require.Contains(t, tu.ModelThoughts[i].Content, expected.content) + require.Equal(t, expected.source, tu.ModelThoughts[i].Metadata["source"], + "thought %d should have source %q", i, expected.source) } } } From 7fbfb8e3983449217f9d4554ca31bf4bab98aafe 
Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 13 Mar 2026 17:24:30 +0200 Subject: [PATCH 12/14] chore: break association between thoughts and tools Signed-off-by: Danny Kopping --- api.go | 1 + intercept/messages/base.go | 8 ++-- intercept/messages/blocking.go | 27 ++++-------- intercept/messages/streaming.go | 25 +++-------- intercept/responses/base.go | 32 +++++++------- intercept/responses/blocking.go | 1 + intercept/responses/injected_tools.go | 18 ++------ intercept/responses/streaming.go | 2 + internal/integrationtest/bridge_test.go | 51 +++++++++++++--------- internal/integrationtest/responses_test.go | 43 +++++++++--------- internal/integrationtest/trace_test.go | 2 + internal/testutil/mock_recorder.go | 16 +++++++ recorder/recorder.go | 34 +++++++++++++++ recorder/types.go | 18 +++++--- 14 files changed, 160 insertions(+), 118 deletions(-) diff --git a/api.go b/api.go index acc789ef..e0486d77 100644 --- a/api.go +++ b/api.go @@ -30,6 +30,7 @@ type ( TokenUsageRecord = recorder.TokenUsageRecord PromptUsageRecord = recorder.PromptUsageRecord ToolUsageRecord = recorder.ToolUsageRecord + ModelThoughtRecord = recorder.ModelThoughtRecord Recorder = recorder.Recorder Metadata = recorder.Metadata diff --git a/intercept/messages/base.go b/intercept/messages/base.go index b50de147..f1a123ad 100644 --- a/intercept/messages/base.go +++ b/intercept/messages/base.go @@ -178,10 +178,12 @@ func (i *interceptionBase) extractModelThoughts(msg *anthropic.Message) []*recor for _, block := range msg.Content { switch variant := block.AsAny().(type) { case anthropic.ThinkingBlock: + if variant.Thinking == "" { + continue + } thoughtRecords = append(thoughtRecords, &recorder.ModelThoughtRecord{ - Content: variant.Thinking, - Metadata: recorder.Metadata{"source": "thinking"}, - CreatedAt: time.Now(), + Content: variant.Thinking, + Metadata: recorder.Metadata{"source": recorder.ThoughtSourceThinking}, }) } // anthropic.RedactedThinkingBlock also exists, but there's 
nothing useful we can capture. diff --git a/intercept/messages/blocking.go b/intercept/messages/blocking.go index 0c888d0a..6d2ed6f8 100644 --- a/intercept/messages/blocking.go +++ b/intercept/messages/blocking.go @@ -136,9 +136,15 @@ func (i *BlockingInterception) ProcessRequest(w http.ResponseWriter, r *http.Req accumulateUsage(&cumulativeUsage, resp.Usage) // Capture any thinking blocks that were returned. - thoughtRecords := i.extractModelThoughts(resp) + for _, t := range i.extractModelThoughts(resp) { + _ = i.recorder.RecordModelThought(ctx, &recorder.ModelThoughtRecord{ + InterceptionID: i.ID().String(), + Content: t.Content, + Metadata: t.Metadata, + }) + } - // Handle tool calls for non-streaming. + // Handle tool calls. var pendingToolCalls []anthropic.ToolUseBlock for _, c := range resp.Content { toolUse := c.AsToolUse() @@ -159,15 +165,7 @@ func (i *BlockingInterception) ProcessRequest(w http.ResponseWriter, r *http.Req Tool: toolUse.Name, Args: toolUse.Input, Injected: false, - ModelThoughts: thoughtRecords, }) - - // Clear after first use to avoid duplicating across - // multiple tool calls in the same message. - // - // This effectively means that in the case of parallel tool calls - // the thoughts will only be associated to the first tool use which is fine. - thoughtRecords = nil } // If no injected tool calls, we're done. @@ -206,17 +204,8 @@ func (i *BlockingInterception) ProcessRequest(w http.ResponseWriter, r *http.Req Args: tc.Input, Injected: true, InvocationError: err, - ModelThoughts: thoughtRecords, }) - // Clear after first use to avoid duplicating across - // multiple tool calls in the same message. - // - // This is not strictly needed for injected tools since we - // disable parallel tool calls, but just adding this here - // for defensiveness. 
- thoughtRecords = nil - if err != nil { // Always provide a tool_result even if the tool call failed messages.Messages = append(messages.Messages, diff --git a/intercept/messages/streaming.go b/intercept/messages/streaming.go index 878565c3..595cfe44 100644 --- a/intercept/messages/streaming.go +++ b/intercept/messages/streaming.go @@ -254,7 +254,13 @@ newStream: case string(constant.ValueOf[constant.MessageStop]()): // Capture any thinking blocks that were returned. - thoughtRecords := i.extractModelThoughts(&message) + for _, t := range i.extractModelThoughts(&message) { + _ = i.recorder.RecordModelThought(ctx, &recorder.ModelThoughtRecord{ + InterceptionID: i.ID().String(), + Content: t.Content, + Metadata: t.Metadata, + }) + } // Process injected tools. if len(pendingToolCalls) > 0 { @@ -309,17 +315,8 @@ newStream: Args: input, Injected: true, InvocationError: err, - ModelThoughts: thoughtRecords, }) - // Clear after first use to avoid duplicating across - // multiple tool calls in the same message. - // - // This is not strictly needed for injected tools since we - // disable parallel tool calls, but just adding this here - // for defensiveness. - thoughtRecords = nil - if err != nil { // Always provide a tool_result even if the tool call failed messages.Messages = append(messages.Messages, @@ -430,15 +427,7 @@ newStream: Tool: variant.Name, Args: variant.Input, Injected: false, - ModelThoughts: thoughtRecords, }) - - // Clear after first use to avoid duplicating across - // multiple tool calls in the same message. - // - // This effectively means that in the case of parallel tool calls - // the thoughts will only be associated to the first tool use which is fine. 
- thoughtRecords = nil } } } diff --git a/intercept/responses/base.go b/intercept/responses/base.go index 12cddebf..59a14b02 100644 --- a/intercept/responses/base.go +++ b/intercept/responses/base.go @@ -254,15 +254,22 @@ func (i *responsesInterceptionBase) recordUserPrompt(ctx context.Context, respon } } +func (i *responsesInterceptionBase) recordModelThoughts(ctx context.Context, response *responses.Response) { + for _, t := range i.extractModelThoughts(response) { + _ = i.recorder.RecordModelThought(ctx, &recorder.ModelThoughtRecord{ + InterceptionID: i.ID().String(), + Content: t.Content, + Metadata: t.Metadata, + }) + } +} + func (i *responsesInterceptionBase) recordNonInjectedToolUsage(ctx context.Context, response *responses.Response) { if response == nil { i.logger.Warn(ctx, "got empty response, skipping tool usage recording") return } - // Capture any reasoning items from the response output as model thoughts. - thoughtRecords := i.extractModelThoughts(response) - for _, item := range response.Output { var args recorder.ToolArgs @@ -283,17 +290,9 @@ func (i *responsesInterceptionBase) recordNonInjectedToolUsage(ctx context.Conte Tool: item.Name, Args: args, Injected: false, - ModelThoughts: thoughtRecords, }); err != nil { i.logger.Warn(ctx, "failed to record tool usage", slog.Error(err), slog.F("tool", item.Name)) } - - // Clear after first use to avoid duplicating across - // multiple tool calls in the same message. - // - // This effectively means that in the case of parallel tool calls - // the thoughts will only be associated to the first tool use which is fine. 
- thoughtRecords = nil } } @@ -355,9 +354,8 @@ func (i *responsesInterceptionBase) extractModelThoughts(response *responses.Res continue } thoughts = append(thoughts, &recorder.ModelThoughtRecord{ - Content: summary.Text, - Metadata: recorder.Metadata{"source": "reasoning_summary"}, - CreatedAt: time.Now(), + Content: summary.Text, + Metadata: recorder.Metadata{"source": recorder.ThoughtSourceReasoningSummary}, }) } @@ -365,6 +363,7 @@ func (i *responsesInterceptionBase) extractModelThoughts(response *responses.Res // The API sometimes returns commentary messages instead of reasoning // summaries. These are assistant message output items with "phase": "commentary". // The SDK doesn't expose a Phase field, so we extract it from raw JSON. + // TODO: revisit when the OpenAI SDK adds a proper Phase field. raw := item.RawJSON() if gjson.Get(raw, "role").String() != string(constant.ValueOf[constant.Assistant]()) || gjson.Get(raw, "phase").String() != "commentary" { @@ -379,9 +378,8 @@ func (i *responsesInterceptionBase) extractModelThoughts(response *responses.Res continue } thoughts = append(thoughts, &recorder.ModelThoughtRecord{ - Content: part.Text, - Metadata: recorder.Metadata{"source": "commentary"}, - CreatedAt: time.Now(), + Content: part.Text, + Metadata: recorder.Metadata{"source": recorder.ThoughtSourceCommentary}, }) } } diff --git a/intercept/responses/blocking.go b/intercept/responses/blocking.go index 0c11a541..48491e43 100644 --- a/intercept/responses/blocking.go +++ b/intercept/responses/blocking.go @@ -94,6 +94,7 @@ func (i *BlockingResponsesInterceptor) ProcessRequest(w http.ResponseWriter, r * } i.recordTokenUsage(ctx, response) + i.recordModelThoughts(ctx, response) // Check if there any injected tools to invoke. 
pending := i.getPendingInjectedToolCalls(response) diff --git a/intercept/responses/injected_tools.go b/intercept/responses/injected_tools.go index 9f813720..e3720230 100644 --- a/intercept/responses/injected_tools.go +++ b/intercept/responses/injected_tools.go @@ -109,20 +109,9 @@ func (i *responsesInterceptionBase) handleInjectedToolCalls(ctx context.Context, return nil, nil } - // Capture any reasoning items from the response output as model thoughts. - thoughtRecords := i.extractModelThoughts(response) - var results []responses.ResponseInputItemUnionParam for _, fc := range pending { - results = append(results, i.invokeInjectedTool(ctx, response.ID, fc, thoughtRecords)) - - // Clear after first use to avoid duplicating across - // multiple tool calls in the same message. - // - // This is not strictly needed for injected tools since we - // disable parallel tool calls, but just adding this here - // for defensiveness. - thoughtRecords = nil + results = append(results, i.invokeInjectedTool(ctx, response.ID, fc)) } return results, nil @@ -182,7 +171,7 @@ func (i *responsesInterceptionBase) prepareRequestForAgenticLoop(ctx context.Con return nil } -// getPendingInjectedToolCalls extracts function calls from the response that are managed by MCP proxy +// getPendingInjectedToolCalls extracts function calls from the response that are managed by MCP proxy. 
func (i *responsesInterceptionBase) getPendingInjectedToolCalls(response *responses.Response) []responses.ResponseFunctionToolCall { var calls []responses.ResponseFunctionToolCall @@ -207,7 +196,7 @@ func (i *responsesInterceptionBase) getPendingInjectedToolCalls(response *respon return calls } -func (i *responsesInterceptionBase) invokeInjectedTool(ctx context.Context, responseID string, fc responses.ResponseFunctionToolCall, thoughtRecords []*recorder.ModelThoughtRecord) responses.ResponseInputItemUnionParam { +func (i *responsesInterceptionBase) invokeInjectedTool(ctx context.Context, responseID string, fc responses.ResponseFunctionToolCall) responses.ResponseInputItemUnionParam { tool := i.mcpProxy.GetTool(fc.Name) if tool == nil { return responses.ResponseInputItemParamOfFunctionCallOutput(fc.CallID, fmt.Sprintf("error: unknown injected function %q", fc.ID)) @@ -224,7 +213,6 @@ func (i *responsesInterceptionBase) invokeInjectedTool(ctx context.Context, resp Args: args, Injected: true, InvocationError: err, - ModelThoughts: thoughtRecords, }) var output string diff --git a/intercept/responses/streaming.go b/intercept/responses/streaming.go index 38f5771b..32ee1f02 100644 --- a/intercept/responses/streaming.go +++ b/intercept/responses/streaming.go @@ -172,6 +172,8 @@ func (i *StreamingResponsesInterceptor) ProcessRequest(w http.ResponseWriter, r // Record token usage for each inner loop iteration i.recordTokenUsage(ctx, completedResponse) } + + i.recordModelThoughts(ctx, completedResponse) } if promptFound { diff --git a/internal/integrationtest/bridge_test.go b/internal/integrationtest/bridge_test.go index 4a424f32..9ceafe0d 100644 --- a/internal/integrationtest/bridge_test.go +++ b/internal/integrationtest/bridge_test.go @@ -131,7 +131,7 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { name string streaming bool fixture []byte - expectedThoughts []string // nil means no tool usages expected at all + expectedThoughts []string // nil means no model 
thoughts expected }{ { name: "single thinking block/streaming", @@ -170,9 +170,16 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { expectedThoughts: []string{"The user wants me to read two files"}, }, { - name: "no thoughts without tool calls", - streaming: true, - fixture: fixtures.AntSimple, // This fixture contains thoughts, but they're not associated with tool calls. + name: "thoughts without tool calls/streaming", + streaming: true, + fixture: fixtures.AntSimple, + expectedThoughts: []string{"This is a classic philosophical question about medieval scholasticism"}, + }, + { + name: "thoughts without tool calls/blocking", + streaming: false, + fixture: fixtures.AntSimple, + expectedThoughts: []string{"This is a classic philosophical question about medieval scholasticism"}, }, } @@ -200,28 +207,30 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { assert.Contains(t, sp.AllEvents(), "message_stop") } - toolUsages := bridgeServer.Recorder.RecordedToolUsages() + interceptions := bridgeServer.Recorder.RecordedInterceptions() + require.GreaterOrEqual(t, len(interceptions), 1) + + thoughts := bridgeServer.Recorder.RecordedModelThoughts() if tc.expectedThoughts == nil { - assert.Empty(t, toolUsages) + assert.Empty(t, thoughts) } else { - require.NotEmpty(t, toolUsages) - - // Exactly one tool usage should have the expected thoughts; - // all others should have none. 
- var withThoughts int - for _, tu := range toolUsages { - assert.Equal(t, "Read", tu.Tool) - if len(tu.ModelThoughts) > 0 { - withThoughts++ - require.Len(t, tu.ModelThoughts, len(tc.expectedThoughts)) - for i, expected := range tc.expectedThoughts { - assert.Contains(t, tu.ModelThoughts[i].Content, expected) - assert.Equal(t, "thinking", tu.ModelThoughts[i].Metadata["source"], - "thought %d should have source \"thinking\"", i) + require.Len(t, thoughts, len(tc.expectedThoughts), "unexpected number of model thoughts") + + // We can't guarantee the order of model thoughts since they're recorded separately, so + // we have to scan all thoughts for a match. + + for _, expected := range tc.expectedThoughts { + var matched *aibridge.ModelThoughtRecord + for _, thought := range thoughts { + if strings.Contains(thought.Content, expected) { + matched = thought } } + + require.NotNil(t, matched, "could not find thought matching %q", expected) + require.Equal(t, interceptions[0].ID, matched.InterceptionID) + require.Equal(t, "thinking", matched.Metadata["source"]) } - assert.Equal(t, 1, withThoughts, "expected exactly one tool usage with model thoughts") } bridgeServer.Recorder.VerifyAllInterceptionsEnded(t) diff --git a/internal/integrationtest/responses_test.go b/internal/integrationtest/responses_test.go index 2cff3e63..358175a9 100644 --- a/internal/integrationtest/responses_test.go +++ b/internal/integrationtest/responses_test.go @@ -10,6 +10,7 @@ import ( "net/http/httptest" "slices" "strconv" + "strings" "sync" "testing" "time" @@ -374,7 +375,6 @@ func TestResponsesOutputMatchesUpstream(t *testing.T) { require.Len(t, recordedTools, 1) recordedTools[0].InterceptionID = tc.expectToolRecorded.InterceptionID // ignore interception id (interception id is not constant and response doesn't contain it) recordedTools[0].CreatedAt = tc.expectToolRecorded.CreatedAt // ignore time - recordedTools[0].ModelThoughts = tc.expectToolRecorded.ModelThoughts // ignore model thoughts 
(tested separately) require.Equal(t, tc.expectToolRecorded, recordedTools[0]) } else { require.Empty(t, recordedTools) @@ -1013,8 +1013,9 @@ func TestResponsesModelThoughts(t *testing.T) { expectedThoughts: []expectedThought{{content: "The user wants two additions", source: "reasoning_summary"}}, }, { - name: "no thoughts without tool calls", - fixture: fixtures.OaiResponsesStreamingCodex, // This fixture contains reasoning, but it's not associated with tool calls. + name: "thoughts without tool calls", + fixture: fixtures.OaiResponsesStreamingCodex, // This fixture contains reasoning, but it's not associated with tool calls. + expectedThoughts: []expectedThought{{content: "Preparing simple response", source: "reasoning_summary"}}, }, } @@ -1036,28 +1037,30 @@ func TestResponsesModelThoughts(t *testing.T) { _, err := io.ReadAll(resp.Body) require.NoError(t, err) - toolUsages := bridgeServer.Recorder.RecordedToolUsages() + interceptions := bridgeServer.Recorder.RecordedInterceptions() + require.GreaterOrEqual(t, len(interceptions), 1) + + thoughts := bridgeServer.Recorder.RecordedModelThoughts() if tc.expectedThoughts == nil { - require.Empty(t, toolUsages) + assert.Empty(t, thoughts) } else { - require.NotEmpty(t, toolUsages) - - // Exactly one tool usage should have the expected thoughts; - // all others should have none. 
- var withThoughts int - for _, tu := range toolUsages { - require.Equal(t, "add", tu.Tool) - if len(tu.ModelThoughts) > 0 { - withThoughts++ - require.Len(t, tu.ModelThoughts, len(tc.expectedThoughts)) - for i, expected := range tc.expectedThoughts { - require.Contains(t, tu.ModelThoughts[i].Content, expected.content) - require.Equal(t, expected.source, tu.ModelThoughts[i].Metadata["source"], - "thought %d should have source %q", i, expected.source) + require.Len(t, thoughts, len(tc.expectedThoughts), "unexpected number of model thoughts") + + // We can't guarantee the order of model thoughts since they're recorded separately, so + // we have to scan all thoughts for a match. + + for _, expected := range tc.expectedThoughts { + var matched *aibridge.ModelThoughtRecord + for _, thought := range thoughts { + if strings.Contains(thought.Content, expected.content) { + matched = thought } } + + require.NotNil(t, matched, "could not find thought matching %q", expected) + require.Equal(t, interceptions[0].ID, matched.InterceptionID) + require.Equal(t, expected.source, matched.Metadata["source"]) } - require.Equal(t, 1, withThoughts, "expected exactly one tool usage with model thoughts") } }) } diff --git a/internal/integrationtest/trace_test.go b/internal/integrationtest/trace_test.go index 88bec31c..bdfb7f7f 100644 --- a/internal/integrationtest/trace_test.go +++ b/internal/integrationtest/trace_test.go @@ -51,6 +51,7 @@ func TestTraceAnthropic(t *testing.T) { {"Intercept.RecordPromptUsage", 1, codes.Unset}, {"Intercept.RecordTokenUsage", 1, codes.Unset}, {"Intercept.RecordToolUsage", 1, codes.Unset}, + {"Intercept.RecordModelThought", 1, codes.Unset}, {"Intercept.ProcessRequest.Upstream", 1, codes.Unset}, } @@ -63,6 +64,7 @@ func TestTraceAnthropic(t *testing.T) { {"Intercept.RecordPromptUsage", 1, codes.Unset}, {"Intercept.RecordTokenUsage", 2, codes.Unset}, {"Intercept.RecordToolUsage", 1, codes.Unset}, + {"Intercept.RecordModelThought", 1, codes.Unset}, 
{"Intercept.ProcessRequest.Upstream", 1, codes.Unset}, } diff --git a/internal/testutil/mock_recorder.go b/internal/testutil/mock_recorder.go index 09bcac39..5cd4420f 100644 --- a/internal/testutil/mock_recorder.go +++ b/internal/testutil/mock_recorder.go @@ -20,6 +20,7 @@ type MockRecorder struct { tokenUsages []*recorder.TokenUsageRecord userPrompts []*recorder.PromptUsageRecord toolUsages []*recorder.ToolUsageRecord + modelThoughts []*recorder.ModelThoughtRecord interceptionsEnd map[string]*recorder.InterceptionRecordEnded } @@ -64,6 +65,13 @@ func (m *MockRecorder) RecordToolUsage(ctx context.Context, req *recorder.ToolUs return nil } +func (m *MockRecorder) RecordModelThought(ctx context.Context, req *recorder.ModelThoughtRecord) error { + m.mu.Lock() + defer m.mu.Unlock() + m.modelThoughts = append(m.modelThoughts, req) + return nil +} + // RecordedTokenUsages returns a copy of recorded token usages in a thread-safe manner. // Note: This is a shallow clone - the slice is copied but the pointers reference the // same underlying records. This is sufficient for our test assertions which only read @@ -112,6 +120,14 @@ func (m *MockRecorder) RecordedToolUsages() []*recorder.ToolUsageRecord { return slices.Clone(m.toolUsages) } +// RecordedModelThoughts returns a copy of recorded model thoughts in a thread-safe manner. +// Note: This is a shallow clone (see RecordedTokenUsages for details). +func (m *MockRecorder) RecordedModelThoughts() []*recorder.ModelThoughtRecord { + m.mu.Lock() + defer m.mu.Unlock() + return slices.Clone(m.modelThoughts) +} + // RecordedInterceptions returns a copy of recorded interceptions in a thread-safe manner. // Note: This is a shallow clone (see RecordedTokenUsages for details). 
func (m *MockRecorder) RecordedInterceptions() []*recorder.InterceptionRecord { diff --git a/recorder/recorder.go b/recorder/recorder.go index 6e37b632..c4f427c5 100644 --- a/recorder/recorder.go +++ b/recorder/recorder.go @@ -116,6 +116,24 @@ func (r *RecorderWrapper) RecordToolUsage(ctx context.Context, req *ToolUsageRec return err } +func (r *RecorderWrapper) RecordModelThought(ctx context.Context, req *ModelThoughtRecord) (outErr error) { + ctx, span := r.tracer.Start(ctx, "Intercept.RecordModelThought", trace.WithAttributes(tracing.InterceptionAttributesFromContext(ctx)...)) + defer tracing.EndSpanErr(span, &outErr) + + client, err := r.clientFn() + if err != nil { + return fmt.Errorf("acquire client: %w", err) + } + + req.CreatedAt = time.Now() + if err = client.RecordModelThought(ctx, req); err == nil { + return nil + } + + r.logger.Warn(ctx, "failed to record model thought", slog.Error(err), slog.F("interception_id", req.InterceptionID)) + return err +} + func NewRecorder(logger slog.Logger, tracer trace.Tracer, clientFn func() (Recorder, error)) *RecorderWrapper { return &RecorderWrapper{ logger: logger, @@ -259,6 +277,22 @@ func (a *AsyncRecorder) RecordToolUsage(ctx context.Context, req *ToolUsageRecor return nil // Caller is not interested in error. } +func (a *AsyncRecorder) RecordModelThought(ctx context.Context, req *ModelThoughtRecord) error { + a.wg.Add(1) + go func() { + defer a.wg.Done() + timedCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), a.timeout) + defer cancel() + + err := a.wrapped.RecordModelThought(timedCtx, req) + if err != nil { + a.logger.Warn(timedCtx, "failed to record model thought", slog.F("type", "model_thought"), slog.Error(err), slog.F("payload", req)) + } + }() + + return nil // Caller is not interested in error. 
+} + func (a *AsyncRecorder) Wait() { a.wg.Wait() } diff --git a/recorder/types.go b/recorder/types.go index d3cbaf73..20e735f4 100644 --- a/recorder/types.go +++ b/recorder/types.go @@ -18,8 +18,9 @@ type Recorder interface { // RecordPromptUsage records the prompts used in an interception with an upstream AI provider. RecordPromptUsage(ctx context.Context, req *PromptUsageRecord) error // RecordToolUsage records the tools used in an interception with an upstream AI provider. - // Any associated model thoughts should be included in the ToolUsageRecord. RecordToolUsage(ctx context.Context, req *ToolUsageRecord) error + // RecordModelThought records model thoughts produced in an interception with an upstream AI provider. + RecordModelThought(ctx context.Context, req *ModelThoughtRecord) error } type ToolArgs any @@ -73,11 +74,18 @@ type ToolUsageRecord struct { InvocationError error Metadata Metadata CreatedAt time.Time - ModelThoughts []*ModelThoughtRecord } +// Model thought source constants. 
+const ( + ThoughtSourceThinking = "thinking" + ThoughtSourceReasoningSummary = "reasoning_summary" + ThoughtSourceCommentary = "commentary" +) + type ModelThoughtRecord struct { - Content string - Metadata Metadata - CreatedAt time.Time + InterceptionID string + Content string + Metadata Metadata + CreatedAt time.Time } From 33cec2a0feb47178b478a4f68de27c4b5775d5ff Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 16 Mar 2026 15:35:51 +0200 Subject: [PATCH 13/14] chore: expand trace testing Signed-off-by: Danny Kopping --- internal/integrationtest/trace_test.go | 85 +++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 7 deletions(-) diff --git a/internal/integrationtest/trace_test.go b/internal/integrationtest/trace_test.go index bdfb7f7f..e9b27d64 100644 --- a/internal/integrationtest/trace_test.go +++ b/internal/integrationtest/trace_test.go @@ -70,36 +70,70 @@ func TestTraceAnthropic(t *testing.T) { cases := []struct { name string + fixture []byte streaming bool bedrock bool expect []expectTrace }{ { - name: "trace_anthr_non_streaming", - expect: expectNonStreaming, + name: "trace_anthr_non_streaming", + expect: expectNonStreaming, + fixture: fixtures.AntSingleBuiltinTool, }, { name: "trace_bedrock_non_streaming", bedrock: true, expect: expectNonStreaming, + fixture: fixtures.AntSingleBuiltinTool, }, { name: "trace_anthr_streaming", streaming: true, expect: expectStreaming, + fixture: fixtures.AntSingleBuiltinTool, }, { name: "trace_bedrock_streaming", streaming: true, bedrock: true, expect: expectStreaming, + fixture: fixtures.AntSingleBuiltinTool, + }, + { + name: "trace_multi_thinking_non_streaming", + fixture: fixtures.AntMultiThinkingBuiltinTool, + expect: []expectTrace{ + {"Intercept", 1, codes.Unset}, + {"Intercept.CreateInterceptor", 1, codes.Unset}, + {"Intercept.RecordInterception", 1, codes.Unset}, + {"Intercept.ProcessRequest", 1, codes.Unset}, + {"Intercept.RecordInterceptionEnded", 1, codes.Unset}, + {"Intercept.RecordPromptUsage", 
1, codes.Unset}, + {"Intercept.RecordTokenUsage", 1, codes.Unset}, + {"Intercept.RecordToolUsage", 1, codes.Unset}, + {"Intercept.RecordModelThought", 2, codes.Unset}, + {"Intercept.ProcessRequest.Upstream", 1, codes.Unset}, + }, + }, + { + name: "trace_multi_thinking_streaming", + fixture: fixtures.AntMultiThinkingBuiltinTool, + streaming: true, + expect: []expectTrace{ + {"Intercept", 1, codes.Unset}, + {"Intercept.CreateInterceptor", 1, codes.Unset}, + {"Intercept.RecordInterception", 1, codes.Unset}, + {"Intercept.ProcessRequest", 1, codes.Unset}, + {"Intercept.RecordInterceptionEnded", 1, codes.Unset}, + {"Intercept.RecordPromptUsage", 1, codes.Unset}, + {"Intercept.RecordTokenUsage", 2, codes.Unset}, + {"Intercept.RecordToolUsage", 1, codes.Unset}, + {"Intercept.RecordModelThought", 2, codes.Unset}, + {"Intercept.ProcessRequest.Upstream", 1, codes.Unset}, + }, }, } - fix := fixtures.Parse(t, fixtures.AntSingleBuiltinTool) - - fixtureReqBody := fix.Request() - for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { ctx, cancel := context.WithTimeout(t.Context(), time.Second*30) @@ -107,6 +141,7 @@ func TestTraceAnthropic(t *testing.T) { sr, tracer := setupTracer(t) + fix := fixtures.Parse(t, tc.fixture) upstream := newMockUpstream(t, ctx, newFixtureResponse(fix)) opts := []bridgeOption{ @@ -117,7 +152,7 @@ func TestTraceAnthropic(t *testing.T) { } bridgeServer := newBridgeTestServer(t, ctx, upstream.URL, opts...) 
- reqBody, err := sjson.SetBytes(fixtureReqBody, "stream", tc.streaming) + reqBody, err := sjson.SetBytes(fix.Request(), "stream", tc.streaming) require.NoError(t, err) resp := bridgeServer.makeRequest(t, http.MethodPost, pathAnthropicMessages, reqBody) require.Equal(t, http.StatusOK, resp.StatusCode) @@ -453,6 +488,42 @@ func TestTraceOpenAI(t *testing.T) { {"Intercept.ProcessRequest.Upstream", 1, codes.Unset}, }, }, + { + name: "trace_openai_responses_streaming_with_reasoning", + fixture: fixtures.OaiResponsesStreamingMultiReasoningBuiltinTool, + streaming: true, + path: pathOpenAIResponses, + expect: []expectTrace{ + {"Intercept", 1, codes.Unset}, + {"Intercept.CreateInterceptor", 1, codes.Unset}, + {"Intercept.RecordInterception", 1, codes.Unset}, + {"Intercept.ProcessRequest", 1, codes.Unset}, + {"Intercept.RecordInterceptionEnded", 1, codes.Unset}, + {"Intercept.RecordPromptUsage", 1, codes.Unset}, + {"Intercept.RecordTokenUsage", 1, codes.Unset}, + {"Intercept.RecordToolUsage", 1, codes.Unset}, + {"Intercept.RecordModelThought", 2, codes.Unset}, + {"Intercept.ProcessRequest.Upstream", 1, codes.Unset}, + }, + }, + { + name: "trace_openai_responses_blocking_with_reasoning", + fixture: fixtures.OaiResponsesBlockingMultiReasoningBuiltinTool, + streaming: false, + path: pathOpenAIResponses, + expect: []expectTrace{ + {"Intercept", 1, codes.Unset}, + {"Intercept.CreateInterceptor", 1, codes.Unset}, + {"Intercept.RecordInterception", 1, codes.Unset}, + {"Intercept.ProcessRequest", 1, codes.Unset}, + {"Intercept.RecordInterceptionEnded", 1, codes.Unset}, + {"Intercept.RecordPromptUsage", 1, codes.Unset}, + {"Intercept.RecordTokenUsage", 1, codes.Unset}, + {"Intercept.RecordToolUsage", 1, codes.Unset}, + {"Intercept.RecordModelThought", 2, codes.Unset}, + {"Intercept.ProcessRequest.Upstream", 1, codes.Unset}, + }, + }, } for _, tc := range cases { From 92078562c87329b4e51f327230b245cccca75265 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 16 Mar 2026 
16:56:41 +0200 Subject: [PATCH 14/14] chore: refactor model thought assertions into common func Signed-off-by: Danny Kopping --- internal/integrationtest/bridge_test.go | 69 +++++++++----------- internal/integrationtest/helpers.go | 10 +++ internal/integrationtest/responses_test.go | 73 +++++++--------------- internal/testutil/mock_recorder.go | 26 ++++++++ 4 files changed, 86 insertions(+), 92 deletions(-) diff --git a/internal/integrationtest/bridge_test.go b/internal/integrationtest/bridge_test.go index 9ceafe0d..a2a746e3 100644 --- a/internal/integrationtest/bridge_test.go +++ b/internal/integrationtest/bridge_test.go @@ -131,55 +131,67 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { name string streaming bool fixture []byte - expectedThoughts []string // nil means no model thoughts expected + expectedThoughts []recorder.ModelThoughtRecord // nil means no model thoughts expected }{ { name: "single thinking block/streaming", streaming: true, fixture: fixtures.AntSingleBuiltinTool, - expectedThoughts: []string{"The user wants me to read"}, + expectedThoughts: []recorder.ModelThoughtRecord{newModelThought("The user wants me to read", recorder.ThoughtSourceThinking)}, }, { name: "single thinking block/blocking", streaming: false, fixture: fixtures.AntSingleBuiltinTool, - expectedThoughts: []string{"The user wants me to read"}, + expectedThoughts: []recorder.ModelThoughtRecord{newModelThought("The user wants me to read", recorder.ThoughtSourceThinking)}, }, { - name: "multiple thinking blocks/streaming", - streaming: true, - fixture: fixtures.AntMultiThinkingBuiltinTool, - expectedThoughts: []string{"The user wants me to read", "I should use the Read tool"}, + name: "multiple thinking blocks/streaming", + streaming: true, + fixture: fixtures.AntMultiThinkingBuiltinTool, + expectedThoughts: []recorder.ModelThoughtRecord{ + newModelThought("The user wants me to read", recorder.ThoughtSourceThinking), + newModelThought("I should use the Read tool", 
recorder.ThoughtSourceThinking), + }, }, { - name: "multiple thinking blocks/blocking", - streaming: false, - fixture: fixtures.AntMultiThinkingBuiltinTool, - expectedThoughts: []string{"The user wants me to read", "I should use the Read tool"}, + name: "multiple thinking blocks/blocking", + streaming: false, + fixture: fixtures.AntMultiThinkingBuiltinTool, + expectedThoughts: []recorder.ModelThoughtRecord{ + newModelThought("The user wants me to read", recorder.ThoughtSourceThinking), + newModelThought("I should use the Read tool", recorder.ThoughtSourceThinking), + }, }, { name: "parallel tool calls/streaming", streaming: true, fixture: fixtures.AntSingleBuiltinToolParallel, - expectedThoughts: []string{"The user wants me to read two files"}, + expectedThoughts: []recorder.ModelThoughtRecord{newModelThought("The user wants me to read two files", recorder.ThoughtSourceThinking)}, }, { name: "parallel tool calls/blocking", streaming: false, fixture: fixtures.AntSingleBuiltinToolParallel, - expectedThoughts: []string{"The user wants me to read two files"}, + expectedThoughts: []recorder.ModelThoughtRecord{newModelThought("The user wants me to read two files", recorder.ThoughtSourceThinking)}, }, { name: "thoughts without tool calls/streaming", streaming: true, fixture: fixtures.AntSimple, - expectedThoughts: []string{"This is a classic philosophical question about medieval scholasticism"}, + expectedThoughts: []recorder.ModelThoughtRecord{newModelThought("This is a classic philosophical question about medieval scholasticism", recorder.ThoughtSourceThinking)}, }, { name: "thoughts without tool calls/blocking", streaming: false, fixture: fixtures.AntSimple, - expectedThoughts: []string{"This is a classic philosophical question about medieval scholasticism"}, + expectedThoughts: []recorder.ModelThoughtRecord{newModelThought("This is a classic philosophical question about medieval scholasticism", recorder.ThoughtSourceThinking)}, + }, + { + name: "no thoughts captured", 
+ streaming: false, + fixture: fixtures.AntSingleInjectedTool, + expectedThoughts: nil, }, } @@ -207,32 +219,7 @@ func TestAnthropicMessagesModelThoughts(t *testing.T) { assert.Contains(t, sp.AllEvents(), "message_stop") } - interceptions := bridgeServer.Recorder.RecordedInterceptions() - require.GreaterOrEqual(t, len(interceptions), 1) - - thoughts := bridgeServer.Recorder.RecordedModelThoughts() - if tc.expectedThoughts == nil { - assert.Empty(t, thoughts) - } else { - require.Len(t, thoughts, len(tc.expectedThoughts), "unexpected number of model thoughts") - - // We can't guarantee the order of model thoughts since they're recorded separately, so - // we have to scan all thoughts for a match. - - for _, expected := range tc.expectedThoughts { - var matched *aibridge.ModelThoughtRecord - for _, thought := range thoughts { - if strings.Contains(thought.Content, expected) { - matched = thought - } - } - - require.NotNil(t, matched, "could not find thought matching %q", expected) - require.Equal(t, interceptions[0].ID, matched.InterceptionID) - require.Equal(t, "thinking", matched.Metadata["source"]) - } - } - + bridgeServer.Recorder.VerifyModelThoughtsRecorded(t, tc.expectedThoughts) bridgeServer.Recorder.VerifyAllInterceptionsEnded(t) }) } diff --git a/internal/integrationtest/helpers.go b/internal/integrationtest/helpers.go index 84bd64d5..038e6335 100644 --- a/internal/integrationtest/helpers.go +++ b/internal/integrationtest/helpers.go @@ -6,6 +6,7 @@ import ( "cdr.dev/slog/v3" "cdr.dev/slog/v3/sloggers/slogtest" "github.com/coder/aibridge/config" + "github.com/coder/aibridge/recorder" ) // anthropicCfg creates a minimal Anthropic config for testing. 
@@ -53,3 +54,12 @@ func newLogger(t *testing.T) slog.Logger { t.Helper() return slogtest.Make(t, &slogtest.Options{}).Leveled(slog.LevelDebug) } + +func newModelThought(content, source string) recorder.ModelThoughtRecord { + return recorder.ModelThoughtRecord{ + Content: content, + Metadata: recorder.Metadata{ + "source": source, + }, + } +} diff --git a/internal/integrationtest/responses_test.go b/internal/integrationtest/responses_test.go index 358175a9..43e09023 100644 --- a/internal/integrationtest/responses_test.go +++ b/internal/integrationtest/responses_test.go @@ -10,7 +10,6 @@ import ( "net/http/httptest" "slices" "strconv" - "strings" "sync" "testing" "time" @@ -940,82 +939,77 @@ func TestResponsesInjectedTool(t *testing.T) { func TestResponsesModelThoughts(t *testing.T) { t.Parallel() - type expectedThought struct { - content string - source string // "reasoning_summary" or "commentary" - } - cases := []struct { name string fixture []byte - expectedThoughts []expectedThought // nil means no tool usages expected at all + expectedThoughts []recorder.ModelThoughtRecord // nil means no tool usages expected at all }{ { name: "single reasoning/blocking", fixture: fixtures.OaiResponsesBlockingSingleBuiltinTool, - expectedThoughts: []expectedThought{{content: "The user wants to add 3 and 5", source: "reasoning_summary"}}, + expectedThoughts: []recorder.ModelThoughtRecord{newModelThought("The user wants to add 3 and 5", recorder.ThoughtSourceReasoningSummary)}, }, { name: "single reasoning/streaming", fixture: fixtures.OaiResponsesStreamingBuiltinTool, - expectedThoughts: []expectedThought{{content: "The user wants to add 3 and 5", source: "reasoning_summary"}}, + expectedThoughts: []recorder.ModelThoughtRecord{newModelThought("The user wants to add 3 and 5", recorder.ThoughtSourceReasoningSummary)}, }, { name: "multiple reasoning items/blocking", fixture: fixtures.OaiResponsesBlockingMultiReasoningBuiltinTool, - expectedThoughts: []expectedThought{ - {content: 
"The user wants to add 3 and 5", source: "reasoning_summary"}, - {content: "After adding, I will check if the result is prime", source: "reasoning_summary"}, + expectedThoughts: []recorder.ModelThoughtRecord{ + newModelThought("The user wants to add 3 and 5", recorder.ThoughtSourceReasoningSummary), + newModelThought("After adding, I will check if the result is prime", recorder.ThoughtSourceReasoningSummary), }, }, { name: "multiple reasoning items/streaming", fixture: fixtures.OaiResponsesStreamingMultiReasoningBuiltinTool, - expectedThoughts: []expectedThought{ - {content: "The user wants to add 3 and 5", source: "reasoning_summary"}, - {content: "After adding, I will check if the result is prime", source: "reasoning_summary"}, + expectedThoughts: []recorder.ModelThoughtRecord{ + newModelThought("The user wants to add 3 and 5", recorder.ThoughtSourceReasoningSummary), + newModelThought("After adding, I will check if the result is prime", recorder.ThoughtSourceReasoningSummary), }, }, { name: "commentary/blocking", fixture: fixtures.OaiResponsesBlockingCommentaryBuiltinTool, - expectedThoughts: []expectedThought{{content: "Checking whether 3 + 5 is prime by calling the add function first.", source: "commentary"}}, + expectedThoughts: []recorder.ModelThoughtRecord{newModelThought("Checking whether 3 + 5 is prime by calling the add function first.", recorder.ThoughtSourceCommentary)}, }, { name: "commentary/streaming", fixture: fixtures.OaiResponsesStreamingCommentaryBuiltinTool, - expectedThoughts: []expectedThought{{content: "Checking whether 3 + 5 is prime by calling the add function first.", source: "commentary"}}, + expectedThoughts: []recorder.ModelThoughtRecord{newModelThought("Checking whether 3 + 5 is prime by calling the add function first.", recorder.ThoughtSourceCommentary)}, }, { name: "summary and commentary/blocking", fixture: fixtures.OaiResponsesBlockingSummaryAndCommentaryBuiltinTool, - expectedThoughts: []expectedThought{ - {content: "I need to 
add 3 and 5 to check primality.", source: "reasoning_summary"}, - {content: "Let me calculate the sum first using the add function.", source: "commentary"}, + expectedThoughts: []recorder.ModelThoughtRecord{ + newModelThought("I need to add 3 and 5 to check primality.", recorder.ThoughtSourceReasoningSummary), + newModelThought("Let me calculate the sum first using the add function.", recorder.ThoughtSourceCommentary), }, }, { name: "summary and commentary/streaming", fixture: fixtures.OaiResponsesStreamingSummaryAndCommentaryBuiltinTool, - expectedThoughts: []expectedThought{ - {content: "I need to add 3 and 5 to check primality.", source: "reasoning_summary"}, - {content: "Let me calculate the sum first using the add function.", source: "commentary"}, + expectedThoughts: []recorder.ModelThoughtRecord{ + newModelThought("I need to add 3 and 5 to check primality.", recorder.ThoughtSourceReasoningSummary), + newModelThought("Let me calculate the sum first using the add function.", recorder.ThoughtSourceCommentary), }, }, { name: "parallel tool calls/blocking", fixture: fixtures.OaiResponsesBlockingSingleBuiltinToolParallel, - expectedThoughts: []expectedThought{{content: "The user wants two additions", source: "reasoning_summary"}}, + expectedThoughts: []recorder.ModelThoughtRecord{newModelThought("The user wants two additions", recorder.ThoughtSourceReasoningSummary)}, }, { name: "parallel tool calls/streaming", fixture: fixtures.OaiResponsesStreamingSingleBuiltinToolParallel, - expectedThoughts: []expectedThought{{content: "The user wants two additions", source: "reasoning_summary"}}, + expectedThoughts: []recorder.ModelThoughtRecord{newModelThought("The user wants two additions", recorder.ThoughtSourceReasoningSummary)}, }, { name: "thoughts without tool calls", fixture: fixtures.OaiResponsesStreamingCodex, // This fixture contains reasoning, but it's not associated with tool calls. 
- expectedThoughts: []expectedThought{{content: "Preparing simple response", source: "reasoning_summary"}}, + expectedThoughts: []recorder.ModelThoughtRecord{newModelThought("Preparing simple response", recorder.ThoughtSourceReasoningSummary)}, }, } @@ -1037,31 +1031,8 @@ func TestResponsesModelThoughts(t *testing.T) { _, err := io.ReadAll(resp.Body) require.NoError(t, err) - interceptions := bridgeServer.Recorder.RecordedInterceptions() - require.GreaterOrEqual(t, len(interceptions), 1) - - thoughts := bridgeServer.Recorder.RecordedModelThoughts() - if tc.expectedThoughts == nil { - assert.Empty(t, thoughts) - } else { - require.Len(t, thoughts, len(tc.expectedThoughts), "unexpected number of model thoughts") - - // We can't guarantee the order of model thoughts since they're recorded separately, so - // we have to scan all thoughts for a match. - - for _, expected := range tc.expectedThoughts { - var matched *aibridge.ModelThoughtRecord - for _, thought := range thoughts { - if strings.Contains(thought.Content, expected.content) { - matched = thought - } - } - - require.NotNil(t, matched, "could not find thought matching %q", expected) - require.Equal(t, interceptions[0].ID, matched.InterceptionID) - require.Equal(t, expected.source, matched.Metadata["source"]) - } - } + bridgeServer.Recorder.VerifyModelThoughtsRecorded(t, tc.expectedThoughts) + bridgeServer.Recorder.VerifyAllInterceptionsEnded(t) }) } } diff --git a/internal/testutil/mock_recorder.go b/internal/testutil/mock_recorder.go index 5cd4420f..991d0904 100644 --- a/internal/testutil/mock_recorder.go +++ b/internal/testutil/mock_recorder.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "slices" + "strings" "sync" "testing" @@ -163,3 +164,28 @@ func (m *MockRecorder) VerifyAllInterceptionsEnded(t *testing.T) { require.Containsf(t, m.interceptionsEnd, intc.ID, "interception with id: %v has not been ended", intc.ID) } } + +func (m *MockRecorder) VerifyModelThoughtsRecorded(t *testing.T, expected 
[]recorder.ModelThoughtRecord) { + thoughts := m.RecordedModelThoughts() + if expected == nil { + require.Empty(t, thoughts) + return + } + + require.Len(t, thoughts, len(expected), "unexpected number of model thoughts") + + // We can't guarantee the order of model thoughts since they're recorded separately, so + // we have to scan all thoughts for a match. + + for _, exp := range expected { + var matched *recorder.ModelThoughtRecord + for _, thought := range thoughts { + if strings.Contains(thought.Content, exp.Content) { + matched = thought + } + } + + require.NotNil(t, matched, "could not find thought matching %q", exp.Content) + require.EqualValues(t, exp.Metadata, matched.Metadata) + } +}