Skip to content

Commit 01363cc

Browse files
fix(ai): redact message parts content of type blob (#5243)
Substitute blob data in AI client responses with a generic placeholder.
1 parent 543cf4a commit 01363cc

File tree

3 files changed

+300
-1
lines changed

3 files changed

+300
-1
lines changed

sentry_sdk/_types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77

88
SENSITIVE_DATA_SUBSTITUTE = "[Filtered]"
9+
BLOB_DATA_SUBSTITUTE = "[Blob substitute]"
910

1011

1112
class AnnotatedValue:

sentry_sdk/ai/utils.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from sys import getsizeof
66
from typing import TYPE_CHECKING
77

8+
from sentry_sdk._types import BLOB_DATA_SUBSTITUTE
9+
810
if TYPE_CHECKING:
911
from typing import Any, Callable, Dict, List, Optional, Tuple
1012

@@ -141,6 +143,85 @@ def _find_truncation_index(messages: "List[Dict[str, Any]]", max_bytes: int) ->
141143
return 0
142144

143145

146+
def _is_blob_part(part: "Any") -> bool:
    """Return True when *part* is a dict message part whose "type" is "blob"."""
    return isinstance(part, dict) and part.get("type") == "blob"


def _has_blob_part(message: "Any") -> bool:
    """Return True when *message* is a dict whose list-valued "content" holds a blob part."""
    if not isinstance(message, dict):
        return False
    content = message.get("content")
    if not isinstance(content, list):
        return False
    return any(_is_blob_part(part) for part in content)


def redact_blob_message_parts(
    messages: "List[Dict[str, Any]]",
) -> "List[Dict[str, Any]]":
    """
    Replace the content of blob message parts with BLOB_DATA_SUBSTITUTE.

    Only dict messages whose "content" is a list are inspected; within such a
    list, every dict part with "type" == "blob" gets its "content" replaced.
    Non-dict messages, non-list content and non-dict parts are left alone.

    The input is never mutated:

    - If no message contains a blob part, the very same ``messages`` list
      object is returned (no copy is made).
    - Otherwise the whole list is deep-copied and the redaction is applied to
      the copy, which is returned.

    e.g:
    {
        "role": "user",
        "content": [
            {
                "text": "How many ponies do you see in the image?",
                "type": "text"
            },
            {
                "type": "blob",
                "modality": "image",
                "mime_type": "image/jpeg",
                "content": "data:image/jpeg;base64,..."
            }
        ]
    }
    becomes:
    {
        "role": "user",
        "content": [
            {
                "text": "How many ponies do you see in the image?",
                "type": "text"
            },
            {
                "type": "blob",
                "modality": "image",
                "mime_type": "image/jpeg",
                "content": "[Blob substitute]"
            }
        ]
    }

    :param messages: Chat messages to scan for blob parts.
    :returns: ``messages`` itself when nothing needs redacting, otherwise a
        redacted deep copy.
    """
    # Cheap read-only scan first so the common blob-free case pays for no copy.
    if not any(_has_blob_part(message) for message in messages):
        return messages

    # Work on a deep copy so the caller's dictionaries are left untouched.
    redacted = deepcopy(messages)

    for message in redacted:
        if not isinstance(message, dict):
            continue

        content = message.get("content")
        if not isinstance(content, list):
            continue

        for part in content:
            if _is_blob_part(part):
                part["content"] = BLOB_DATA_SUBSTITUTE

    return redacted
223+
224+
144225
def truncate_messages_by_size(
145226
messages: "List[Dict[str, Any]]",
146227
max_bytes: int = MAX_GEN_AI_MESSAGE_BYTES,
@@ -186,6 +267,8 @@ def truncate_and_annotate_messages(
186267
if not messages:
187268
return None
188269

270+
messages = redact_blob_message_parts(messages)
271+
189272
truncated_messages, removed_count = truncate_messages_by_size(messages, max_bytes)
190273
if removed_count > 0:
191274
scope._gen_ai_original_message_count[span.span_id] = len(messages)

tests/test_ai_monitoring.py

Lines changed: 216 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@
44
import pytest
55

66
import sentry_sdk
7-
from sentry_sdk._types import AnnotatedValue
7+
from sentry_sdk._types import (
8+
AnnotatedValue,
9+
SENSITIVE_DATA_SUBSTITUTE,
10+
BLOB_DATA_SUBSTITUTE,
11+
)
812
from sentry_sdk.ai.monitoring import ai_track
913
from sentry_sdk.ai.utils import (
1014
MAX_GEN_AI_MESSAGE_BYTES,
@@ -13,6 +17,7 @@
1317
truncate_and_annotate_messages,
1418
truncate_messages_by_size,
1519
_find_truncation_index,
20+
redact_blob_message_parts,
1621
)
1722
from sentry_sdk.serializer import serialize
1823
from sentry_sdk.utils import safe_serialize
@@ -425,6 +430,49 @@ def __init__(self):
425430
assert isinstance(result, list)
426431
assert result[0] == large_messages[-len(result)]
427432

433+
def test_preserves_original_messages_with_blobs(self):
    """truncate_and_annotate_messages redacts blobs in its result, not in the caller's input"""

    class FakeSpan:
        # Minimal span stand-in: an id plus a dict-backed set_data().
        def __init__(self):
            self.span_id = "test_span_id"
            self.data = {}

        def set_data(self, key, value):
            self.data[key] = value

    class FakeScope:
        # Minimal scope stand-in carrying only the original-message-count map.
        def __init__(self):
            self._gen_ai_original_message_count = {}

    blob_payload = "data:image/jpeg;base64,original_content"
    text_part = {"text": "What's in this image?", "type": "text"}
    blob_part = {
        "type": "blob",
        "modality": "image",
        "content": blob_payload,
    }
    messages = [{"role": "user", "content": [text_part, blob_part]}]

    span = FakeSpan()
    scope = FakeScope()

    result = truncate_and_annotate_messages(messages, span, scope)

    # The caller's message still carries the raw blob payload ...
    assert messages[0]["content"][1]["content"] == blob_payload

    # ... while the returned messages carry the placeholder instead.
    assert result[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE
475+
428476

429477
class TestClientAnnotation:
430478
def test_client_wraps_truncated_messages_in_annotated_value(self, large_messages):
@@ -542,3 +590,170 @@ def __init__(self):
542590
assert isinstance(messages_value, AnnotatedValue)
543591
assert messages_value.metadata["len"] == stored_original_length
544592
assert len(messages_value.value) == len(truncated_messages)
593+
594+
595+
class TestRedactBlobMessageParts:
    """Behavioural tests for redact_blob_message_parts."""

    def test_redacts_single_blob_content(self):
        """A single blob part is replaced by the placeholder; the input is left untouched"""
        question = "How many ponies do you see in the image?"
        data_uri = "data:image/jpeg;base64,/9j/4AAQSkZJRg=="
        messages = [
            {
                "role": "user",
                "content": [
                    {"text": question, "type": "text"},
                    {
                        "type": "blob",
                        "modality": "image",
                        "mime_type": "image/jpeg",
                        "content": data_uri,
                    },
                ],
            }
        ]

        result = redact_blob_message_parts(messages)

        # Input keeps the raw payload.
        assert messages[0]["content"][1]["content"] == data_uri

        # Output: text part intact, blob metadata intact, blob payload replaced.
        parts = result[0]["content"]
        assert parts[0]["text"] == question
        assert parts[0]["type"] == "text"
        assert parts[1]["type"] == "blob"
        assert parts[1]["modality"] == "image"
        assert parts[1]["mime_type"] == "image/jpeg"
        assert parts[1]["content"] == BLOB_DATA_SUBSTITUTE

    def test_redacts_multiple_blob_parts(self):
        """Every blob part inside one message is redacted; the input is left untouched"""
        jpeg_uri = "data:image/jpeg;base64,first_image"
        png_uri = "data:image/png;base64,second_image"
        messages = [
            {
                "role": "user",
                "content": [
                    {"text": "Compare these images", "type": "text"},
                    {
                        "type": "blob",
                        "modality": "image",
                        "mime_type": "image/jpeg",
                        "content": jpeg_uri,
                    },
                    {
                        "type": "blob",
                        "modality": "image",
                        "mime_type": "image/png",
                        "content": png_uri,
                    },
                ],
            }
        ]

        result = redact_blob_message_parts(messages)

        # Input keeps both raw payloads.
        assert messages[0]["content"][1]["content"] == jpeg_uri
        assert messages[0]["content"][2]["content"] == png_uri

        # Output keeps the text and replaces both payloads.
        parts = result[0]["content"]
        assert parts[0]["text"] == "Compare these images"
        assert parts[1]["content"] == BLOB_DATA_SUBSTITUTE
        assert parts[2]["content"] == BLOB_DATA_SUBSTITUTE

    def test_redacts_blobs_in_multiple_messages(self):
        """Blob parts are redacted in every message that has them; the input is left untouched"""
        first_uri = "data:image/jpeg;base64,first"
        second_uri = "data:image/jpeg;base64,second"
        messages = [
            {
                "role": "user",
                "content": [
                    {"text": "First message", "type": "text"},
                    {"type": "blob", "modality": "image", "content": first_uri},
                ],
            },
            {"role": "assistant", "content": "I see the image."},
            {
                "role": "user",
                "content": [
                    {"text": "Second message", "type": "text"},
                    {"type": "blob", "modality": "image", "content": second_uri},
                ],
            },
        ]

        result = redact_blob_message_parts(messages)

        # Input keeps both raw payloads.
        assert messages[0]["content"][1]["content"] == first_uri
        assert messages[2]["content"][1]["content"] == second_uri

        # Output: blobs replaced, the plain-string message passes through as-is.
        assert result[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE
        assert result[1]["content"] == "I see the image."
        assert result[2]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE

    def test_no_blobs_returns_original_list(self):
        """Without any blob part the very same list object comes back (no copy made)"""
        messages = [
            {"role": "user", "content": "Simple text message"},
            {"role": "assistant", "content": "Simple response"},
        ]

        assert redact_blob_message_parts(messages) is messages

    def test_handles_non_dict_messages(self):
        """Entries that are not dicts are skipped without raising"""
        messages = ["string message", {"role": "user", "content": "text"}, None, 123]

        # Nothing to redact, so the identical list is expected back.
        assert redact_blob_message_parts(messages) is messages

    def test_handles_non_dict_content_items(self):
        """Content-list items that are not dicts are skipped without raising"""
        mixed_content = ["string item", {"text": "text item", "type": "text"}, None]
        messages = [{"role": "user", "content": mixed_content}]

        # Nothing to redact, so the identical list is expected back.
        assert redact_blob_message_parts(messages) is messages

0 commit comments

Comments
 (0)