From 1f32952d0066a9dc1ff1482cef48c3cbe0acb663 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Wed, 17 Dec 2025 10:45:45 +0100 Subject: [PATCH 1/8] fix(ai): redact message parts content of type blob --- sentry_sdk/ai/utils.py | 51 +++++++++++++++++ tests/test_ai_monitoring.py | 106 +++++++++++++++++++++++++++++++++++- 2 files changed, 156 insertions(+), 1 deletion(-) diff --git a/sentry_sdk/ai/utils.py b/sentry_sdk/ai/utils.py index 1d2b4483c9..73155b0305 100644 --- a/sentry_sdk/ai/utils.py +++ b/sentry_sdk/ai/utils.py @@ -5,6 +5,8 @@ from sys import getsizeof from typing import TYPE_CHECKING +from sentry_sdk._types import SENSITIVE_DATA_SUBSTITUTE + if TYPE_CHECKING: from typing import Any, Callable, Dict, List, Optional, Tuple @@ -141,6 +143,53 @@ def _find_truncation_index(messages: "List[Dict[str, Any]]", max_bytes: int) -> return 0 +def redact_blob_message_parts(messages): + # type: (List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], int] + """ + Redact blob message parts from the messages, by removing the "content" key. + e.g: + { + "role": "user", + "content": [ + { + "text": "How many ponies do you see in the image?", + "type": "text" + }, + { + "type": "blob", + "modality": "image", + "mime_type": "image/jpeg", + "content": "data:image/jpeg;base64,..." + } + ] + } + becomes: + { + "role": "user", + "content": [ + { + "text": "How many ponies do you see in the image?", + "type": "text" + }, + { + "type": "blob", + "modality": "image", + "mime_type": "image/jpeg", + "content": "[Filtered]" + } + ] + } + """ + + for message in messages: + content = message.get("content") + if isinstance(content, list): + for item in content: + if item.get("type") == "blob": + item["content"] = SENSITIVE_DATA_SUBSTITUTE + return messages + + def truncate_messages_by_size( messages: "List[Dict[str, Any]]", max_bytes: int = MAX_GEN_AI_MESSAGE_BYTES, @@ -186,6 +235,8 @@ def truncate_and_annotate_messages( if not messages: return None + messages = redact_blob_message_parts(messages) + truncated_messages, removed_count = truncate_messages_by_size(messages, max_bytes) if removed_count > 0: scope._gen_ai_original_message_count[span.span_id] = len(messages) diff --git a/tests/test_ai_monitoring.py b/tests/test_ai_monitoring.py index 8d3d4ba204..e9f3712cd3 100644 --- a/tests/test_ai_monitoring.py +++ b/tests/test_ai_monitoring.py @@ -4,7 +4,7 @@ import pytest import sentry_sdk -from sentry_sdk._types import AnnotatedValue +from sentry_sdk._types import AnnotatedValue, SENSITIVE_DATA_SUBSTITUTE from sentry_sdk.ai.monitoring import ai_track from sentry_sdk.ai.utils import ( MAX_GEN_AI_MESSAGE_BYTES, @@ -13,6 +13,7 @@ truncate_and_annotate_messages, truncate_messages_by_size, _find_truncation_index, + redact_blob_message_parts, ) from sentry_sdk.serializer import serialize from sentry_sdk.utils import safe_serialize @@ -542,3 +543,106 @@ def __init__(self): assert isinstance(messages_value, AnnotatedValue) assert messages_value.metadata["len"] == stored_original_length assert len(messages_value.value) == len(truncated_messages) + + +class TestRedactBlobMessageParts: + def test_redacts_single_blob_content(self): + """Test that blob content is redacted in a message with single blob part""" + messages = [ + { + "role": "user", + "content": [ + { + "text": "How many ponies do you see in the image?", + "type": "text", + }, + { + "type": "blob", + "modality": "image", + "mime_type": "image/jpeg", + "content": "data:image/jpeg;base64,/9j/4AAQSkZJRg==", + }, + ], + } + ] + + result = redact_blob_message_parts(messages) + + assert result == messages # Returns the same list + assert ( + messages[0]["content"][0]["text"] + == "How many ponies do you see in the image?" + ) + assert messages[0]["content"][0]["type"] == "text" + assert messages[0]["content"][1]["type"] == "blob" + assert messages[0]["content"][1]["modality"] == "image" + assert messages[0]["content"][1]["mime_type"] == "image/jpeg" + assert messages[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + + def test_redacts_multiple_blob_parts(self): + """Test that multiple blob parts in a single message are all redacted""" + messages = [ + { + "role": "user", + "content": [ + {"text": "Compare these images", "type": "text"}, + { + "type": "blob", + "modality": "image", + "mime_type": "image/jpeg", + "content": "data:image/jpeg;base64,first_image", + }, + { + "type": "blob", + "modality": "image", + "mime_type": "image/png", + "content": "data:image/png;base64,second_image", + }, + ], + } + ] + + result = redact_blob_message_parts(messages) + + assert result == messages + assert messages[0]["content"][0]["text"] == "Compare these images" + assert messages[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + assert messages[0]["content"][2]["content"] == SENSITIVE_DATA_SUBSTITUTE + + def test_redacts_blobs_in_multiple_messages(self): + """Test that blob parts are redacted across multiple messages""" + messages = [ + { + "role": "user", + "content": [ + {"text": "First message", "type": "text"}, + { + "type": "blob", + "modality": "image", + "content": "data:image/jpeg;base64,first", + }, + ], + }, + { + "role": "assistant", + "content": "I see the image.", + }, + { + "role": "user", + "content": [ + {"text": "Second message", "type": "text"}, + { + "type": "blob", + "modality": "image", + "content": "data:image/jpeg;base64,second", + }, + ], + }, + ] + + result = redact_blob_message_parts(messages) + + assert result == messages + assert messages[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE + assert messages[1]["content"] == "I see the image." # Unchanged + assert messages[2]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE From 795bcea241f7777e646a4da14c870a3049bdbe90 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Wed, 17 Dec 2025 11:05:04 +0100 Subject: [PATCH 2/8] fix(ai): skip non dict messages --- sentry_sdk/ai/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sentry_sdk/ai/utils.py b/sentry_sdk/ai/utils.py index 73155b0305..ae507e898b 100644 --- a/sentry_sdk/ai/utils.py +++ b/sentry_sdk/ai/utils.py @@ -182,6 +182,9 @@ def redact_blob_message_parts(messages): """ for message in messages: + if not isinstance(message, dict): + continue + content = message.get("content") if isinstance(content, list): for item in content: From a623e137d26e982c0d85258256c0ba013f9ecb24 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Wed, 17 Dec 2025 11:21:43 +0100 Subject: [PATCH 3/8] fix(ai): typing --- sentry_sdk/ai/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sentry_sdk/ai/utils.py b/sentry_sdk/ai/utils.py index ae507e898b..1b61c7a113 100644 --- a/sentry_sdk/ai/utils.py +++ b/sentry_sdk/ai/utils.py @@ -143,8 +143,9 @@ def _find_truncation_index(messages: "List[Dict[str, Any]]", max_bytes: int) -> return 0 -def redact_blob_message_parts(messages): - # type: (List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], int] +def redact_blob_message_parts( + messages: "List[Dict[str, Any]]", +) -> "List[Dict[str, Any]]": """ Redact blob message parts from the messages, by removing the "content" key. e.g: From 3d3ce5bbdca43f14194edbbbee11d3b6dcd6d8a3 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Wed, 17 Dec 2025 11:37:12 +0100 Subject: [PATCH 4/8] fix(ai): content items may not be dicts --- sentry_sdk/ai/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sentry_sdk/ai/utils.py b/sentry_sdk/ai/utils.py index 1b61c7a113..78a64ab737 100644 --- a/sentry_sdk/ai/utils.py +++ b/sentry_sdk/ai/utils.py @@ -189,7 +189,7 @@ def redact_blob_message_parts( content = message.get("content") if isinstance(content, list): for item in content: - if item.get("type") == "blob": + if isinstance(item, dict) and item.get("type") == "blob": item["content"] = SENSITIVE_DATA_SUBSTITUTE return messages From c606b66f1dbe62f3235f0b501c9250ba2b54632a Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Mon, 5 Jan 2026 20:15:27 +0100 Subject: [PATCH 5/8] fix(integrations): langchain add multimodal content transformation functions for images, audio, and files --- sentry_sdk/integrations/langchain.py | 122 ++++++++- .../integrations/langchain/test_langchain.py | 242 ++++++++++++++++++ 2 files changed, 363 insertions(+), 1 deletion(-) diff --git a/sentry_sdk/integrations/langchain.py b/sentry_sdk/integrations/langchain.py index 950f437d4c..51cce8942d 100644 --- a/sentry_sdk/integrations/langchain.py +++ b/sentry_sdk/integrations/langchain.py @@ -116,6 +116,124 @@ "top_p": SPANDATA.GEN_AI_REQUEST_TOP_P, } +# Map LangChain content types to Sentry modalities +LANGCHAIN_TYPE_TO_MODALITY = { + "image": "image", + "image_url": "image", + "audio": "audio", + "video": "video", + "file": "document", +} + + +def _transform_langchain_content_block( + content_block: "Dict[str, Any]", +) -> "Dict[str, Any]": + """ + Transform a LangChain content block to Sentry-compatible format. + + Handles multimodal content (images, audio, video, documents) by converting them + to the standardized format: + - base64 encoded data -> type: "blob" + - URL references -> type: "uri" + - file_id references -> type: "file" + """ + if not isinstance(content_block, dict): + return content_block + + block_type = content_block.get("type") + + # Handle standard multimodal content types (image, audio, video, file) + if block_type in ("image", "audio", "video", "file"): + modality = LANGCHAIN_TYPE_TO_MODALITY.get(block_type, block_type) + mime_type = content_block.get("mime_type", "") + + # Check for base64 encoded content + if "base64" in content_block: + return { + "type": "blob", + "modality": modality, + "mime_type": mime_type, + "content": content_block.get("base64", ""), + } + # Check for URL reference + elif "url" in content_block: + return { + "type": "uri", + "modality": modality, + "mime_type": mime_type, + "uri": content_block.get("url", ""), + } + # Check for file_id reference + elif "file_id" in content_block: + return { + "type": "file", + "modality": modality, + "mime_type": mime_type, + "file_id": content_block.get("file_id", ""), + } + + # Handle legacy image_url format (OpenAI style) + elif block_type == "image_url": + image_url_data = content_block.get("image_url", {}) + if isinstance(image_url_data, dict): + url = image_url_data.get("url", "") + else: + url = str(image_url_data) + + # Check if it's a data URI (base64 encoded) + if url.startswith("data:"): + # Parse data URI: data:mime_type;base64,content + try: + # Format: data:image/jpeg;base64,/9j/4AAQ... + header, content = url.split(",", 1) + mime_type = header.split(":")[1].split(";")[0] if ":" in header else "" + return { + "type": "blob", + "modality": "image", + "mime_type": mime_type, + "content": content, + } + except (ValueError, IndexError): + # If parsing fails, return as URI + return { + "type": "uri", + "modality": "image", + "mime_type": "", + "uri": url, + } + else: + # Regular URL + return { + "type": "uri", + "modality": "image", + "mime_type": "", + "uri": url, + } + + # For text blocks and other types, return as-is + return content_block + + +def _transform_langchain_message_content(content: "Any") -> "Any": + """ + Transform LangChain message content, handling both string content and + list of content blocks. + """ + if isinstance(content, str): + return content + + if isinstance(content, (list, tuple)): + transformed = [] + for block in content: + if isinstance(block, dict): + transformed.append(_transform_langchain_content_block(block)) + else: + transformed.append(block) + return transformed + + return content + # Contextvar to track agent names in a stack for re-entrant agent support _agent_stack: "contextvars.ContextVar[Optional[List[Optional[str]]]]" = ( @@ -234,7 +352,9 @@ def _handle_error(self, run_id: "UUID", error: "Any") -> None: del self.span_map[run_id] def _normalize_langchain_message(self, message: "BaseMessage") -> "Any": - parsed = {"role": message.type, "content": message.content} + # Transform content to handle multimodal data (images, audio, video, files) + transformed_content = _transform_langchain_message_content(message.content) + parsed = {"role": message.type, "content": transformed_content} parsed.update(message.additional_kwargs) return parsed diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py index 114e819bfb..07a37f2382 100644 --- a/tests/integrations/langchain/test_langchain.py +++ b/tests/integrations/langchain/test_langchain.py @@ -25,6 +25,8 @@ from sentry_sdk.integrations.langchain import ( LangchainIntegration, SentryLangchainCallback, + _transform_langchain_content_block, + _transform_langchain_message_content, ) try: @@ -1747,3 +1749,243 @@ def test_langchain_response_model_extraction( assert llm_span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == expected_model else: assert SPANDATA.GEN_AI_RESPONSE_MODEL not in llm_span.get("data", {}) + + +# Tests for multimodal content transformation functions + + +class TestTransformLangchainContentBlock: + """Tests for _transform_langchain_content_block function.""" + + def test_transform_image_base64(self): + """Test transformation of base64-encoded image content.""" + content_block = { + "type": "image", + "base64": "/9j/4AAQSkZJRgABAQAAAQABAAD...", + "mime_type": "image/jpeg", + } + result = _transform_langchain_content_block(content_block) + assert result == { + "type": "blob", + "modality": "image", + "mime_type": "image/jpeg", + "content": "/9j/4AAQSkZJRgABAQAAAQABAAD...", + } + + def test_transform_image_url(self): + """Test transformation of URL-referenced image content.""" + content_block = { + "type": "image", + "url": "https://example.com/image.jpg", + "mime_type": "image/jpeg", + } + result = _transform_langchain_content_block(content_block) + assert result == { + "type": "uri", + "modality": "image", + "mime_type": "image/jpeg", + "uri": "https://example.com/image.jpg", + } + + def test_transform_image_file_id(self): + """Test transformation of file_id-referenced image content.""" + content_block = { + "type": "image", + "file_id": "file-abc123", + "mime_type": "image/png", + } + result = _transform_langchain_content_block(content_block) + assert result == { + "type": "file", + "modality": "image", + "mime_type": "image/png", + "file_id": "file-abc123", + } + + def test_transform_image_url_legacy_with_data_uri(self): + """Test transformation of legacy image_url format with data: URI (base64).""" + content_block = { + "type": "image_url", + "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD"}, + } + result = _transform_langchain_content_block(content_block) + assert result == { + "type": "blob", + "modality": "image", + "mime_type": "image/jpeg", + "content": "/9j/4AAQSkZJRgABAQAAAQABAAD", + } + + def test_transform_image_url_legacy_with_http_url(self): + """Test transformation of legacy image_url format with HTTP URL.""" + content_block = { + "type": "image_url", + "image_url": {"url": "https://example.com/image.png"}, + } + result = _transform_langchain_content_block(content_block) + assert result == { + "type": "uri", + "modality": "image", + "mime_type": "", + "uri": "https://example.com/image.png", + } + + def test_transform_image_url_legacy_string_url(self): + """Test transformation of legacy image_url format with string URL.""" + content_block = { + "type": "image_url", + "image_url": "https://example.com/image.gif", + } + result = _transform_langchain_content_block(content_block) + assert result == { + "type": "uri", + "modality": "image", + "mime_type": "", + "uri": "https://example.com/image.gif", + } + + def test_transform_image_url_legacy_data_uri_png(self): + """Test transformation of legacy image_url format with PNG data URI.""" + content_block = { + "type": "image_url", + "image_url": { + "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==" + }, + } + result = _transform_langchain_content_block(content_block) + assert result == { + "type": "blob", + "modality": "image", + "mime_type": "image/png", + "content": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==", + } + + def test_transform_missing_mime_type(self): + """Test transformation when mime_type is not provided.""" + content_block = { + "type": "image", + "base64": "/9j/4AAQSkZJRgABAQAAAQABAAD...", + } + result = _transform_langchain_content_block(content_block) + assert result == { + "type": "blob", + "modality": "image", + "mime_type": "", + "content": "/9j/4AAQSkZJRgABAQAAAQABAAD...", + } + + +class TestTransformLangchainMessageContent: + """Tests for _transform_langchain_message_content function.""" + + def test_transform_string_content(self): + """Test that string content is returned unchanged.""" + result = _transform_langchain_message_content("Hello, world!") + assert result == "Hello, world!" + + def test_transform_list_with_text_blocks(self): + """Test transformation of list with text blocks (unchanged).""" + content = [ + {"type": "text", "text": "First message"}, + {"type": "text", "text": "Second message"}, + ] + result = _transform_langchain_message_content(content) + assert result == content + + def test_transform_list_with_image_blocks(self): + """Test transformation of list containing image blocks.""" + content = [ + {"type": "text", "text": "Check out this image:"}, + { + "type": "image", + "base64": "/9j/4AAQSkZJRgABAQAAAQABAAD...", + "mime_type": "image/jpeg", + }, + ] + result = _transform_langchain_message_content(content) + assert len(result) == 2 + assert result[0] == {"type": "text", "text": "Check out this image:"} + assert result[1] == { + "type": "blob", + "modality": "image", + "mime_type": "image/jpeg", + "content": "/9j/4AAQSkZJRgABAQAAAQABAAD...", + } + + def test_transform_list_with_mixed_content(self): + """Test transformation of list with mixed content types.""" + content = [ + {"type": "text", "text": "Here are some files:"}, + { + "type": "image", + "url": "https://example.com/image.jpg", + "mime_type": "image/jpeg", + }, + { + "type": "file", + "file_id": "doc-123", + "mime_type": "application/pdf", + }, + {"type": "audio", "base64": "audio_data...", "mime_type": "audio/mp3"}, + ] + result = _transform_langchain_message_content(content) + assert len(result) == 4 + assert result[0] == {"type": "text", "text": "Here are some files:"} + assert result[1] == { + "type": "uri", + "modality": "image", + "mime_type": "image/jpeg", + "uri": "https://example.com/image.jpg", + } + assert result[2] == { + "type": "file", + "modality": "document", + "mime_type": "application/pdf", + "file_id": "doc-123", + } + assert result[3] == { + "type": "blob", + "modality": "audio", + "mime_type": "audio/mp3", + "content": "audio_data...", + } + + def test_transform_list_with_non_dict_items(self): + """Test transformation handles non-dict items in list.""" + content = ["plain string", {"type": "text", "text": "dict text"}] + result = _transform_langchain_message_content(content) + assert result == ["plain string", {"type": "text", "text": "dict text"}] + + def test_transform_tuple_content(self): + """Test transformation of tuple content.""" + content = ( + {"type": "text", "text": "Message"}, + {"type": "image", "base64": "data...", "mime_type": "image/png"}, + ) + result = _transform_langchain_message_content(content) + assert len(result) == 2 + assert result[1] == { + "type": "blob", + "modality": "image", + "mime_type": "image/png", + "content": "data...", + } + + def test_transform_list_with_legacy_image_url(self): + """Test transformation of list containing legacy image_url blocks.""" + content = [ + {"type": "text", "text": "Check this:"}, + { + "type": "image_url", + "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQ..."}, + }, + ] + result = _transform_langchain_message_content(content) + assert len(result) == 2 + assert result[0] == {"type": "text", "text": "Check this:"} + assert result[1] == { + "type": "blob", + "modality": "image", + "mime_type": "image/jpeg", + "content": "/9j/4AAQ...", + } From c650799f0b7de741cd77811732644aaa2d722686 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Thu, 8 Jan 2026 14:22:59 +0100 Subject: [PATCH 6/8] fix(integrations): ensure URL check for data URIs handles empty strings --- sentry_sdk/integrations/langchain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sentry_sdk/integrations/langchain.py b/sentry_sdk/integrations/langchain.py index 51cce8942d..1b9389c23a 100644 --- a/sentry_sdk/integrations/langchain.py +++ b/sentry_sdk/integrations/langchain.py @@ -182,7 +182,7 @@ def _transform_langchain_content_block( url = str(image_url_data) # Check if it's a data URI (base64 encoded) - if url.startswith("data:"): + if url and url.startswith("data:"): # Parse data URI: data:mime_type;base64,content try: # Format: data:image/jpeg;base64,/9j/4AAQ... From 510e2ed206be5a01667bdd03719fa2ee7be45876 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Wed, 14 Jan 2026 14:22:00 +0100 Subject: [PATCH 7/8] fix(integrations): Langchain: Handle Anthropic and Google provider-native content formats --- sentry_sdk/integrations/langchain.py | 49 +++++++++++ .../integrations/langchain/test_langchain.py | 86 +++++++++++++++++++ 2 files changed, 135 insertions(+) diff --git a/sentry_sdk/integrations/langchain.py b/sentry_sdk/integrations/langchain.py index 1b9389c23a..68f5d0ad95 100644 --- a/sentry_sdk/integrations/langchain.py +++ b/sentry_sdk/integrations/langchain.py @@ -137,6 +137,12 @@ def _transform_langchain_content_block( - base64 encoded data -> type: "blob" - URL references -> type: "uri" - file_id references -> type: "file" + + Supports multiple content block formats: + - LangChain standard: type + base64/url/file_id fields + - OpenAI legacy: image_url with nested url field + - Anthropic: type + source dict with type/media_type/data or url + - Google: inline_data or file_data dicts """ if not isinstance(content_block, dict): return content_block @@ -172,6 +178,27 @@ def _transform_langchain_content_block( "mime_type": mime_type, "file_id": content_block.get("file_id", ""), } + # Handle Anthropic-style format with nested "source" dict + elif "source" in content_block: + source = content_block.get("source", {}) + if isinstance(source, dict): + source_type = source.get("type") + media_type = source.get("media_type", "") or mime_type + + if source_type == "base64": + return { + "type": "blob", + "modality": modality, + "mime_type": media_type, + "content": source.get("data", ""), + } + elif source_type == "url": + return { + "type": "uri", + "modality": modality, + "mime_type": media_type, + "uri": source.get("url", ""), + } # Handle legacy image_url format (OpenAI style) elif block_type == "image_url": @@ -211,6 +238,28 @@ def _transform_langchain_content_block( "uri": url, } + # Handle Google-style inline_data format + if "inline_data" in content_block: + inline_data = content_block.get("inline_data", {}) + if isinstance(inline_data, dict): + return { + "type": "blob", + "modality": "image", + "mime_type": inline_data.get("mime_type", ""), + "content": inline_data.get("data", ""), + } + + # Handle Google-style file_data format + if "file_data" in content_block: + file_data = content_block.get("file_data", {}) + if isinstance(file_data, dict): + return { + "type": "uri", + "modality": "image", + "mime_type": file_data.get("mime_type", ""), + "uri": file_data.get("file_uri", ""), + } + # For text blocks and other types, return as-is return content_block diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py index 07a37f2382..de5f5841ca 100644 --- a/tests/integrations/langchain/test_langchain.py +++ b/tests/integrations/langchain/test_langchain.py @@ -1874,6 +1874,92 @@ def test_transform_missing_mime_type(self): "content": "/9j/4AAQSkZJRgABAQAAAQABAAD...", } + def test_transform_anthropic_source_base64(self): + """Test transformation of Anthropic-style image with base64 source.""" + content_block = { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": "iVBORw0KGgoAAAANSUhEUgAAAAE...", + }, + } + result = _transform_langchain_content_block(content_block) + assert result == { + "type": "blob", + "modality": "image", + "mime_type": "image/png", + "content": "iVBORw0KGgoAAAANSUhEUgAAAAE...", + } + + def test_transform_anthropic_source_url(self): + """Test transformation of Anthropic-style image with URL source.""" + content_block = { + "type": "image", + "source": { + "type": "url", + "media_type": "image/jpeg", + "url": "https://example.com/image.jpg", + }, + } + result = _transform_langchain_content_block(content_block) + assert result == { + "type": "uri", + "modality": "image", + "mime_type": "image/jpeg", + "uri": "https://example.com/image.jpg", + } + + def test_transform_anthropic_source_without_media_type(self): + """Test transformation of Anthropic-style image without media_type falls back to mime_type.""" + content_block = { + "type": "image", + "mime_type": "image/webp", + "source": { + "type": "base64", + "data": "UklGRh4AAABXRUJQVlA4IBIAAAAwAQCdASoBAAEAAQAcJYgCdAEO", + }, + } + result = _transform_langchain_content_block(content_block) + assert result == { + "type": "blob", + "modality": "image", + "mime_type": "image/webp", + "content": "UklGRh4AAABXRUJQVlA4IBIAAAAwAQCdASoBAAEAAQAcJYgCdAEO", + } + + def test_transform_google_inline_data(self): + """Test transformation of Google-style inline_data format.""" + content_block = { + "inline_data": { + "mime_type": "image/jpeg", + "data": "/9j/4AAQSkZJRgABAQAAAQABAAD...", + } + } + result = _transform_langchain_content_block(content_block) + assert result == { + "type": "blob", + "modality": "image", + "mime_type": "image/jpeg", + "content": "/9j/4AAQSkZJRgABAQAAAQABAAD...", + } + + def test_transform_google_file_data(self): + """Test transformation of Google-style file_data format.""" + content_block = { + "file_data": { + "mime_type": "image/png", + "file_uri": "gs://bucket/path/to/image.png", + } + } + result = _transform_langchain_content_block(content_block) + assert result == { + "type": "uri", + "modality": "image", + "mime_type": "image/png", + "uri": "gs://bucket/path/to/image.png", + } + class TestTransformLangchainMessageContent: """Tests for _transform_langchain_message_content function.""" From 1764e571247a12963148b7bbecec37a6b23bfb4e Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Wed, 14 Jan 2026 16:51:10 +0100 Subject: [PATCH 8/8] fix(integrations): Use correct modality for Google-style content formats and use common function for data URI parsing --- sentry_sdk/integrations/langchain.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/sentry_sdk/integrations/langchain.py b/sentry_sdk/integrations/langchain.py index 68f5d0ad95..f29dfbe870 100644 --- a/sentry_sdk/integrations/langchain.py +++ b/sentry_sdk/integrations/langchain.py @@ -12,6 +12,7 @@ GEN_AI_ALLOWED_MESSAGE_ROLES, get_start_span_function, normalize_message_roles, + parse_data_uri, set_data_normalized, truncate_and_annotate_messages, ) @@ -199,6 +200,26 @@ def _transform_langchain_content_block( "mime_type": media_type, "uri": source.get("url", ""), } + # Handle Google-style inline_data format with standard type + elif "inline_data" in content_block: + inline_data = content_block.get("inline_data", {}) + if isinstance(inline_data, dict): + return { + "type": "blob", + "modality": modality, + "mime_type": inline_data.get("mime_type", "") or mime_type, + "content": inline_data.get("data", ""), + } + # Handle Google-style file_data format with standard type + elif "file_data" in content_block: + file_data = content_block.get("file_data", {}) + if isinstance(file_data, dict): + return { + "type": "uri", + "modality": modality, + "mime_type": file_data.get("mime_type", "") or mime_type, + "uri": file_data.get("file_uri", ""), + } # Handle legacy image_url format (OpenAI style) elif block_type == "image_url": @@ -210,18 +231,15 @@ def _transform_langchain_content_block( # Check if it's a data URI (base64 encoded) if url and url.startswith("data:"): - # Parse data URI: data:mime_type;base64,content try: - # Format: data:image/jpeg;base64,/9j/4AAQ... - header, content = url.split(",", 1) - mime_type = header.split(":")[1].split(";")[0] if ":" in header else "" + mime_type, content = parse_data_uri(url) return { "type": "blob", "modality": "image", "mime_type": mime_type, "content": content, } - except (ValueError, IndexError): + except ValueError: # If parsing fails, return as URI return { "type": "uri",