feat(ai): implement parse_data_uri function and integrate it into OpenAI message handling

constantinius · constantinius · commit c1a2239c7946 · 2026-01-08T15:19:04.000+01:00
diff --git a/sentry_sdk/ai/utils.py b/sentry_sdk/ai/utils.py
@@ -40,6 +40,39 @@ class GEN_AI_ALLOWED_MESSAGE_ROLES:
         GEN_AI_MESSAGE_ROLE_MAPPING[source_role] = target_role
 
 
+def parse_data_uri(url):
+    # type: (str) -> Tuple[str, str]
+    """
+    Split a data URI into its media type and its payload.
+
+    Layout per RFC 2397: data:[<mediatype>][;base64],<data>
+
+    The payload is returned exactly as written in the URI; it is neither
+    base64-decoded nor percent-decoded. Media type parameters such as
+    ";charset=utf-8" or ";base64" are dropped from the returned mime type.
+
+    Examples:
+        data:image/jpeg;base64,/9j/4AAQ... → ("image/jpeg", "/9j/4AAQ...")
+        data:text/html;charset=utf-8,<p>Hi</p> → ("text/html", "<p>Hi</p>")
+        data:,Hello → ("", "Hello")
+
+    Raises:
+        ValueError: If the URL contains no comma separator
+    """
+    # Only the first comma delimits header and payload; the payload itself
+    # may contain further commas.
+    header, comma, payload = url.partition(",")
+    if not comma:
+        raise ValueError("Invalid data URI: missing comma separator")
+
+    # A header lacking the "data:" scheme prefix is tolerated and used as-is.
+    media_type = header[5:] if header.startswith("data:") else header
+
+    # Everything after the first semicolon is a parameter, not the mime type.
+    mime_type = media_type.partition(";")[0]
+    return mime_type, payload
+
+
 def _normalize_data(data: "Any", unpack: bool = True) -> "Any":
     # convert pydantic data (e.g. OpenAI v1+) to json compatible format
     if hasattr(data, "model_dump"):
diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py
@@ -6,6 +6,7 @@
 from sentry_sdk.ai.utils import (
     set_data_normalized,
     normalize_message_roles,
+    parse_data_uri,
     truncate_and_annotate_messages,
 )
 from sentry_sdk.consts import SPANDATA
@@ -218,21 +219,33 @@ def _convert_message_parts(messages: "List[Dict[str, Any]]") -> "List[Dict[str,
     def _map_item(item: "Dict[str, Any]") -> "Dict[str, Any]":
         if item.get("type") == "image_url":
             image_url = item.get("image_url") or {}
-            if image_url.get("url", "").startswith("data:"):
-                return {
-                    "type": "blob",
-                    "modality": "image",
-                    "mime_type": item["image_url"]["url"].split(";base64,")[0],
-                    "content": item["image_url"]["url"].split(";base64,")[1],
-                }
+            url = image_url.get("url", "")
+            if url.startswith("data:"):
+                try:
+                    mime_type, content = parse_data_uri(url)
+                    return {
+                        "type": "blob",
+                        "modality": "image",
+                        "mime_type": mime_type,
+                        "content": content,
+                    }
+                except ValueError:
+                    # If parsing fails, return as URI
+                    return {
+                        "type": "uri",
+                        "modality": "image",
+                        "uri": url,
+                    }
             else:
                 return {
                     "type": "uri",
-                    "uri": item["image_url"]["url"],
+                    "uri": url,
                 }
         return item
 
     for message in messages:
+        if not isinstance(message, dict):
+            continue
         content = message.get("content")
         if isinstance(content, list):
             message["content"] = [_map_item(item) for item in content]
diff --git a/sentry_sdk/integrations/openai_agents/utils.py b/sentry_sdk/integrations/openai_agents/utils.py
@@ -2,6 +2,7 @@
 from sentry_sdk.ai.utils import (
     GEN_AI_ALLOWED_MESSAGE_ROLES,
     normalize_message_roles,
+    parse_data_uri,
     set_data_normalized,
     normalize_message_role,
     truncate_and_annotate_messages,
@@ -66,17 +67,15 @@ def _transform_openai_agents_content_part(
             url = content_part.get("image_url", "")
 
         if url.startswith("data:"):
-            # Parse data URI: data:image/jpeg;base64,/9j/4AAQ...
             try:
-                header, content = url.split(",", 1)
-                mime_type = header.split(":")[1].split(";")[0] if ":" in header else ""
+                mime_type, content = parse_data_uri(url)
                 return {
                     "type": "blob",
                     "modality": "image",
                     "mime_type": mime_type,
                     "content": content,
                 }
-            except (ValueError, IndexError):
+            except ValueError:
                 # If parsing fails, return as URI
                 return {
                     "type": "uri",
diff --git a/tests/integrations/openai/test_openai.py b/tests/integrations/openai/test_openai.py
@@ -1548,7 +1548,7 @@ def test_convert_message_parts_image_url_to_blob():
     blob_item = converted[0]["content"][1]
     assert blob_item["type"] == "blob"
     assert blob_item["modality"] == "image"
-    assert blob_item["mime_type"] == "data:image/jpeg"
+    assert blob_item["mime_type"] == "image/jpeg"
     assert blob_item["content"] == "/9j/4AAQSkZJRg=="
     # Verify the original image_url structure is replaced
     assert "image_url" not in blob_item
@@ -1581,6 +1581,34 @@ def test_convert_message_parts_image_url_to_uri():
     assert "image_url" not in uri_item
 
 
+def test_convert_message_parts_malformed_data_uri():
+    """A data URI without a comma separator falls back to a "uri" part."""
+    # Malformed on purpose: no ";base64," marker and no comma separator.
+    malformed_url = "data:image/jpeg"
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": malformed_url},
+                },
+            ],
+        }
+    ]
+
+    # The conversion itself must not raise.
+    result = _convert_message_parts(messages)
+
+    # One message in, one message out.
+    assert len(result) == 1
+    # The unparseable URL is kept verbatim and still tagged as an image.
+    fallback = result[0]["content"][0]
+    assert fallback["type"] == "uri"
+    assert fallback["uri"] == malformed_url
+    assert fallback["modality"] == "image"
+
+
 def test_openai_message_truncation(sentry_init, capture_events):
     """Test that large messages are truncated properly in OpenAI integration."""
     sentry_init(
diff --git a/tests/test_ai_monitoring.py b/tests/test_ai_monitoring.py
@@ -13,6 +13,7 @@
     truncate_and_annotate_messages,
     truncate_messages_by_size,
     _find_truncation_index,
+    parse_data_uri,
     redact_blob_message_parts,
 )
 from sentry_sdk.serializer import serialize
@@ -646,3 +647,69 @@ def test_redacts_blobs_in_multiple_messages(self):
         assert messages[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE
         assert messages[1]["content"] == "I see the image."  # Unchanged
         assert messages[2]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE
+
+
+class TestParseDataUri:
+    """Exercise parse_data_uri across well-formed and malformed data URIs."""
+
+    def test_standard_base64_image(self):
+        """A base64 JPEG URI splits into its mime type and raw payload."""
+        uri = "data:image/jpeg;base64,/9j/4AAQSkZJRg=="
+        mime, payload = parse_data_uri(uri)
+        assert mime == "image/jpeg"
+        assert payload == "/9j/4AAQSkZJRg=="
+
+    def test_png_image(self):
+        """A base64 PNG URI splits into its mime type and raw payload."""
+        uri = "data:image/png;base64,iVBORw0KGgo="
+        mime, payload = parse_data_uri(uri)
+        assert mime == "image/png"
+        assert payload == "iVBORw0KGgo="
+
+    def test_plain_text_without_base64(self):
+        """Without ";base64" the payload is returned still percent-encoded."""
+        uri = "data:text/plain,Hello%20World"
+        mime, payload = parse_data_uri(uri)
+        assert mime == "text/plain"
+        assert payload == "Hello%20World"
+
+    def test_no_mime_type_with_base64(self):
+        """An omitted media type yields an empty mime type string."""
+        uri = "data:;base64,SGVsbG8="
+        mime, payload = parse_data_uri(uri)
+        assert mime == ""
+        assert payload == "SGVsbG8="
+
+    def test_no_mime_type_no_base64(self):
+        """The minimal form "data:,<data>" yields an empty mime type."""
+        uri = "data:,Hello"
+        mime, payload = parse_data_uri(uri)
+        assert mime == ""
+        assert payload == "Hello"
+
+    def test_content_with_commas(self):
+        """Only the first comma separates the header from the payload."""
+        uri = "data:text/csv,a,b,c,d"
+        mime, payload = parse_data_uri(uri)
+        assert mime == "text/csv"
+        assert payload == "a,b,c,d"
+
+    def test_missing_comma_raises_value_error(self):
+        """A URI with no comma at all is rejected with ValueError."""
+        uri = "data:image/jpeg"
+        with pytest.raises(ValueError, match="missing comma separator"):
+            parse_data_uri(uri)
+
+    def test_empty_content(self):
+        """A comma with nothing after it yields an empty payload."""
+        uri = "data:text/plain,"
+        mime, payload = parse_data_uri(uri)
+        assert mime == "text/plain"
+        assert payload == ""
+
+    def test_mime_type_with_charset(self):
+        """Parameters such as charset are stripped from the mime type."""
+        uri = "data:text/html;charset=utf-8,<h1>Hello</h1>"
+        mime, payload = parse_data_uri(uri)
+        assert mime == "text/html"
+        assert payload == "<h1>Hello</h1>"