Skip to content

Commit 20367ba

Browse files
committed
refactor(langchain): Use shared transform_content_part from ai/utils
Reduce the local _transform_langchain_content_block function to a thin wrapper around the shared transform_content_part function, and remove the now-unused _get_modality_from_mime_type helper and LANGCHAIN_TYPE_TO_MODALITY mapping. This removes ~170 lines of duplicated code.
1 parent c45da0b commit 20367ba

File tree

2 files changed

+11
-181
lines changed

2 files changed

+11
-181
lines changed

sentry_sdk/integrations/langchain.py

Lines changed: 7 additions & 178 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
GEN_AI_ALLOWED_MESSAGE_ROLES,
1313
get_start_span_function,
1414
normalize_message_roles,
15-
parse_data_uri,
1615
set_data_normalized,
1716
truncate_and_annotate_messages,
17+
transform_content_part,
1818
)
1919
from sentry_sdk.consts import OP, SPANDATA
2020
from sentry_sdk.integrations import DidNotEnable, Integration
@@ -117,189 +117,18 @@
117117
"top_p": SPANDATA.GEN_AI_REQUEST_TOP_P,
118118
}
119119

120-
# Map LangChain content types to Sentry modalities
121-
LANGCHAIN_TYPE_TO_MODALITY = {
122-
"image": "image",
123-
"image_url": "image",
124-
"audio": "audio",
125-
"video": "video",
126-
"file": "document",
127-
}
128-
129-
130-
def _get_modality_from_mime_type(mime_type: str) -> str:
131-
"""Infer the content modality from a MIME type string."""
132-
if not mime_type:
133-
return "image" # Default fallback
134-
135-
mime_lower = mime_type.lower()
136-
if mime_lower.startswith("image/"):
137-
return "image"
138-
elif mime_lower.startswith("audio/"):
139-
return "audio"
140-
elif mime_lower.startswith("video/"):
141-
return "video"
142-
elif mime_lower.startswith("application/") or mime_lower.startswith("text/"):
143-
return "document"
144-
else:
145-
return "image" # Default fallback for unknown types
146-
147120

148121
def _transform_langchain_content_block(
149122
content_block: "Dict[str, Any]",
150123
) -> "Dict[str, Any]":
151124
"""
152-
Transform a LangChain content block to Sentry-compatible format.
153-
154-
Handles multimodal content (images, audio, video, documents) by converting them
155-
to the standardized format:
156-
- base64 encoded data -> type: "blob"
157-
- URL references -> type: "uri"
158-
- file_id references -> type: "file"
159-
160-
Supports multiple content block formats:
161-
- LangChain standard: type + base64/url/file_id fields
162-
- OpenAI legacy: image_url with nested url field
163-
- Anthropic: type + source dict with type/media_type/data or url
164-
- Google: inline_data or file_data dicts
165-
"""
166-
if not isinstance(content_block, dict):
167-
return content_block
168-
169-
block_type = content_block.get("type")
170-
171-
# Handle standard multimodal content types (image, audio, video, file)
172-
if block_type in ("image", "audio", "video", "file"):
173-
modality = LANGCHAIN_TYPE_TO_MODALITY.get(block_type, block_type)
174-
mime_type = content_block.get("mime_type", "")
175-
176-
# Check for base64 encoded content
177-
if "base64" in content_block:
178-
return {
179-
"type": "blob",
180-
"modality": modality,
181-
"mime_type": mime_type,
182-
"content": content_block.get("base64", ""),
183-
}
184-
# Check for URL reference
185-
elif "url" in content_block:
186-
return {
187-
"type": "uri",
188-
"modality": modality,
189-
"mime_type": mime_type,
190-
"uri": content_block.get("url", ""),
191-
}
192-
# Check for file_id reference
193-
elif "file_id" in content_block:
194-
return {
195-
"type": "file",
196-
"modality": modality,
197-
"mime_type": mime_type,
198-
"file_id": content_block.get("file_id", ""),
199-
}
200-
# Handle Anthropic-style format with nested "source" dict
201-
elif "source" in content_block:
202-
source = content_block.get("source", {})
203-
if isinstance(source, dict):
204-
source_type = source.get("type")
205-
media_type = source.get("media_type", "") or mime_type
206-
207-
if source_type == "base64":
208-
return {
209-
"type": "blob",
210-
"modality": modality,
211-
"mime_type": media_type,
212-
"content": source.get("data", ""),
213-
}
214-
elif source_type == "url":
215-
return {
216-
"type": "uri",
217-
"modality": modality,
218-
"mime_type": media_type,
219-
"uri": source.get("url", ""),
220-
}
221-
# Handle Google-style inline_data format with standard type
222-
elif "inline_data" in content_block:
223-
inline_data = content_block.get("inline_data", {})
224-
if isinstance(inline_data, dict):
225-
return {
226-
"type": "blob",
227-
"modality": modality,
228-
"mime_type": inline_data.get("mime_type", "") or mime_type,
229-
"content": inline_data.get("data", ""),
230-
}
231-
# Handle Google-style file_data format with standard type
232-
elif "file_data" in content_block:
233-
file_data = content_block.get("file_data", {})
234-
if isinstance(file_data, dict):
235-
return {
236-
"type": "uri",
237-
"modality": modality,
238-
"mime_type": file_data.get("mime_type", "") or mime_type,
239-
"uri": file_data.get("file_uri", ""),
240-
}
241-
242-
# Handle legacy image_url format (OpenAI style)
243-
elif block_type == "image_url":
244-
image_url_data = content_block.get("image_url", {})
245-
if isinstance(image_url_data, dict):
246-
url = image_url_data.get("url", "")
247-
else:
248-
url = str(image_url_data)
125+
Transform a LangChain content block using the shared transform_content_part function.
249126
250-
# Check if it's a data URI (base64 encoded)
251-
if url and url.startswith("data:"):
252-
try:
253-
mime_type, content = parse_data_uri(url)
254-
return {
255-
"type": "blob",
256-
"modality": "image",
257-
"mime_type": mime_type,
258-
"content": content,
259-
}
260-
except ValueError:
261-
# If parsing fails, return as URI
262-
return {
263-
"type": "uri",
264-
"modality": "image",
265-
"mime_type": "",
266-
"uri": url,
267-
}
268-
else:
269-
# Regular URL
270-
return {
271-
"type": "uri",
272-
"modality": "image",
273-
"mime_type": "",
274-
"uri": url,
275-
}
276-
277-
# Handle Google-style inline_data format
278-
if "inline_data" in content_block:
279-
inline_data = content_block.get("inline_data", {})
280-
if isinstance(inline_data, dict):
281-
mime_type = inline_data.get("mime_type", "")
282-
return {
283-
"type": "blob",
284-
"modality": _get_modality_from_mime_type(mime_type),
285-
"mime_type": mime_type,
286-
"content": inline_data.get("data", ""),
287-
}
288-
289-
# Handle Google-style file_data format
290-
if "file_data" in content_block:
291-
file_data = content_block.get("file_data", {})
292-
if isinstance(file_data, dict):
293-
mime_type = file_data.get("mime_type", "")
294-
return {
295-
"type": "uri",
296-
"modality": _get_modality_from_mime_type(mime_type),
297-
"mime_type": mime_type,
298-
"uri": file_data.get("file_uri", ""),
299-
}
300-
301-
# For text blocks and other types, return as-is
302-
return content_block
127+
Returns the original content block if transformation is not applicable
128+
(e.g., for text blocks or unrecognized formats).
129+
"""
130+
result = transform_content_part(content_block)
131+
return result if result is not None else content_block
303132

304133

305134
def _transform_langchain_message_content(content: "Any") -> "Any":

tests/integrations/langchain/test_langchain.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1911,20 +1911,21 @@ def test_transform_anthropic_source_url(self):
19111911
}
19121912

19131913
def test_transform_anthropic_source_without_media_type(self):
1914-
"""Test transformation of Anthropic-style image without media_type falls back to mime_type."""
1914+
"""Test transformation of Anthropic-style image without media_type uses empty mime_type."""
19151915
content_block = {
19161916
"type": "image",
1917-
"mime_type": "image/webp",
1917+
"mime_type": "image/webp", # Top-level mime_type is ignored by standard Anthropic format
19181918
"source": {
19191919
"type": "base64",
19201920
"data": "UklGRh4AAABXRUJQVlA4IBIAAAAwAQCdASoBAAEAAQAcJYgCdAEO",
19211921
},
19221922
}
19231923
result = _transform_langchain_content_block(content_block)
1924+
# Note: The shared transform_content_part uses media_type from source, not top-level mime_type
19241925
assert result == {
19251926
"type": "blob",
19261927
"modality": "image",
1927-
"mime_type": "image/webp",
1928+
"mime_type": "",
19281929
"content": "UklGRh4AAABXRUJQVlA4IBIAAAAwAQCdASoBAAEAAQAcJYgCdAEO",
19291930
}
19301931

0 commit comments

Comments
 (0)