From 25c39eacf2a990c6bcf31ba556329f464cf1134b Mon Sep 17 00:00:00 2001 From: codegen-bot Date: Mon, 17 Mar 2025 15:24:25 +0000 Subject: [PATCH 1/2] Add multimodal support for CodeAgent --- src/codegen/agents/code_agent.py | 14 ++++- src/codegen/extensions/langchain/llm.py | 82 ++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 4 deletions(-) diff --git a/src/codegen/agents/code_agent.py b/src/codegen/agents/code_agent.py index 99406ef40..2c36f3d9b 100644 --- a/src/codegen/agents/code_agent.py +++ b/src/codegen/agents/code_agent.py @@ -46,6 +46,7 @@ def __init__( agent_config: Optional[AgentConfig] = None, thread_id: Optional[str] = None, logger: Optional[ExternalLogger] = None, + multimodal: bool = True, **kwargs, ): """Initialize a CodeAgent. @@ -58,6 +59,10 @@ def __init__( tools: Additional tools to use tags: Tags to add to the agent trace. Must be of the same type. metadata: Metadata to use for the agent. Must be a dictionary. + agent_config: Configuration for the agent + thread_id: Optional thread ID for message history + logger: Optional external logger + multimodal: Whether to use a multimodal model (default: True) **kwargs: Additional LLM configuration options. Supported options: - temperature: Temperature parameter (0-1) - top_p: Top-p sampling parameter (0-1) @@ -65,6 +70,13 @@ def __init__( - max_tokens: Maximum number of tokens to generate """ self.codebase = codebase + + # If multimodal is enabled, ensure we're using a multimodal model + if multimodal and model_provider == "anthropic" and "claude-3" not in model_name: + # Default to Claude 3 Sonnet if multimodal is requested but model isn't Claude 3 + model_name = "claude-3-sonnet-20240229" + print(f"Multimodal support requested, using {model_name}") + self.agent = create_codebase_agent( self.codebase, model_provider=model_provider, @@ -217,4 +229,4 @@ def get_tags_metadata(self) -> tuple[list[str], dict]: metadata["swebench_difficulty"] = self.difficulty tags.append(f"difficulty_{self.difficulty}") - return tags, metadata + return tags, metadata \ No newline at end of file diff --git a/src/codegen/extensions/langchain/llm.py b/src/codegen/extensions/langchain/llm.py index 0d4795740..585dc3b52 100644 --- a/src/codegen/extensions/langchain/llm.py +++ b/src/codegen/extensions/langchain/llm.py @@ -1,14 +1,15 @@ """LLM implementation supporting both OpenAI and Anthropic models.""" import os +import re from collections.abc import Sequence -from typing import Any, Optional +from typing import Any, Dict, List, Optional, Union from langchain_anthropic import ChatAnthropic from langchain_core.callbacks import CallbackManagerForLLMRun from langchain_core.language_models.base import LanguageModelInput from langchain_core.language_models.chat_models import BaseChatModel -from langchain_core.messages import BaseMessage +from langchain_core.messages import BaseMessage, HumanMessage from langchain_core.outputs import ChatResult from langchain_core.runnables import Runnable from langchain_core.tools import BaseTool @@ -106,6 +107,76 @@ def _get_model(self) -> BaseChatModel: msg = f"Unknown model provider: {self.model_provider}. Must be one of: anthropic, openai, xai" raise ValueError(msg) + def _process_messages_for_multimodal(self, messages: List[BaseMessage]) -> List[BaseMessage]: + """Process messages to handle multimodal content (images). + + This function looks for image URLs in the format [Image: filename](URL) in message content + and converts them to the appropriate format for multimodal models. + + Args: + messages: List of messages to process + + Returns: + Processed messages with multimodal content + """ + processed_messages = [] + + for message in messages: + if not isinstance(message, HumanMessage): + # Only process human messages for now + processed_messages.append(message) + continue + + content = message.content + if isinstance(content, str): + # Check for image URLs in the format [Image: filename](URL) + image_pattern = r'\[Image(?:\s+\d+)?:\s+([^\]]+)\]\(([^)]+)\)' + matches = re.findall(image_pattern, content) + + if not matches: + # No images found, keep the message as is + processed_messages.append(message) + continue + + # Convert to multimodal format + multimodal_content = [] + last_end = 0 + + for match in re.finditer(image_pattern, content): + # Add text before the image + if match.start() > last_end: + multimodal_content.append({ + "type": "text", + "text": content[last_end:match.start()] + }) + + # Add the image + image_url = match.group(2) + multimodal_content.append({ + "type": "image_url", + "image_url": { + "url": image_url + } + }) + + last_end = match.end() + + # Add any remaining text after the last image + if last_end < len(content): + multimodal_content.append({ + "type": "text", + "text": content[last_end:] + }) + + # Create a new message with multimodal content + new_message = HumanMessage(content=multimodal_content) + processed_messages.append(new_message) + else: + # Content is already in a different format, keep as is + processed_messages.append(message) + + return processed_messages + def _generate( self, messages: list[BaseMessage], @@ -124,6 +195,11 @@ def _generate( Returns: ChatResult containing the generated completion """ + # Process messages for multimodal content if using a multimodal model + if self.model_provider == "anthropic" and "claude-3" in self.model_name: + processed_messages = self._process_messages_for_multimodal(messages) + return self._model._generate(processed_messages, stop=stop, run_manager=run_manager, **kwargs) + return self._model._generate(messages, stop=stop, run_manager=run_manager, **kwargs) def bind_tools( @@ -140,4 +216,4 @@ def bind_tools( Returns: Runnable that can be used to invoke the model with tools """ - return self._model.bind_tools(tools, **kwargs) + return self._model.bind_tools(tools, **kwargs) \ No newline at end of file From 3ac76e9051cc27e898bfad15163e4bc10af967cf Mon Sep 17 00:00:00 2001 From: "codegen-sh[bot]" <131295404+codegen-sh[bot]@users.noreply.github.com> Date: Mon, 17 Mar 2025 15:25:24 +0000 Subject: [PATCH 2/2] Automated pre-commit update --- src/codegen/agents/code_agent.py | 6 +-- src/codegen/extensions/langchain/llm.py | 53 ++++++++++--------------- 2 files changed, 24 insertions(+), 35 deletions(-) diff --git a/src/codegen/agents/code_agent.py b/src/codegen/agents/code_agent.py index 2c36f3d9b..73ce704d1 100644 --- a/src/codegen/agents/code_agent.py +++ b/src/codegen/agents/code_agent.py @@ -70,13 +70,13 @@ def __init__( - max_tokens: Maximum number of tokens to generate """ self.codebase = codebase - + # If multimodal is enabled, ensure we're using a multimodal model if multimodal and model_provider == "anthropic" and "claude-3" not in model_name: # Default to Claude 3 Sonnet if multimodal is requested but model isn't Claude 3 model_name = "claude-3-sonnet-20240229" print(f"Multimodal support requested, using {model_name}") - + self.agent = create_codebase_agent( self.codebase, model_provider=model_provider, @@ -229,4 +229,4 @@ def get_tags_metadata(self) -> tuple[list[str], dict]: metadata["swebench_difficulty"] = self.difficulty tags.append(f"difficulty_{self.difficulty}") - return tags, metadata \ No newline at end of file + return tags, metadata diff --git a/src/codegen/extensions/langchain/llm.py b/src/codegen/extensions/langchain/llm.py index 585dc3b52..57bd6a711 100644 --- a/src/codegen/extensions/langchain/llm.py +++ b/src/codegen/extensions/langchain/llm.py @@ -3,7 +3,7 @@ import os import re from collections.abc import Sequence -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional from langchain_anthropic import ChatAnthropic from langchain_core.callbacks import CallbackManagerForLLMRun @@ -107,74 +107,63 @@ def _get_model(self) -> BaseChatModel: msg = f"Unknown model provider: {self.model_provider}. Must be one of: anthropic, openai, xai" raise ValueError(msg) - def _process_messages_for_multimodal(self, messages: List[BaseMessage]) -> List[BaseMessage]: + def _process_messages_for_multimodal(self, messages: list[BaseMessage]) -> list[BaseMessage]: """Process messages to handle multimodal content (images). - + This function looks for image URLs in the format [Image: filename](URL) in message content and converts them to the appropriate format for multimodal models. - + Args: messages: List of messages to process - + Returns: Processed messages with multimodal content """ processed_messages = [] - + for message in messages: if not isinstance(message, HumanMessage): # Only process human messages for now processed_messages.append(message) continue - + content = message.content if isinstance(content, str): # Check for image URLs in the format [Image: filename](URL) - image_pattern = r'\[Image(?:\s+\d+)?:\s+([^\]]+)\]\(([^)]+)\)' + image_pattern = r"\[Image(?:\s+\d+)?:\s+([^\]]+)\]\(([^)]+)\)" matches = re.findall(image_pattern, content) - + if not matches: # No images found, keep the message as is processed_messages.append(message) continue - + # Convert to multimodal format multimodal_content = [] last_end = 0 - + for match in re.finditer(image_pattern, content): # Add text before the image if match.start() > last_end: - multimodal_content.append({ - "type": "text", - "text": content[last_end:match.start()] - }) - + multimodal_content.append({"type": "text", "text": content[last_end : match.start()]}) + # Add the image image_url = match.group(2) - multimodal_content.append({ - "type": "image_url", - "image_url": { - "url": image_url - } - }) - + multimodal_content.append({"type": "image_url", "image_url": {"url": image_url}}) + last_end = match.end() - + # Add any remaining text after the last image if last_end < len(content): - multimodal_content.append({ - "type": "text", - "text": content[last_end:] - }) - + multimodal_content.append({"type": "text", "text": content[last_end:]}) + # Create a new message with multimodal content new_message = HumanMessage(content=multimodal_content) processed_messages.append(new_message) else: # Content is already in a different format, keep as is processed_messages.append(message) - + return processed_messages def _generate( @@ -199,7 +188,7 @@ def _generate( if self.model_provider == "anthropic" and "claude-3" in self.model_name: processed_messages = self._process_messages_for_multimodal(messages) return self._model._generate(processed_messages, stop=stop, run_manager=run_manager, **kwargs) - + return self._model._generate(messages, stop=stop, run_manager=run_manager, **kwargs) def bind_tools( @@ -216,4 +205,4 @@ def bind_tools( Returns: Runnable that can be used to invoke the model with tools """ - return self._model.bind_tools(tools, **kwargs) \ No newline at end of file + return self._model.bind_tools(tools, **kwargs)