-
Notifications
You must be signed in to change notification settings - Fork 2.7k
feat: Add SearchableToolset for dynamic tool discovery from large tool catalogs #10426
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0146ad8
a8c67a7
ca3c2e1
717f4c1
62d8604
e442cbd
1367bba
1e1f093
c9532e1
29fb03a
5eab32c
4706261
c19f2a8
f8a4c06
590a1c3
21d0595
09c8f78
fe2e70b
ebc12ca
595620f
448650f
17acf3b
dbb0c8a
d05090c
cc8af69
14da4c2
f8a9be7
e6324fe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,282 @@ | ||
| # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai> | ||
| # | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| from collections.abc import Iterator | ||
| from typing import TYPE_CHECKING, Annotated, Any | ||
|
|
||
| from haystack.core.serialization import generate_qualified_class_name, import_class_by_name | ||
| from haystack.dataclasses import Document | ||
| from haystack.document_stores.in_memory import InMemoryDocumentStore | ||
| from haystack.document_stores.types import DuplicatePolicy | ||
| from haystack.tools.from_function import create_tool_from_function | ||
| from haystack.tools.tool import Tool | ||
| from haystack.tools.toolset import Toolset | ||
| from haystack.tools.utils import flatten_tools_or_toolsets, warm_up_tools | ||
|
|
||
| if TYPE_CHECKING: | ||
| from haystack.tools import ToolsType | ||
|
|
||
|
|
||
class SearchableToolset(Toolset):
    """
    Dynamic tool discovery from large catalogs using BM25 search.

    This Toolset enables LLMs to discover and use tools from large catalogs through
    BM25-based search. Instead of exposing all tools at once (which can overwhelm the
    LLM context), it provides a `search_tools` bootstrap tool that allows the LLM to
    find and load specific tools as needed.

    For very small catalogs (below `search_threshold`), acts as a simple passthrough
    exposing all tools directly without any discovery mechanism.

    ### Usage Example

    ```python
    from haystack.components.agents import Agent
    from haystack.components.generators.chat import OpenAIChatGenerator
    from haystack.dataclasses import ChatMessage
    from haystack.tools import Tool, SearchableToolset

    # Create a catalog of tools
    catalog = [
        Tool(name="get_weather", description="Get weather for a city", ...),
        Tool(name="search_web", description="Search the web", ...),
        # ... 100s more tools
    ]
    toolset = SearchableToolset(catalog=catalog)

    agent = Agent(chat_generator=OpenAIChatGenerator(), tools=toolset)

    # The agent is initially provided only with the search_tools tool and will use it to find relevant tools.
    result = agent.run(messages=[ChatMessage.from_user("What's the weather in Milan?")])
    ```
    """

    def __init__(self, catalog: "ToolsType", *, top_k: int = 3, search_threshold: int = 8):
        """
        Initialize the SearchableToolset.

        :param catalog: Source of tools - a list of Tools, list of Toolsets, or a single Toolset.
        :param top_k: Default number of results for search_tools.
        :param search_threshold: Minimum catalog size to activate search.
            If catalog has fewer tools, acts as passthrough (all tools visible).
            Default is 8.
        :raises TypeError: If `catalog` is neither a Toolset nor a list of Tool and/or Toolset instances.
        """
        valid_catalog = isinstance(catalog, Toolset) or (
            isinstance(catalog, list) and all(isinstance(item, (Tool, Toolset)) for item in catalog)
        )
        if not valid_catalog:
            # A bare Tool is rejected by the check above, so the message must not advertise it as accepted.
            raise TypeError(
                f"Invalid catalog type: {type(catalog)}. Expected a Toolset or a list of Tools and/or Toolsets."
            )

        # Store raw catalog; flattening is deferred to warm_up() so that lazy
        # toolsets (e.g. MCPToolset with eager_connect=False) can connect first.
        self._raw_catalog: "ToolsType" = catalog
        self._catalog: list[Tool] = []

        self._top_k = top_k
        self._search_threshold = search_threshold

        # Runtime state (initialized in warm_up)
        self._discovered_tools: dict[str, Tool] = {}
        self._bootstrap_tool: Tool | None = None
        self._document_store: InMemoryDocumentStore | None = None
        self._warmed_up = False

        # Initialize parent with empty tools list - we manage tools dynamically
        super().__init__(tools=[])

    def __add__(self, other: Tool | Toolset | list[Tool]) -> "Toolset":
        """
        Concatenation is not supported for SearchableToolset.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError("SearchableToolset does not support concatenation.")

    def add(self, tool: Tool | Toolset) -> None:
        """
        Adding new tools after initialization is not supported for SearchableToolset.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError("SearchableToolset does not support adding new tools after initialization.")

    def _is_passthrough(self) -> bool:
        """
        Internal method to check if operating in passthrough mode (small catalog).

        Must be called after warm_up(): the flattened catalog is empty until warm_up() populates it.

        :returns: True if the flattened catalog has fewer tools than `search_threshold`.
        """
        return len(self._catalog) < self._search_threshold

    def warm_up(self) -> None:
        """
        Prepare the toolset for use.

        Warms up child toolsets first (so lazy toolsets like MCPToolset can connect),
        then flattens the catalog, indexes it, and creates the search_tools bootstrap tool.
        In passthrough mode, it warms up all catalog tools directly.
        Must be called before using the toolset with an Agent.
        Repeated calls are no-ops once the first call has completed.
        """
        if self._warmed_up:
            return

        # Warm up child toolsets first (triggers lazy connections like MCPToolset)
        warm_up_tools(self._raw_catalog)
        # Now flatten — lazy toolsets will have their real tools available
        self._catalog = flatten_tools_or_toolsets(self._raw_catalog)

        if self._is_passthrough():
            for tool in self._catalog:
                tool.warm_up()
        else:
            self._document_store = InMemoryDocumentStore()
            # One document per tool; the tool name is kept in meta so that search hits
            # can be mapped back to Tool instances by the bootstrap tool.
            documents = [
                Document(content=f"{tool.name} {tool.description}", meta={"tool_name": tool.name})
                for tool in self._catalog
            ]
            self._document_store.write_documents(documents, policy=DuplicatePolicy.OVERWRITE)
            self._bootstrap_tool = self._create_search_tool()

        self._warmed_up = True

    def clear(self) -> None:
        """
        Clear all discovered tools.

        This method allows resetting the toolset's discovered tools between agent runs
        when the same toolset instance is reused. This can be useful for long-running
        applications to control memory usage or to start fresh searches.

        NOTE(review): a reviewer of this change observed that the Agent does not appear to call
        this method itself, so callers presumably need to invoke it explicitly - confirm.
        """
        self._discovered_tools.clear()

    def _create_search_tool(self) -> Tool:
        """
        Create the search_tools bootstrap tool.

        The returned tool closes over this toolset: every successful search adds the matching
        catalog tools to `_discovered_tools`, which `__iter__` then yields.

        :returns: A Tool named `search_tools` wrapping the BM25 keyword search over the catalog.
        """
        tool_by_name = {tool.name: tool for tool in self._catalog}

        # NOTE: the Annotated strings and the docstring of `search_tools` are handed to
        # create_tool_from_function together with the function, so treat them as runtime text.
        def search_tools(
            tool_keywords: Annotated[
                str,
                "Space-separated words from tool names/descriptions (e.g. 'route weather search')."
                " NOT the user's question or task—use vocabulary from the tools you need.",
            ],
            k: Annotated[int | None, f"Number of results to return (default: {self._top_k})"] = None,
        ) -> str:
            """
            ALWAYS use this tool FIRST when you need to invoke some tools but don't have the right one loaded yet.

            Provide space separated tool keywords likely to appear in tool names/descriptions
            (e.g. 'route distance weather', 'search email'). Do NOT pass the user's request or task (e.g.
            'things to do in X', 'user question'); matching is keyword-based. Returns loaded
            tool names; they become available immediately.
            """
            num_results = k if k is not None else self._top_k

            if not tool_keywords.strip():
                return (
                    "No tool keywords provided. Please provide space-separated words likely to appear in tool "
                    "names/descriptions (e.g. 'route weather search')."
                )

            # `k` is supplied by the LLM; a zero or negative value cannot describe a result count,
            # so answer with guidance instead of forwarding it to the document store.
            if num_results < 1:
                return "Invalid number of results requested. 'k' must be a positive integer."

            # at this point, the toolset has been warmed up, so self._document_store is not None
            results = self._document_store.bm25_retrieval(query=tool_keywords, top_k=num_results)  # type: ignore[union-attr]

            if not results:
                return "No tools found matching these keywords. Try different keywords."

            # Add found tools to _discovered_tools. These become available to the LLM
            # on the next agent iteration when __iter__ is called again - the Agent
            # re-iterates over the toolset each loop, picking up newly discovered tools.
            # The return message here just confirms what was found; actual tool availability
            # comes through the dynamic iteration mechanism. This way we also save tokens
            # by not returning the full tool definitions.
            tool_names = []
            for doc in results:
                tool = tool_by_name[doc.meta["tool_name"]]
                tool.warm_up()
                self._discovered_tools[tool.name] = tool
                tool_names.append(tool.name)

            return f"Found and loaded {len(tool_names)} tool(s): {', '.join(tool_names)}. Use them directly as tools."

        return create_tool_from_function(function=search_tools, name="search_tools")

    def __iter__(self) -> Iterator[Tool]:
        """
        Iterate over available tools.

        In passthrough mode, yields all catalog tools.
        Otherwise, yields bootstrap tool + discovered tools.
        Automatically calls warm_up() if needed to ensure bootstrap tool is available.
        """
        if not self._warmed_up:
            self.warm_up()
        if self._is_passthrough():
            yield from self._catalog
        else:
            if self._bootstrap_tool is not None:
                yield self._bootstrap_tool
            yield from self._discovered_tools.values()

    def __len__(self) -> int:
        """
        Return the number of currently available tools.

        :returns: The number of tools yielded by iterating over the toolset.
        """
        # the number of tools is computed by invoking __iter__ on the toolset
        return sum(1 for _ in self)

    def __contains__(self, item: str | Tool) -> bool:
        """
        Check if a tool is available by Tool instance or tool name string.

        :param item: Tool instance or tool name string.
        :returns: True if the tool is available, False otherwise.
        :raises TypeError: If `item` is neither a Tool nor a string.
        """
        if isinstance(item, str):
            return any(tool.name == item for tool in self)
        if isinstance(item, Tool):
            return any(tool == item for tool in self)
        raise TypeError(f"Invalid item type: {type(item)}. Must be Tool or str.")

    def __getitem__(self, index: int) -> Tool:
        """
        Get a tool by index.

        :param index: Index of the tool to retrieve.
        :returns: The tool at the given index.
        :raises IndexError: If the index is out of range.
        """
        return list(self)[index]

    def to_dict(self) -> dict[str, Any]:
        """
        Serialize the toolset to a dictionary.

        The raw catalog is serialized, not the flattened one; a single Toolset catalog is
        written as a one-element list.

        :returns: Dictionary representation of the toolset.
        """
        catalog_items: list[Tool | Toolset] = (
            [self._raw_catalog] if isinstance(self._raw_catalog, Toolset) else list(self._raw_catalog)
        )

        return {
            "type": generate_qualified_class_name(type(self)),
            "data": {
                "catalog": [item.to_dict() for item in catalog_items],
                "top_k": self._top_k,
                "search_threshold": self._search_threshold,
            },
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SearchableToolset":
        """
        Deserialize a toolset from a dictionary.

        :param data: Dictionary representation of the toolset.
        :returns: New SearchableToolset instance.
        :raises TypeError: If a serialized catalog item refers to a class that is not a Tool or Toolset subclass.
        """
        inner_data = data["data"]

        # Deserialize catalog items (may be Tool or Toolset instances)
        catalog_data = inner_data.get("catalog", [])
        catalog: list[Tool | Toolset] = []
        for item_data in catalog_data:
            item_class = import_class_by_name(item_data["type"])
            if not issubclass(item_class, (Tool, Toolset)):
                raise TypeError(f"Class '{item_class}' is not a subclass of Tool or Toolset")
            catalog.append(item_class.from_dict(item_data))

        # Only forward the optional settings that were serialized, so constructor defaults apply otherwise.
        init_kwargs = {key: inner_data[key] for key in ("top_k", "search_threshold") if key in inner_data}
        return cls(catalog=catalog, **init_kwargs)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| --- | ||
| features: | ||
| - | | ||
| Added ``SearchableToolset`` to the ``haystack.tools`` module. This new toolset enables agents to dynamically discover | ||
| tools from large catalogs using keyword-based (BM25) search. Instead of exposing all tools upfront (which can | ||
| overwhelm LLMs with large tool definitions), agents start with a single ``search_tools`` function and progressively | ||
| discover relevant tools as needed. For smaller catalogs, it operates in passthrough mode exposing all tools | ||
| directly. | ||
|
|
||
| Key features include a configurable search threshold for automatic passthrough mode and top-k result limiting. | ||
|
|
||
| Example usage: | ||
|
|
||
| .. code:: python | ||
|
|
||
| from haystack.components.agents import Agent | ||
| from haystack.components.generators.chat import OpenAIChatGenerator | ||
| from haystack.dataclasses import ChatMessage | ||
| from haystack.tools import Tool, SearchableToolset | ||
|
|
||
| # Create a catalog of tools | ||
| catalog = [ | ||
| Tool(name="get_weather", description="Get weather for a city", ...), | ||
| Tool(name="search_web", description="Search the web", ...), | ||
| # ... 100s more tools | ||
| ] | ||
| toolset = SearchableToolset(catalog=catalog) | ||
|
|
||
| agent = Agent(chat_generator=OpenAIChatGenerator(), tools=toolset) | ||
|
|
||
| # The agent is initially provided only with the search_tools tool and will use it to find relevant tools. | ||
| result = agent.run(messages=[ChatMessage.from_user("What's the weather in Milan?")]) |
Uh oh!
There was an error while loading. Please reload this page.