From ba0561b4c07f6ff342c4e0c70d566b9e2055fd26 Mon Sep 17 00:00:00 2001 From: "jinli.yl" Date: Mon, 15 Dec 2025 20:05:40 +0800 Subject: [PATCH 1/3] feat(vector): add list filter support for metadata queries - Add list filter support using "in" operator across all vector stores - Support "metadata." prefix stripping for ChromaDB filter keys - Enable MatchAny queries for Qdrant list filters - Implement IN clause handling for PGVector list filters - Add terms query support for Elasticsearch list filters - Update memory store to handle list filter logic - Add test cases for list filter functionality - Modify existing tests to validate list filter behavior --- .../core/vector_store/chroma_vector_store.py | 14 +++++++++-- flowllm/core/vector_store/es_vector_store.py | 3 +++ .../core/vector_store/memory_vector_store.py | 4 ++++ .../vector_store/pgvector_vector_store.py | 10 ++++++++ .../core/vector_store/qdrant_vector_store.py | 10 +++++++- tests/test_memory_vector_store.py | 23 +++++++++++++------ 6 files changed, 54 insertions(+), 10 deletions(-) diff --git a/flowllm/core/vector_store/chroma_vector_store.py b/flowllm/core/vector_store/chroma_vector_store.py index 1838168..f7836c7 100644 --- a/flowllm/core/vector_store/chroma_vector_store.py +++ b/flowllm/core/vector_store/chroma_vector_store.py @@ -87,6 +87,7 @@ def _build_chroma_filters( - Term filters: {"key": "value"} -> exact match - Range filters: {"key": {"gte": 1, "lte": 10}} -> range query - unique_id: Filters by document ID (stored separately from metadata) + - Keys can use "metadata." prefix (will be stripped for ChromaDB) Returns: Tuple of (where_clause, ids_filter): @@ -108,6 +109,12 @@ def _build_chroma_filters( ids_filter = [filter_value] continue + # Strip "metadata." prefix if present (ChromaDB stores metadata fields directly) + if key.startswith("metadata."): + chroma_key = key[len("metadata."):] + else: + chroma_key = key + if isinstance(filter_value, dict): # Range filter: {"gte": 1, "lte": 10} range_conditions = {} @@ -120,10 +127,13 @@ def _build_chroma_filters( if "lt" in filter_value: range_conditions["$lt"] = filter_value["lt"] if range_conditions: - where_conditions[key] = range_conditions + where_conditions[chroma_key] = range_conditions + elif isinstance(filter_value, list): + # List filter: use $in operator for OR logic + where_conditions[chroma_key] = {"$in": filter_value} else: # Term filter: direct value comparison - where_conditions[key] = filter_value + where_conditions[chroma_key] = filter_value return (where_conditions if where_conditions else None, ids_filter) diff --git a/flowllm/core/vector_store/es_vector_store.py b/flowllm/core/vector_store/es_vector_store.py index 73e6934..e2fb9b3 100644 --- a/flowllm/core/vector_store/es_vector_store.py +++ b/flowllm/core/vector_store/es_vector_store.py @@ -158,6 +158,9 @@ def _build_es_filters(filter_dict: Optional[Dict[str, Any]] = None) -> List[Dict range_conditions["lt"] = filter_value["lt"] if range_conditions: filters.append({"range": {es_key: range_conditions}}) + elif isinstance(filter_value, list): + # List filter: use terms query for OR logic + filters.append({"terms": {es_key: filter_value}}) else: # Term filter: direct value comparison filters.append({"term": {es_key: filter_value}}) diff --git a/flowllm/core/vector_store/memory_vector_store.py b/flowllm/core/vector_store/memory_vector_store.py index dccf47a..4a746c3 100644 --- a/flowllm/core/vector_store/memory_vector_store.py +++ b/flowllm/core/vector_store/memory_vector_store.py @@ -228,6 +228,10 @@ def _matches_filters(node: VectorNode, filter_dict: dict = None) -> bool: range_match = False if not range_match: return False + elif isinstance(filter_value, list): + # List filter: value must match any item in the list (OR logic) + if value not in filter_value: + return False else: # Term filter: direct value comparison if value != filter_value: diff --git a/flowllm/core/vector_store/pgvector_vector_store.py b/flowllm/core/vector_store/pgvector_vector_store.py index 12d7324..fa6b30b 100644 --- a/flowllm/core/vector_store/pgvector_vector_store.py +++ b/flowllm/core/vector_store/pgvector_vector_store.py @@ -218,6 +218,16 @@ def _build_sql_filters( param_idx += 1 if range_conditions: conditions.append(f"({' AND '.join(range_conditions)})") + elif isinstance(filter_value, list): + # List filter: use IN clause for OR logic + if use_async: + placeholders = ", ".join(f"${param_idx + i}" for i in range(len(filter_value))) + conditions.append(f"{jsonb_path} IN ({placeholders})") + else: + placeholders = ", ".join(["%s"] * len(filter_value)) + conditions.append(f"{jsonb_path} IN ({placeholders})") + params.extend([str(v) for v in filter_value]) + param_idx += len(filter_value) else: # Term filter: direct value comparison if use_async: diff --git a/flowllm/core/vector_store/qdrant_vector_store.py b/flowllm/core/vector_store/qdrant_vector_store.py index fbca12e..07126aa 100644 --- a/flowllm/core/vector_store/qdrant_vector_store.py +++ b/flowllm/core/vector_store/qdrant_vector_store.py @@ -181,7 +181,7 @@ def _build_qdrant_filters(filter_dict: Optional[Dict[str, Any]] = None): filter_dict = {"age": {"gte": 18, "lte": 65}} ``` """ - from qdrant_client.http.models import FieldCondition, MatchValue, Range + from qdrant_client.http.models import FieldCondition, MatchAny, MatchValue, Range if not filter_dict: return None @@ -215,6 +215,14 @@ def _build_qdrant_filters(filter_dict: Optional[Dict[str, Any]] = None): range=Range(**range_conditions), ), ) + elif isinstance(filter_value, list): + # List filter: use MatchAny for OR logic + conditions.append( + FieldCondition( + key=qdrant_key, + match=MatchAny(any=filter_value), + ), + ) else: # Term filter: direct value comparison conditions.append( diff --git a/tests/test_memory_vector_store.py b/tests/test_memory_vector_store.py index e79a301..799bd69 100644 --- a/tests/test_memory_vector_store.py +++ b/tests/test_memory_vector_store.py @@ -167,6 +167,15 @@ def create_sample_nodes(workspace_id: str, prefix: str = "") -> List[VectorNode] VectorNode( unique_id=f"{id_prefix}node3", workspace_id=workspace_id, + content="Machine learning is a subset of artificial intelligence.", + metadata={ + "node_type": "tech_new", + "category": "ML", + }, + ), + VectorNode( + unique_id=f"{id_prefix}node4", + workspace_id=workspace_id, content="I love eating delicious seafood, especially fresh fish.", metadata={ "node_type": "food", @@ -174,7 +183,7 @@ def create_sample_nodes(workspace_id: str, prefix: str = "") -> List[VectorNode] }, ), VectorNode( - unique_id=f"{id_prefix}node4", + unique_id=f"{id_prefix}node5", workspace_id=workspace_id, content="Deep learning uses neural networks with multiple layers.", metadata={ @@ -277,17 +286,17 @@ def test_search(self, workspace_id: str): def test_search_with_filter(self, workspace_id: str): """Test vector search with filter.""" logger.info("=" * 20 + " FILTER SEARCH TEST " + "=" * 20) - filter_dict = {"metadata.node_type": "tech"} + filter_dict = {"metadata.node_type": ["tech", "tech_new"]} results = self.client.search( "What is artificial intelligence?", workspace_id=workspace_id, top_k=5, filter_dict=filter_dict, ) - logger.info(f"Filtered search returned {len(results)} results (node_type=tech)") + logger.info(f"Filtered search returned {len(results)} results (node_type in [tech, tech_new])") for i, r in enumerate(results, 1): logger.info(f"Filtered Result {i}: {r.model_dump(exclude={'vector'})}") - assert r.metadata.get("node_type") == "tech", "All results should have node_type=tech" + assert r.metadata.get("node_type") in ["tech", "tech_new"], "All results should have node_type in [tech, tech_new]" def test_search_with_id(self, workspace_id: str): """Test vector search by unique_id with empty query.""" @@ -503,17 +512,17 @@ async def test_search(self, workspace_id: str): async def test_search_with_filter(self, workspace_id: str): """Test async vector search with filter.""" logger.info("ASYNC - " + "=" * 20 + " FILTER SEARCH TEST " + "=" * 20) - filter_dict = {"metadata.node_type": "tech"} + filter_dict = {"metadata.node_type": ["tech", "tech_new"]} results = await self.client.async_search( "What is artificial intelligence?", workspace_id=workspace_id, top_k=5, filter_dict=filter_dict, ) - logger.info(f"Filtered search returned {len(results)} results (node_type=tech)") + logger.info(f"Filtered search returned {len(results)} results (node_type in [tech, tech_new])") for i, r in enumerate(results, 1): logger.info(f"Filtered Result {i}: {r.model_dump(exclude={'vector'})}") - assert r.metadata.get("node_type") == "tech", "All results should have node_type=tech" + assert r.metadata.get("node_type") in ["tech", "tech_new"], "All results should have node_type in [tech, tech_new]" async def test_search_with_id(self, workspace_id: str): """Test async vector search by unique_id with empty query.""" From 19dfd44bc4d5f65a3e1241fab04ff393e8dabb07 Mon Sep 17 00:00:00 2001 From: "jinli.yl" Date: Mon, 15 Dec 2025 20:08:42 +0800 Subject: [PATCH 2/3] style(core): format slicing syntax consistently --- flowllm/core/vector_store/chroma_vector_store.py | 2 +- tests/test_memory_vector_store.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/flowllm/core/vector_store/chroma_vector_store.py b/flowllm/core/vector_store/chroma_vector_store.py index f7836c7..a5696db 100644 --- a/flowllm/core/vector_store/chroma_vector_store.py +++ b/flowllm/core/vector_store/chroma_vector_store.py @@ -111,7 +111,7 @@ def _build_chroma_filters( # Strip "metadata." prefix if present (ChromaDB stores metadata fields directly) if key.startswith("metadata."): - chroma_key = key[len("metadata."):] + chroma_key = key[len("metadata.") :] else: chroma_key = key diff --git a/tests/test_memory_vector_store.py b/tests/test_memory_vector_store.py index 799bd69..a385a85 100644 --- a/tests/test_memory_vector_store.py +++ b/tests/test_memory_vector_store.py @@ -296,7 +296,10 @@ def test_search_with_filter(self, workspace_id: str): logger.info(f"Filtered search returned {len(results)} results (node_type in [tech, tech_new])") for i, r in enumerate(results, 1): logger.info(f"Filtered Result {i}: {r.model_dump(exclude={'vector'})}") - assert r.metadata.get("node_type") in ["tech", "tech_new"], "All results should have node_type in [tech, tech_new]" + assert r.metadata.get("node_type") in [ + "tech", + "tech_new", + ], "All results should have node_type in [tech, tech_new]" def test_search_with_id(self, workspace_id: str): """Test vector search by unique_id with empty query.""" @@ -522,7 +525,10 @@ async def test_search_with_filter(self, workspace_id: str): logger.info(f"Filtered search returned {len(results)} results (node_type in [tech, tech_new])") for i, r in enumerate(results, 1): logger.info(f"Filtered Result {i}: {r.model_dump(exclude={'vector'})}") - assert r.metadata.get("node_type") in ["tech", "tech_new"], "All results should have node_type in [tech, tech_new]" + assert r.metadata.get("node_type") in [ + "tech", + "tech_new", + ], "All results should have node_type in [tech, tech_new]" async def test_search_with_id(self, workspace_id: str): """Test async vector search by unique_id with empty query.""" From 0d2f625517c521f08a4135069e6053b3bd290981 Mon Sep 17 00:00:00 2001 From: "jinli.yl" Date: Mon, 15 Dec 2025 20:30:46 +0800 Subject: [PATCH 3/3] refactor(core): remove ContentBlockType enum and update ContentBlock schema --- flowllm/core/enumeration/__init__.py | 2 -- flowllm/core/enumeration/content_block_type.py | 12 ------------ flowllm/core/schema/message.py | 13 ++++--------- 3 files changed, 4 insertions(+), 23 deletions(-) delete mode 100644 flowllm/core/enumeration/content_block_type.py diff --git a/flowllm/core/enumeration/__init__.py b/flowllm/core/enumeration/__init__.py index e234c5c..f765040 100644 --- a/flowllm/core/enumeration/__init__.py +++ b/flowllm/core/enumeration/__init__.py @@ -1,14 +1,12 @@ """Core enumeration module.""" from .chunk_enum import ChunkEnum -from .content_block_type import ContentBlockType from .http_enum import HttpEnum from .registry_enum import RegistryEnum from .role import Role __all__ = [ "ChunkEnum", - "ContentBlockType", "HttpEnum", "RegistryEnum", "Role", diff --git a/flowllm/core/enumeration/content_block_type.py b/flowllm/core/enumeration/content_block_type.py deleted file mode 100644 index cc9a42f..0000000 --- a/flowllm/core/enumeration/content_block_type.py +++ /dev/null @@ -1,12 +0,0 @@ -"""Content block type enumeration for multimodal content.""" - -from enum import Enum - - -class ContentBlockType(str, Enum): - """Enumeration of content block types in multimodal responses.""" - - TEXT = "text" - IMAGE_URL = "image_url" - AUDIO = "audio" - VIDEO = "video" diff --git a/flowllm/core/schema/message.py b/flowllm/core/schema/message.py index 2587ec5..b62b42b 100644 --- a/flowllm/core/schema/message.py +++ b/flowllm/core/schema/message.py @@ -6,7 +6,7 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator from .tool_call import ToolCall -from ..enumeration import Role, ContentBlockType +from ..enumeration import Role class ContentBlock(BaseModel): @@ -35,7 +35,7 @@ class ContentBlock(BaseModel): model_config = ConfigDict(extra="allow") - type: ContentBlockType = Field(default="") + type: str = Field(default="") content: str | dict | list = Field(default="") @model_validator(mode="before") @@ -51,8 +51,8 @@ def init_block(cls, data: dict): def simple_dump(self) -> dict: """Convert ContentBlock to a simple dictionary format.""" result = { - "type": self.type.value, - self.type.value: self.content, + "type": self.type, + self.type: self.content, **self.model_extra, } @@ -95,11 +95,6 @@ def simple_dump(self, add_reasoning: bool = True) -> dict: return result - @property - def string_buffer(self) -> str: - """Get a string representation of the message with role and content.""" - return f"{self.role.value}: {self.dump_content()}" - class Trajectory(BaseModel): """Represents a conversation trajectory with messages and optional scoring."""