From ba0561b4c07f6ff342c4e0c70d566b9e2055fd26 Mon Sep 17 00:00:00 2001
From: "jinli.yl" <jinli.yl@alibaba-inc.com>
Date: Mon, 15 Dec 2025 20:05:40 +0800
Subject: [PATCH 1/3] feat(vector): add list filter support for metadata
 queries

- Add list filter support using "in" operator across all vector stores
- Support "metadata." prefix stripping for ChromaDB filter keys
- Enable MatchAny queries for Qdrant list filters
- Implement IN clause handling for PGVector list filters
- Add terms query support for Elasticsearch list filters
- Update memory store to handle list filter logic
- Add test cases for list filter functionality
- Modify existing tests to validate list filter behavior
---
 .../core/vector_store/chroma_vector_store.py  | 14 +++++++++--
 flowllm/core/vector_store/es_vector_store.py  |  3 +++
 .../core/vector_store/memory_vector_store.py  |  4 ++++
 .../vector_store/pgvector_vector_store.py     | 10 ++++++++
 .../core/vector_store/qdrant_vector_store.py  | 10 +++++++-
 tests/test_memory_vector_store.py             | 23 +++++++++++++------
 6 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/flowllm/core/vector_store/chroma_vector_store.py b/flowllm/core/vector_store/chroma_vector_store.py
index 1838168..f7836c7 100644
--- a/flowllm/core/vector_store/chroma_vector_store.py
+++ b/flowllm/core/vector_store/chroma_vector_store.py
@@ -87,6 +87,7 @@ def _build_chroma_filters(
                 - Term filters: {"key": "value"} -> exact match
                 - Range filters: {"key": {"gte": 1, "lte": 10}} -> range query
                 - unique_id: Filters by document ID (stored separately from metadata)
+                - Keys can use "metadata." prefix (will be stripped for ChromaDB)
 
         Returns:
             Tuple of (where_clause, ids_filter):
@@ -108,6 +109,12 @@ def _build_chroma_filters(
                     ids_filter = [filter_value]
                 continue
 
+            # Strip "metadata." prefix if present (ChromaDB stores metadata fields directly)
+            if key.startswith("metadata."):
+                chroma_key = key[len("metadata."):]
+            else:
+                chroma_key = key
+
             if isinstance(filter_value, dict):
                 # Range filter: {"gte": 1, "lte": 10}
                 range_conditions = {}
@@ -120,10 +127,13 @@ def _build_chroma_filters(
                 if "lt" in filter_value:
                     range_conditions["$lt"] = filter_value["lt"]
                 if range_conditions:
-                    where_conditions[key] = range_conditions
+                    where_conditions[chroma_key] = range_conditions
+            elif isinstance(filter_value, list):
+                # List filter: use $in operator for OR logic
+                where_conditions[chroma_key] = {"$in": filter_value}
             else:
                 # Term filter: direct value comparison
-                where_conditions[key] = filter_value
+                where_conditions[chroma_key] = filter_value
 
         return (where_conditions if where_conditions else None, ids_filter)
 
diff --git a/flowllm/core/vector_store/es_vector_store.py b/flowllm/core/vector_store/es_vector_store.py
index 73e6934..e2fb9b3 100644
--- a/flowllm/core/vector_store/es_vector_store.py
+++ b/flowllm/core/vector_store/es_vector_store.py
@@ -158,6 +158,9 @@ def _build_es_filters(filter_dict: Optional[Dict[str, Any]] = None) -> List[Dict
                     range_conditions["lt"] = filter_value["lt"]
                 if range_conditions:
                     filters.append({"range": {es_key: range_conditions}})
+            elif isinstance(filter_value, list):
+                # List filter: use terms query for OR logic
+                filters.append({"terms": {es_key: filter_value}})
             else:
                 # Term filter: direct value comparison
                 filters.append({"term": {es_key: filter_value}})
diff --git a/flowllm/core/vector_store/memory_vector_store.py b/flowllm/core/vector_store/memory_vector_store.py
index dccf47a..4a746c3 100644
--- a/flowllm/core/vector_store/memory_vector_store.py
+++ b/flowllm/core/vector_store/memory_vector_store.py
@@ -228,6 +228,10 @@ def _matches_filters(node: VectorNode, filter_dict: dict = None) -> bool:
                     range_match = False
                 if not range_match:
                     return False
+            elif isinstance(filter_value, list):
+                # List filter: value must match any item in the list (OR logic)
+                if value not in filter_value:
+                    return False
             else:
                 # Term filter: direct value comparison
                 if value != filter_value:
diff --git a/flowllm/core/vector_store/pgvector_vector_store.py b/flowllm/core/vector_store/pgvector_vector_store.py
index 12d7324..fa6b30b 100644
--- a/flowllm/core/vector_store/pgvector_vector_store.py
+++ b/flowllm/core/vector_store/pgvector_vector_store.py
@@ -218,6 +218,16 @@ def _build_sql_filters(
                     param_idx += 1
                 if range_conditions:
                     conditions.append(f"({' AND '.join(range_conditions)})")
+            elif isinstance(filter_value, list):
+                # List filter: use IN clause for OR logic
+                if use_async:
+                    placeholders = ", ".join(f"${param_idx + i}" for i in range(len(filter_value)))
+                    conditions.append(f"{jsonb_path} IN ({placeholders})")
+                else:
+                    placeholders = ", ".join(["%s"] * len(filter_value))
+                    conditions.append(f"{jsonb_path} IN ({placeholders})")
+                params.extend([str(v) for v in filter_value])
+                param_idx += len(filter_value)
             else:
                 # Term filter: direct value comparison
                 if use_async:
diff --git a/flowllm/core/vector_store/qdrant_vector_store.py b/flowllm/core/vector_store/qdrant_vector_store.py
index fbca12e..07126aa 100644
--- a/flowllm/core/vector_store/qdrant_vector_store.py
+++ b/flowllm/core/vector_store/qdrant_vector_store.py
@@ -181,7 +181,7 @@ def _build_qdrant_filters(filter_dict: Optional[Dict[str, Any]] = None):
             filter_dict = {"age": {"gte": 18, "lte": 65}}
             ```
         """
-        from qdrant_client.http.models import FieldCondition, MatchValue, Range
+        from qdrant_client.http.models import FieldCondition, MatchAny, MatchValue, Range
 
         if not filter_dict:
             return None
@@ -215,6 +215,14 @@ def _build_qdrant_filters(filter_dict: Optional[Dict[str, Any]] = None):
                             range=Range(**range_conditions),
                         ),
                     )
+            elif isinstance(filter_value, list):
+                # List filter: use MatchAny for OR logic
+                conditions.append(
+                    FieldCondition(
+                        key=qdrant_key,
+                        match=MatchAny(any=filter_value),
+                    ),
+                )
             else:
                 # Term filter: direct value comparison
                 conditions.append(
diff --git a/tests/test_memory_vector_store.py b/tests/test_memory_vector_store.py
index e79a301..799bd69 100644
--- a/tests/test_memory_vector_store.py
+++ b/tests/test_memory_vector_store.py
@@ -167,6 +167,15 @@ def create_sample_nodes(workspace_id: str, prefix: str = "") -> List[VectorNode]
             VectorNode(
                 unique_id=f"{id_prefix}node3",
                 workspace_id=workspace_id,
+                content="Machine learning is a subset of artificial intelligence.",
+                metadata={
+                    "node_type": "tech_new",
+                    "category": "ML",
+                },
+            ),
+            VectorNode(
+                unique_id=f"{id_prefix}node4",
+                workspace_id=workspace_id,
                 content="I love eating delicious seafood, especially fresh fish.",
                 metadata={
                     "node_type": "food",
@@ -174,7 +183,7 @@ def create_sample_nodes(workspace_id: str, prefix: str = "") -> List[VectorNode]
                 },
             ),
             VectorNode(
-                unique_id=f"{id_prefix}node4",
+                unique_id=f"{id_prefix}node5",
                 workspace_id=workspace_id,
                 content="Deep learning uses neural networks with multiple layers.",
                 metadata={
@@ -277,17 +286,17 @@ def test_search(self, workspace_id: str):
     def test_search_with_filter(self, workspace_id: str):
         """Test vector search with filter."""
         logger.info("=" * 20 + " FILTER SEARCH TEST " + "=" * 20)
-        filter_dict = {"metadata.node_type": "tech"}
+        filter_dict = {"metadata.node_type": ["tech", "tech_new"]}
         results = self.client.search(
             "What is artificial intelligence?",
             workspace_id=workspace_id,
             top_k=5,
             filter_dict=filter_dict,
         )
-        logger.info(f"Filtered search returned {len(results)} results (node_type=tech)")
+        logger.info(f"Filtered search returned {len(results)} results (node_type in [tech, tech_new])")
         for i, r in enumerate(results, 1):
             logger.info(f"Filtered Result {i}: {r.model_dump(exclude={'vector'})}")
-            assert r.metadata.get("node_type") == "tech", "All results should have node_type=tech"
+            assert r.metadata.get("node_type") in ["tech", "tech_new"], "All results should have node_type in [tech, tech_new]"
 
     def test_search_with_id(self, workspace_id: str):
         """Test vector search by unique_id with empty query."""
@@ -503,17 +512,17 @@ async def test_search(self, workspace_id: str):
     async def test_search_with_filter(self, workspace_id: str):
         """Test async vector search with filter."""
         logger.info("ASYNC - " + "=" * 20 + " FILTER SEARCH TEST " + "=" * 20)
-        filter_dict = {"metadata.node_type": "tech"}
+        filter_dict = {"metadata.node_type": ["tech", "tech_new"]}
         results = await self.client.async_search(
             "What is artificial intelligence?",
             workspace_id=workspace_id,
             top_k=5,
             filter_dict=filter_dict,
         )
-        logger.info(f"Filtered search returned {len(results)} results (node_type=tech)")
+        logger.info(f"Filtered search returned {len(results)} results (node_type in [tech, tech_new])")
         for i, r in enumerate(results, 1):
             logger.info(f"Filtered Result {i}: {r.model_dump(exclude={'vector'})}")
-            assert r.metadata.get("node_type") == "tech", "All results should have node_type=tech"
+            assert r.metadata.get("node_type") in ["tech", "tech_new"], "All results should have node_type in [tech, tech_new]"
 
     async def test_search_with_id(self, workspace_id: str):
         """Test async vector search by unique_id with empty query."""

From 19dfd44bc4d5f65a3e1241fab04ff393e8dabb07 Mon Sep 17 00:00:00 2001
From: "jinli.yl" <jinli.yl@alibaba-inc.com>
Date: Mon, 15 Dec 2025 20:08:42 +0800
Subject: [PATCH 2/3] style(core): format slicing syntax consistently

---
 flowllm/core/vector_store/chroma_vector_store.py |  2 +-
 tests/test_memory_vector_store.py                | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/flowllm/core/vector_store/chroma_vector_store.py b/flowllm/core/vector_store/chroma_vector_store.py
index f7836c7..a5696db 100644
--- a/flowllm/core/vector_store/chroma_vector_store.py
+++ b/flowllm/core/vector_store/chroma_vector_store.py
@@ -111,7 +111,7 @@ def _build_chroma_filters(
 
             # Strip "metadata." prefix if present (ChromaDB stores metadata fields directly)
             if key.startswith("metadata."):
-                chroma_key = key[len("metadata."):]
+                chroma_key = key[len("metadata.") :]
             else:
                 chroma_key = key
 
diff --git a/tests/test_memory_vector_store.py b/tests/test_memory_vector_store.py
index 799bd69..a385a85 100644
--- a/tests/test_memory_vector_store.py
+++ b/tests/test_memory_vector_store.py
@@ -296,7 +296,10 @@ def test_search_with_filter(self, workspace_id: str):
         logger.info(f"Filtered search returned {len(results)} results (node_type in [tech, tech_new])")
         for i, r in enumerate(results, 1):
             logger.info(f"Filtered Result {i}: {r.model_dump(exclude={'vector'})}")
-            assert r.metadata.get("node_type") in ["tech", "tech_new"], "All results should have node_type in [tech, tech_new]"
+            assert r.metadata.get("node_type") in [
+                "tech",
+                "tech_new",
+            ], "All results should have node_type in [tech, tech_new]"
 
     def test_search_with_id(self, workspace_id: str):
         """Test vector search by unique_id with empty query."""
@@ -522,7 +525,10 @@ async def test_search_with_filter(self, workspace_id: str):
         logger.info(f"Filtered search returned {len(results)} results (node_type in [tech, tech_new])")
         for i, r in enumerate(results, 1):
             logger.info(f"Filtered Result {i}: {r.model_dump(exclude={'vector'})}")
-            assert r.metadata.get("node_type") in ["tech", "tech_new"], "All results should have node_type in [tech, tech_new]"
+            assert r.metadata.get("node_type") in [
+                "tech",
+                "tech_new",
+            ], "All results should have node_type in [tech, tech_new]"
 
     async def test_search_with_id(self, workspace_id: str):
         """Test async vector search by unique_id with empty query."""

From 0d2f625517c521f08a4135069e6053b3bd290981 Mon Sep 17 00:00:00 2001
From: "jinli.yl" <jinli.yl@alibaba-inc.com>
Date: Mon, 15 Dec 2025 20:30:46 +0800
Subject: [PATCH 3/3] refactor(core): remove ContentBlockType enum and update
 ContentBlock schema

---
 flowllm/core/enumeration/__init__.py           |  2 --
 flowllm/core/enumeration/content_block_type.py | 12 ------------
 flowllm/core/schema/message.py                 | 13 ++++---------
 3 files changed, 4 insertions(+), 23 deletions(-)
 delete mode 100644 flowllm/core/enumeration/content_block_type.py

diff --git a/flowllm/core/enumeration/__init__.py b/flowllm/core/enumeration/__init__.py
index e234c5c..f765040 100644
--- a/flowllm/core/enumeration/__init__.py
+++ b/flowllm/core/enumeration/__init__.py
@@ -1,14 +1,12 @@
 """Core enumeration module."""
 
 from .chunk_enum import ChunkEnum
-from .content_block_type import ContentBlockType
 from .http_enum import HttpEnum
 from .registry_enum import RegistryEnum
 from .role import Role
 
 __all__ = [
     "ChunkEnum",
-    "ContentBlockType",
     "HttpEnum",
     "RegistryEnum",
     "Role",
diff --git a/flowllm/core/enumeration/content_block_type.py b/flowllm/core/enumeration/content_block_type.py
deleted file mode 100644
index cc9a42f..0000000
--- a/flowllm/core/enumeration/content_block_type.py
+++ /dev/null
@@ -1,12 +0,0 @@
-"""Content block type enumeration for multimodal content."""
-
-from enum import Enum
-
-
-class ContentBlockType(str, Enum):
-    """Enumeration of content block types in multimodal responses."""
-
-    TEXT = "text"
-    IMAGE_URL = "image_url"
-    AUDIO = "audio"
-    VIDEO = "video"
diff --git a/flowllm/core/schema/message.py b/flowllm/core/schema/message.py
index 2587ec5..b62b42b 100644
--- a/flowllm/core/schema/message.py
+++ b/flowllm/core/schema/message.py
@@ -6,7 +6,7 @@
 from pydantic import BaseModel, ConfigDict, Field, model_validator
 
 from .tool_call import ToolCall
-from ..enumeration import Role, ContentBlockType
+from ..enumeration import Role
 
 
 class ContentBlock(BaseModel):
@@ -35,7 +35,7 @@ class ContentBlock(BaseModel):
 
     model_config = ConfigDict(extra="allow")
 
-    type: ContentBlockType = Field(default="")
+    type: str = Field(default="")
     content: str | dict | list = Field(default="")
 
     @model_validator(mode="before")
@@ -51,8 +51,8 @@ def init_block(cls, data: dict):
     def simple_dump(self) -> dict:
         """Convert ContentBlock to a simple dictionary format."""
         result = {
-            "type": self.type.value,
-            self.type.value: self.content,
+            "type": self.type,
+            self.type: self.content,
             **self.model_extra,
         }
 
@@ -95,11 +95,6 @@ def simple_dump(self, add_reasoning: bool = True) -> dict:
 
         return result
 
-    @property
-    def string_buffer(self) -> str:
-        """Get a string representation of the message with role and content."""
-        return f"{self.role.value}: {self.dump_content()}"
-
 
 class Trajectory(BaseModel):
     """Represents a conversation trajectory with messages and optional scoring."""