diff --git a/src/memos/memories/textual/tree_text_memory/retrieve/bochasearch.py b/src/memos/memories/textual/tree_text_memory/retrieve/bochasearch.py
index 042ed837e..133a85631 100644
--- a/src/memos/memories/textual/tree_text_memory/retrieve/bochasearch.py
+++ b/src/memos/memories/textual/tree_text_memory/retrieve/bochasearch.py
@@ -12,7 +12,11 @@
 from memos.embedders.factory import OllamaEmbedder
 from memos.log import get_logger
 from memos.mem_reader.base import BaseMemReader
-from memos.memories.textual.item import SourceMessage, TextualMemoryItem
+from memos.memories.textual.item import (
+    SearchedTreeNodeTextualMemoryMetadata,
+    SourceMessage,
+    TextualMemoryItem,
+)
 
 
 logger = get_logger(__name__)
@@ -138,7 +142,7 @@ def __init__(
         self.reader = reader
 
     def retrieve_from_internet(
-        self, query: str, top_k: int = 10, parsed_goal=None, info=None
+        self, query: str, top_k: int = 10, parsed_goal=None, info=None, mode="fast"
     ) -> list[TextualMemoryItem]:
         """
         Default internet retrieval (Web Search).
@@ -155,24 +159,24 @@
         """
         search_results = self.bocha_api.search_ai(query)  # ✅ default to
         # web-search
-        return self._convert_to_mem_items(search_results, query, parsed_goal, info)
+        return self._convert_to_mem_items(search_results, query, parsed_goal, info, mode=mode)
 
     def retrieve_from_web(
-        self, query: str, top_k: int = 10, parsed_goal=None, info=None
+        self, query: str, top_k: int = 10, parsed_goal=None, info=None, mode="fast"
     ) -> list[TextualMemoryItem]:
         """Explicitly retrieve using Bocha Web Search."""
         search_results = self.bocha_api.search_web(query)
-        return self._convert_to_mem_items(search_results, query, parsed_goal, info)
+        return self._convert_to_mem_items(search_results, query, parsed_goal, info, mode=mode)
 
     def retrieve_from_ai(
-        self, query: str, top_k: int = 10, parsed_goal=None, info=None
+        self, query: str, top_k: int = 10, parsed_goal=None, info=None, mode="fast"
     ) -> list[TextualMemoryItem]:
         """Explicitly retrieve using Bocha AI Search."""
         search_results = self.bocha_api.search_ai(query)
-        return self._convert_to_mem_items(search_results, query, parsed_goal, info)
+        return self._convert_to_mem_items(search_results, query, parsed_goal, info, mode=mode)
 
     def _convert_to_mem_items(
-        self, search_results: list[dict], query: str, parsed_goal=None, info=None
+        self, search_results: list[dict], query: str, parsed_goal=None, info=None, mode="fast"
     ):
         """Convert API search results into TextualMemoryItem objects."""
         memory_items = []
@@ -181,7 +185,7 @@
 
         with ContextThreadPoolExecutor(max_workers=8) as executor:
             futures = [
-                executor.submit(self._process_result, r, query, parsed_goal, info)
+                executor.submit(self._process_result, r, query, parsed_goal, info, mode=mode)
                 for r in search_results
             ]
             for future in as_completed(futures):
@@ -195,7 +199,7 @@
         return list(unique_memory_items.values())
 
     def _process_result(
-        self, result: dict, query: str, parsed_goal: str, info: dict[str, Any]
+        self, result: dict, query: str, parsed_goal: str, info: dict[str, Any], mode="fast"
     ) -> list[TextualMemoryItem]:
         """Process one Bocha search result into TextualMemoryItem."""
         title = result.get("name", "")
@@ -216,27 +220,63 @@
         else:
             publish_time = datetime.now().strftime("%Y-%m-%d")
 
-        # Use reader to split and process the content into chunks
-        read_items = self.reader.get_memory([content], type="doc", info=info)
-
-        memory_items = []
-        for read_item_i in read_items[0]:
-            read_item_i.memory = (
-                f"[Outer internet view] Title: {title}\nNewsTime:"
-                f" {publish_time}\nSummary:"
-                f" {summary}\n"
-                f"Content: {read_item_i.memory}"
-            )
-            read_item_i.metadata.source = "web"
-            read_item_i.metadata.memory_type = "OuterMemory"
-            read_item_i.metadata.sources = [SourceMessage(type="web", url=url)] if url else []
-            read_item_i.metadata.visibility = "public"
-            read_item_i.metadata.internet_info = {
-                "title": title,
-                "url": url,
-                "site_name": site_name,
-                "site_icon": site_icon,
-                "summary": summary,
-            }
-            memory_items.append(read_item_i)
-        return memory_items
+        if mode == "fast":
+            info_ = info.copy()
+            user_id = info_.pop("user_id", "")
+            session_id = info_.pop("session_id", "")
+            return [
+                TextualMemoryItem(
+                    memory=(
+                        f"[Outer internet view] Title: {title}\nNewsTime:"
+                        f" {publish_time}\nSummary:"
+                        f" {summary}\n"
+                    ),
+                    metadata=SearchedTreeNodeTextualMemoryMetadata(
+                        user_id=user_id,
+                        session_id=session_id,
+                        memory_type="OuterMemory",
+                        status="activated",
+                        type="fact",
+                        source="web",
+                        sources=[SourceMessage(type="web", url=url)] if url else [],
+                        visibility="public",
+                        info=info_,
+                        background="",
+                        confidence=0.99,
+                        usage=[],
+                        embedding=self.embedder.embed([content])[0],
+                        internet_info={
+                            "title": title,
+                            "url": url,
+                            "site_name": site_name,
+                            "site_icon": site_icon,
+                            "summary": summary,
+                        },
+                    ),
+                )
+            ]
+        else:
+            # Use reader to split and process the content into chunks
+            read_items = self.reader.get_memory([content], type="doc", info=info)
+
+            memory_items = []
+            for read_item_i in read_items[0]:
+                read_item_i.memory = (
+                    f"[Outer internet view] Title: {title}\nNewsTime:"
+                    f" {publish_time}\nSummary:"
+                    f" {summary}\n"
+                    f"Content: {read_item_i.memory}"
+                )
+                read_item_i.metadata.source = "web"
+                read_item_i.metadata.memory_type = "OuterMemory"
+                read_item_i.metadata.sources = [SourceMessage(type="web", url=url)] if url else []
+                read_item_i.metadata.visibility = "public"
+                read_item_i.metadata.internet_info = {
+                    "title": title,
+                    "url": url,
+                    "site_name": site_name,
+                    "site_icon": site_icon,
+                    "summary": summary,
+                }
+                memory_items.append(read_item_i)
+            return memory_items
diff --git a/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py b/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py
index fa91bd4f8..eae96ccac 100644
--- a/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py
+++ b/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py
@@ -536,7 +536,7 @@ def _retrieve_from_internet(
             return []
         logger.info(f"[PATH-C] '{query}' Retrieving from internet...")
         items = self.internet_retriever.retrieve_from_internet(
-            query=query, top_k=top_k, parsed_goal=parsed_goal, info=info
+            query=query, top_k=top_k, parsed_goal=parsed_goal, info=info, mode=mode
         )
         logger.info(f"[PATH-C] '{query}' Retrieved from internet {len(items)} items: {items}")
         return self.reranker.rerank(
diff --git a/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py b/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py
index e5acd00f5..ab12a0647 100644
--- a/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py
+++ b/src/memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py
@@ -12,7 +12,11 @@
 from memos.embedders.factory import OllamaEmbedder
 from memos.log import get_logger
 from memos.mem_reader.base import BaseMemReader
-from memos.memories.textual.item import SourceMessage, TextualMemoryItem
+from memos.memories.textual.item import (
+    SearchedTreeNodeTextualMemoryMetadata,
+    SourceMessage,
+    TextualMemoryItem,
+)
 
 
 logger = get_logger(__name__)
@@ -132,7 +136,7 @@ def __init__(
         self.reader = reader
 
     def retrieve_from_internet(
-        self, query: str, top_k: int = 10, parsed_goal=None, info=None
+        self, query: str, top_k: int = 10, parsed_goal=None, info=None, mode="fast"
     ) -> list[TextualMemoryItem]:
         """
         Retrieve information from Xinyu search and convert to TextualMemoryItem format
@@ -153,7 +157,7 @@
 
         with ContextThreadPoolExecutor(max_workers=8) as executor:
             futures = [
-                executor.submit(self._process_result, result, query, parsed_goal, info)
+                executor.submit(self._process_result, result, query, parsed_goal, info, mode=mode)
                 for result in search_results
            ]
             for future in as_completed(futures):
@@ -303,7 +307,7 @@ def _extract_tags(self, title: str, content: str, summary: str, parsed_goal=None
         return list(set(tags))[:15]  # Limit to 15 tags
 
     def _process_result(
-        self, result: dict, query: str, parsed_goal: str, info: None
+        self, result: dict, query: str, parsed_goal: str, info: None, mode="fast"
     ) -> list[TextualMemoryItem]:
         if not info:
             info = {"user_id": "", "session_id": ""}
@@ -323,18 +327,59 @@
         else:
             publish_time = datetime.now().strftime("%Y-%m-%d")
 
-        read_items = self.reader.get_memory([content], type="doc", info=info)
-
-        memory_items = []
-        for read_item_i in read_items[0]:
-            read_item_i.memory = (
-                f"Title: {title}\nNewsTime: {publish_time}\nSummary: {summary}\n"
-                f"Content: {read_item_i.memory}"
-            )
-            read_item_i.metadata.source = "web"
-            read_item_i.metadata.memory_type = "OuterMemory"
-            read_item_i.metadata.sources = [SourceMessage(type="web", url=url)] if url else []
-            read_item_i.metadata.visibility = "public"
-
-            memory_items.append(read_item_i)
-        return memory_items
+        if mode == "fast":
+            info_ = info.copy()
+            user_id = info_.pop("user_id", "")
+            session_id = info_.pop("session_id", "")
+            return [
+                TextualMemoryItem(
+                    memory=(
+                        f"[Outer internet view] Title: {title}\nNewsTime:"
+                        f" {publish_time}\nSummary:"
+                        f" {summary}\n"
+                    ),
+                    metadata=SearchedTreeNodeTextualMemoryMetadata(
+                        user_id=user_id,
+                        session_id=session_id,
+                        memory_type="OuterMemory",
+                        status="activated",
+                        type="fact",
+                        source="web",
+                        sources=[SourceMessage(type="web", url=url)] if url else [],
+                        visibility="public",
+                        info=info_,
+                        background="",
+                        confidence=0.99,
+                        usage=[],
+                        embedding=self.embedder.embed([content])[0],
+                        internet_info={
+                            "title": title,
+                            "url": url,
+                            "summary": summary,
+                            "content": content,
+                        },
+                    ),
+                )
+            ]
+        else:
+            read_items = self.reader.get_memory([content], type="doc", info=info)
+
+            memory_items = []
+            for read_item_i in read_items[0]:
+                read_item_i.memory = (
+                    f"Title: {title}\nNewsTime: {publish_time}\nSummary: {summary}\n"
+                    f"Content: {read_item_i.memory}"
+                )
+                read_item_i.metadata.source = "web"
+                read_item_i.metadata.memory_type = "OuterMemory"
+                read_item_i.metadata.sources = [SourceMessage(type="web", url=url)] if url else []
+                read_item_i.metadata.visibility = "public"
+                read_item_i.metadata.internet_info = {
+                    "title": title,
+                    "url": url,
+                    "summary": summary,
+                    "content": content,
+                }
+
+                memory_items.append(read_item_i)
+            return memory_items
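
Usage note (not part of the patch): every retriever entry point now accepts a mode parameter defaulting to "fast", and the searcher threads its own mode value through to retrieve_from_internet. Below is a minimal sketch of the two paths, assuming an already-constructed retriever instance (construction is not shown in this diff); the query and info values are hypothetical.

    # Fast path: one TextualMemoryItem per search result, built directly from
    # the title/summary and embedded via self.embedder, skipping the reader's
    # chunking pipeline entirely.
    items = retriever.retrieve_from_internet(
        query="memory systems for LLM agents",       # hypothetical query
        top_k=10,
        info={"user_id": "u1", "session_id": "s1"},  # hypothetical ids
        mode="fast",
    )

    # Any value other than "fast" falls through to the original path, which
    # splits each result's full content into chunks with reader.get_memory(...)
    # and then tags each chunk as OuterMemory.
    items = retriever.retrieve_from_internet(
        query="memory systems for LLM agents",
        top_k=10,
        info={"user_id": "u1", "session_id": "s1"},
        mode="fine",                                 # any non-"fast" value
    )

The trade-off is latency versus granularity: the fast branch avoids the reader and per-chunk processing but stores only the title/summary text (with the full content reflected in the embedding and, for Xinyu, in internet_info), while the slow branch produces one item per chunk of the fetched content.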