Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
f593b58
fix: multi-model memreader init error
CaralHsi Nov 27, 2025
f79ea7c
fix: kwargs bug
CaralHsi Nov 27, 2025
192bdcb
feat: init examples for each multi-model parser
CaralHsi Nov 28, 2025
e9ebaa3
feat: simple user_parser
CaralHsi Nov 28, 2025
a99dd3b
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/multi-mode…
CaralHsi Nov 30, 2025
655d78d
feat: add multi-model-parser example
CaralHsi Nov 30, 2025
9617939
feat: add multi-model-parser example
CaralHsi Nov 30, 2025
04e65a9
feat: update user parser: only tackle with ChatCompletionUserMessageP…
CaralHsi Nov 30, 2025
c58cbd9
feat: rewrite create source and parse fast for system parser
CaralHsi Nov 30, 2025
41efcc9
feat: rewrite create source and parse fast for system parser
CaralHsi Nov 30, 2025
fd92727
feat: rewrite assistant parser
CaralHsi Nov 30, 2025
6e4b1e1
feat: add additional sources to assistant parser
CaralHsi Nov 30, 2025
2230b84
feat: add concat fast-mode memories from multi parsers
CaralHsi Nov 30, 2025
5d5184c
refactor: fix name
CaralHsi Nov 30, 2025
7693f80
refactor: fix name
CaralHsi Nov 30, 2025
023041f
refactor: fix name
CaralHsi Nov 30, 2025
86a6f57
refactor: fix name
CaralHsi Nov 30, 2025
9228a21
refactor: fix name
CaralHsi Nov 30, 2025
2996610
refactor: fix name
CaralHsi Nov 30, 2025
ec6666a
feat: add fine process path-A in multi_modal_struct
CaralHsi Dec 1, 2025
c4a14c4
feat: add fine process path-A in multi_modal_struct
CaralHsi Dec 1, 2025
ec68606
feat: add compare simple&multimodal example
CaralHsi Dec 1, 2025
32d83e4
feat: add _process_transfer_multi_modal_data in multimodal
CaralHsi Dec 1, 2025
da1d15a
fix: conflict
CaralHsi Dec 1, 2025
0be6fd7
feat: add image type
CaralHsi Dec 1, 2025
878872c
Merge branch 'dev' into feat/complete_multi_modal
CaralHsi Dec 1, 2025
da7e425
feat: add tool role; update string/text/tool parser
CaralHsi Dec 1, 2025
f445d45
Merge branch 'feat/complete_multi_modal' of github.com:CaralHsi/MemOS…
CaralHsi Dec 1, 2025
ac0e8fe
feat: update file_content_parser and multimodal reader
CaralHsi Dec 1, 2025
c155e63
Merge branch 'dev' into feat/complete_multi_modal
CaralHsi Dec 1, 2025
d22f329
feat: default mem-reader for api is now set to multimodal reader
CaralHsi Dec 1, 2025
ee6fa28
Merge branch 'feat/complete_multi_modal' of github.com:CaralHsi/MemOS…
CaralHsi Dec 1, 2025
fcb99da
Merge branch 'dev' into feat/complete_multi_modal
CaralHsi Dec 1, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,452 changes: 764 additions & 688 deletions examples/mem_reader/multimodal_struct_reader.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/memos/api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ def get_embedder_config() -> dict[str, Any]:
def get_reader_config() -> dict[str, Any]:
"""Get reader configuration."""
return {
"backend": os.getenv("MEM_READER_BACKEND", "simple_struct"),
"backend": os.getenv("MEM_READER_BACKEND", "multimodal_struct"),
"config": {
"chunk_type": os.getenv("MEM_READER_CHAT_CHUNK_TYPE", "default"),
"chunk_length": int(os.getenv("MEM_READER_CHAT_CHUNK_TOKEN_SIZE", 1600)),
Expand Down
2 changes: 2 additions & 0 deletions src/memos/mem_reader/read_multi_modal/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .assistant_parser import AssistantParser
from .base import BaseMessageParser
from .file_content_parser import FileContentParser
from .image_parser import ImageParser
from .multi_modal_parser import MultiModalParser
from .string_parser import StringParser
from .system_parser import SystemParser
Expand All @@ -29,6 +30,7 @@
"AssistantParser",
"BaseMessageParser",
"FileContentParser",
"ImageParser",
"MultiModalParser",
"StringParser",
"SystemParser",
Expand Down
4 changes: 4 additions & 0 deletions src/memos/mem_reader/read_multi_modal/assistant_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,10 @@ def parse_fast(
# Combine all content parts
content = " ".join(content_parts) if content_parts else ""

# Skip only when there is nothing to record; tool_calls, audio, or refusal alone still create a memory
if not content and not tool_calls and not audio and not refusal:
return []

parts = [f"{role}: "]
if chat_time:
parts.append(f"[{chat_time}]: ")
Expand Down
114 changes: 111 additions & 3 deletions src/memos/mem_reader/read_multi_modal/file_content_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,15 @@
from memos.embedders.base import BaseEmbedder
from memos.llms.base import BaseLLM
from memos.log import get_logger
from memos.memories.textual.item import SourceMessage, TextualMemoryItem
from memos.memories.textual.item import (
SourceMessage,
TextualMemoryItem,
TreeNodeTextualMemoryMetadata,
)
from memos.parsers.factory import ParserFactory
from memos.types.openai_chat_completion_types import File

from .base import BaseMessageParser
from .base import BaseMessageParser, _derive_key


logger = get_logger(__name__)
Expand Down Expand Up @@ -121,7 +125,111 @@ def parse_fast(
info: dict[str, Any],
**kwargs,
) -> list[TextualMemoryItem]:
return []
"""
Parse file content part in fast mode.

Fast mode extracts file information and creates a memory item without parsing file content.
Handles various file parameter scenarios:
- file_data: base64 encoded data, URL, or plain text content
- file_id: ID of an uploaded file
- filename: name of the file

Args:
message: File content part to parse (dict with "type": "file" and "file": {...})
info: Dictionary containing user_id and session_id
**kwargs: Additional parameters

Returns:
List of TextualMemoryItem objects
"""
if not isinstance(message, dict):
logger.warning(f"[FileContentParser] Expected dict, got {type(message)}")
return []

# Extract file information
file_info = message.get("file", {})
if not isinstance(file_info, dict):
logger.warning(f"[FileContentParser] Expected file dict, got {type(file_info)}")
return []

# Extract file parameters (all are optional)
file_data = file_info.get("file_data", "")
file_id = file_info.get("file_id", "")
filename = file_info.get("filename", "")

# Build content string based on available information
content_parts = []

# Priority 1: If file_data is provided, use it (could be base64, URL, or plain text)
if file_data:
# In fast mode, we don't decode base64 or fetch URLs, just record the reference
if isinstance(file_data, str):
# Check if it looks like base64 (starts with data: or is long base64 string)
if file_data.startswith("data:") or (
len(file_data) > 100
and all(
c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
for c in file_data[:100]
)
):
content_parts.append(f"[File Data (base64/encoded): {len(file_data)} chars]")
# Check if it looks like a URL
elif file_data.startswith(("http://", "https://", "file://")):
content_parts.append(f"[File URL: {file_data}]")
else:
# TODO: split into multiple memory items
content_parts.append(file_data)
else:
content_parts.append(f"[File Data: {type(file_data).__name__}]")

# Priority 2: If file_id is provided, reference it
if file_id:
content_parts.append(f"[File ID: {file_id}]")

# Priority 3: If filename is provided, include it
if filename:
content_parts.append(f"[Filename: {filename}]")

# If no content can be extracted, create a placeholder
if not content_parts:
content_parts.append("[File: unknown]")

# Combine content parts
content = " ".join(content_parts)

# Create source
source = self.create_source(message, info)

# Extract info fields
info_ = info.copy()
user_id = info_.pop("user_id", "")
session_id = info_.pop("session_id", "")

# For file content parts, default to LongTermMemory
# (since we don't have role information at this level)
memory_type = "LongTermMemory"

# Create memory item
memory_item = TextualMemoryItem(
memory=content,
metadata=TreeNodeTextualMemoryMetadata(
user_id=user_id,
session_id=session_id,
memory_type=memory_type,
status="activated",
tags=["mode:fast", "multimodal:file"],
key=_derive_key(content),
embedding=self.embedder.embed([content])[0],
usage=[],
sources=[source],
background="",
confidence=0.99,
type="fact",
info=info_,
),
)

return [memory_item]

def parse_fine(
self,
Expand Down
93 changes: 93 additions & 0 deletions src/memos/mem_reader/read_multi_modal/image_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""Parser for image_url content parts."""

from typing import Any

from memos.embedders.base import BaseEmbedder
from memos.llms.base import BaseLLM
from memos.log import get_logger
from memos.memories.textual.item import SourceMessage, TextualMemoryItem
from memos.types.openai_chat_completion_types import ChatCompletionContentPartImageParam

from .base import BaseMessageParser


logger = get_logger(__name__)


class ImageParser(BaseMessageParser):
    """Parser for image_url content parts.

    Converts an ``image_url`` content part to and from a ``SourceMessage``.
    Neither parse mode produces memory items yet: both ``parse_fast`` and
    ``parse_fine`` return an empty list.
    """

    # Marker placed in front of the URL in ``SourceMessage.content``.
    _CONTENT_PREFIX = "[image_url]: "
    # Detail level used when the content part does not provide one.
    _DEFAULT_DETAIL = "auto"

    def __init__(self, embedder: BaseEmbedder, llm: BaseLLM | None = None):
        """
        Initialize ImageParser.

        Args:
            embedder: Embedder for generating embeddings
            llm: Optional LLM for fine mode processing
        """
        super().__init__(embedder, llm)

    def create_source(
        self,
        message: ChatCompletionContentPartImageParam,
        info: dict[str, Any],
    ) -> SourceMessage:
        """
        Create SourceMessage from image_url content part.

        Args:
            message: Content part; expected to be a dict carrying an
                "image_url" entry (either a dict with "url"/"detail" or a
                bare value that is stringified). Non-dict input is
                stringified as a whole.
            info: Accepted for interface parity; not read by this parser.

        Returns:
            SourceMessage of type "image".
        """
        if not isinstance(message, dict):
            return SourceMessage(type="image", content=str(message))

        image_url = message.get("image_url")
        if isinstance(image_url, dict):
            # Explicit None values fall back to the same defaults as missing keys.
            url = image_url.get("url") or ""
            detail = image_url.get("detail") or self._DEFAULT_DETAIL
        else:
            # A missing or None image_url must not be recorded as the text "None".
            url = "" if image_url is None else str(image_url)
            detail = self._DEFAULT_DETAIL

        return SourceMessage(
            type="image",
            content=f"{self._CONTENT_PREFIX}{url}",
            original_part=message,
            url=url,
            detail=detail,
        )

    def rebuild_from_source(
        self,
        source: SourceMessage,
    ) -> ChatCompletionContentPartImageParam:
        """
        Rebuild image_url content part from SourceMessage.

        Args:
            source: SourceMessage to convert back into a content part.

        Returns:
            The stored ``original_part`` when present and truthy; otherwise a
            new ``{"type": "image_url", "image_url": {...}}`` dict built from
            the source's ``url``/``detail`` (or its ``content``).
        """
        # Use original_part if available
        original_part = getattr(source, "original_part", None)
        if original_part:
            return original_part

        # Rebuild from source fields. Only a leading marker is stripped, so a
        # URL that happens to contain the marker text is left intact.
        url = getattr(source, "url", None) or (source.content or "").removeprefix(
            self._CONTENT_PREFIX
        )
        # An attribute that exists but is unset (None/empty) still gets the default.
        detail = getattr(source, "detail", None) or self._DEFAULT_DETAIL
        return {
            "type": "image_url",
            "image_url": {
                "url": url,
                "detail": detail,
            },
        }

    def parse_fast(
        self,
        message: ChatCompletionContentPartImageParam,
        info: dict[str, Any],
        **kwargs,
    ) -> list[TextualMemoryItem]:
        """
        Parse image_url in fast mode.

        Args:
            message: image_url content part (unused).
            info: Request info dict (unused).
            **kwargs: Additional parameters (unused).

        Returns:
            Always an empty list: images are not processed in fast mode.
        """
        # Images need a vision model, so fast mode deliberately produces nothing.
        return []

    def parse_fine(
        self,
        message: ChatCompletionContentPartImageParam,
        info: dict[str, Any],
        **kwargs,
    ) -> list[TextualMemoryItem]:
        """
        Parse image_url in fine mode.

        Placeholder for future vision model integration.

        Args:
            message: image_url content part (unused).
            info: Request info dict (unused).
            **kwargs: Additional parameters (unused).

        Returns:
            Always an empty list for now.
        """
        # Fine mode would use a vision model to extract text from the image.
        return []
8 changes: 7 additions & 1 deletion src/memos/mem_reader/read_multi_modal/multi_modal_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .assistant_parser import AssistantParser
from .base import BaseMessageParser
from .file_content_parser import FileContentParser
from .image_parser import ImageParser
from .string_parser import StringParser
from .system_parser import SystemParser
from .text_content_parser import TextContentParser
Expand Down Expand Up @@ -55,7 +56,7 @@ def __init__(
self.tool_parser = ToolParser(embedder, llm)
self.text_content_parser = TextContentParser(embedder, llm)
self.file_content_parser = FileContentParser(embedder, llm, parser)
self.image_parser = None # future
self.image_parser = ImageParser(embedder, llm)
self.audio_parser = None # future

self.role_parsers = {
Expand All @@ -69,7 +70,12 @@ def __init__(
"text": self.text_content_parser,
"file": self.file_content_parser,
"image": self.image_parser,
"image_url": self.image_parser, # Support both "image" and "image_url"
"audio": self.audio_parser,
# Custom tool formats
"tool_description": self.tool_parser,
"tool_input": self.tool_parser,
"tool_output": self.tool_parser,
}

def _get_parser(self, message: Any) -> BaseMessageParser | None:
Expand Down
Loading
Loading