Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/memos/mem_feedback/feedback.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from memos.mem_feedback.base import BaseMemFeedback
from memos.mem_feedback.utils import should_keep_update, split_into_chunks
from memos.mem_reader.factory import MemReaderFactory
from memos.mem_reader.simple_struct import detect_lang
from memos.mem_reader.read_multi_modal import detect_lang
from memos.memories.textual.item import TextualMemoryItem, TreeNodeTextualMemoryMetadata
from memos.memories.textual.tree_text_memory.organize.manager import (
MemoryManager,
Expand Down
4 changes: 2 additions & 2 deletions src/memos/mem_reader/multi_modal_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from memos import log
from memos.configs.mem_reader import MultiModalStructMemReaderConfig
from memos.context.context import ContextThreadPoolExecutor
from memos.mem_reader.read_multi_modal import MultiModalParser
from memos.mem_reader.simple_struct import SimpleStructMemReader, detect_lang
from memos.mem_reader.read_multi_modal import MultiModalParser, detect_lang
from memos.mem_reader.simple_struct import SimpleStructMemReader
from memos.memories.textual.item import TextualMemoryItem
from memos.templates.tool_mem_prompts import TOOL_TRAJECTORY_PROMPT_EN, TOOL_TRAJECTORY_PROMPT_ZH
from memos.types import MessagesType
Expand Down
3 changes: 2 additions & 1 deletion src/memos/mem_reader/read_multi_modal/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from .text_content_parser import TextContentParser
from .tool_parser import ToolParser
from .user_parser import UserParser
from .utils import coerce_scene_data, extract_role
from .utils import coerce_scene_data, detect_lang, extract_role


__all__ = [
Expand All @@ -38,5 +38,6 @@
"ToolParser",
"UserParser",
"coerce_scene_data",
"detect_lang",
"extract_role",
]
278 changes: 271 additions & 7 deletions src/memos/mem_reader/read_multi_modal/image_parser.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
"""Parser for image_url content parts."""

import json
import re

from typing import Any

from memos.embedders.base import BaseEmbedder
from memos.llms.base import BaseLLM
from memos.log import get_logger
from memos.memories.textual.item import SourceMessage, TextualMemoryItem
from memos.memories.textual.item import (
SourceMessage,
TextualMemoryItem,
TreeNodeTextualMemoryMetadata,
)
from memos.templates.mem_reader_prompts import IMAGE_ANALYSIS_PROMPT_EN, IMAGE_ANALYSIS_PROMPT_ZH
from memos.types.openai_chat_completion_types import ChatCompletionContentPartImageParam

from .base import BaseMessageParser
from .base import BaseMessageParser, _derive_key
from .utils import detect_lang


logger = get_logger(__name__)
Expand Down Expand Up @@ -43,7 +52,7 @@ def create_source(
detail = "auto"
return SourceMessage(
type="image",
content=f"[image_url]: {url}",
content=url,
original_part=message,
url=url,
detail=detail,
Expand Down Expand Up @@ -87,7 +96,262 @@ def parse_fine(
info: dict[str, Any],
**kwargs,
) -> list[TextualMemoryItem]:
"""Parse image_url in fine mode - placeholder for future vision model integration."""
# Fine mode processing would use vision models to extract text from images
# For now, return empty list
return []
"""
Parse image_url in fine mode using vision models to extract information from images.

Args:
message: Image message to parse
info: Dictionary containing user_id and session_id
**kwargs: Additional parameters (e.g., context_items, custom_tags)

Returns:
List of TextualMemoryItem objects extracted from the image
"""
if not self.llm:
logger.warning("[ImageParser] LLM not available for fine mode processing")
return []

# Extract image information
if not isinstance(message, dict):
logger.warning(f"[ImageParser] Expected dict, got {type(message)}")
return []

image_url = message.get("image_url", {})
if isinstance(image_url, dict):
url = image_url.get("url", "")
detail = image_url.get("detail", "auto")
else:
url = str(image_url)
detail = "auto"

if not url:
logger.warning("[ImageParser] No image URL found in message")
return []

# Create source for this image
source = self.create_source(message, info)

# Get context items if available
context_items = kwargs.get("context_items")

# Determine language from context if available
lang = "en"
if context_items:
for item in context_items:
if hasattr(item, "memory") and item.memory:
lang = detect_lang(item.memory)
break

# Select prompt based on language
image_analysis_prompt = (
IMAGE_ANALYSIS_PROMPT_ZH if lang == "zh" else IMAGE_ANALYSIS_PROMPT_EN
)

# Build messages with image content
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": image_analysis_prompt},
{
"type": "image_url",
"image_url": {
"url": url,
"detail": detail,
},
},
],
}
]

# Add context if available
if context_items:
context_text = ""
for item in context_items:
if hasattr(item, "memory") and item.memory:
context_text += f"{item.memory}\n"
if context_text:
messages.insert(
0,
{
"role": "system",
"content": f"Context from previous conversation:\n{context_text}",
},
)

try:
# Call LLM with vision model
response_text = self.llm.generate(messages)
if not response_text:
logger.warning("[ImageParser] Empty response from LLM")
return []

# Parse JSON response
response_json = self._parse_json_result(response_text)

# Extract memory items from response
memory_items = []
memory_list = response_json.get("memory list", [])

if not memory_list:
logger.warning("[ImageParser] No memory items extracted from image")
# Fallback: create a simple memory item with the summary
summary = response_json.get(
"summary", "Image analyzed but no specific memories extracted."
)
if summary:
memory_items.append(
self._create_memory_item(
value=summary,
info=info,
memory_type="LongTermMemory",
tags=["image", "visual"],
key=_derive_key(summary),
sources=[source],
background=summary,
)
)
return memory_items

# Create memory items from parsed response
for mem_data in memory_list:
try:
# Normalize memory_type
memory_type = (
mem_data.get("memory_type", "LongTermMemory")
.replace("长期记忆", "LongTermMemory")
.replace("用户记忆", "UserMemory")
)
if memory_type not in ["LongTermMemory", "UserMemory"]:
memory_type = "LongTermMemory"

value = mem_data.get("value", "").strip()
if not value:
continue

tags = mem_data.get("tags", [])
if not isinstance(tags, list):
tags = []
# Add image-related tags
if "image" not in [t.lower() for t in tags]:
tags.append("image")
if "visual" not in [t.lower() for t in tags]:
tags.append("visual")

key = mem_data.get("key", "")
background = response_json.get("summary", "")

memory_item = self._create_memory_item(
value=value,
info=info,
memory_type=memory_type,
tags=tags,
key=key if key else _derive_key(value),
sources=[source],
background=background,
)
memory_items.append(memory_item)
except Exception as e:
logger.error(f"[ImageParser] Error creating memory item: {e}")
continue

return memory_items

except Exception as e:
logger.error(f"[ImageParser] Error processing image in fine mode: {e}")
# Fallback: create a simple memory item
fallback_value = f"Image analyzed: {url}"
return [
self._create_memory_item(
value=fallback_value,
info=info,
memory_type="LongTermMemory",
tags=["image", "visual"],
key=_derive_key(fallback_value),
sources=[source],
background="Image processing encountered an error.",
)
]

def _parse_json_result(self, response_text: str) -> dict:
"""
Parse JSON result from LLM response.
Similar to SimpleStructMemReader.parse_json_result.
"""
s = (response_text or "").strip()

# Try to extract JSON from code blocks
m = re.search(r"```(?:json)?\s*([\s\S]*?)```", s, flags=re.I)
s = (m.group(1) if m else s.replace("```", "")).strip()

# Find first {
i = s.find("{")
if i == -1:
return {}
s = s[i:].strip()

try:
return json.loads(s)
except json.JSONDecodeError:
pass

# Try to find the last } or ]
j = max(s.rfind("}"), s.rfind("]"))
if j != -1:
try:
return json.loads(s[: j + 1])
except json.JSONDecodeError:
pass

# Try to close brackets
def _cheap_close(t: str) -> str:
t += "}" * max(0, t.count("{") - t.count("}"))
t += "]" * max(0, t.count("[") - t.count("]"))
return t

t = _cheap_close(s)
try:
return json.loads(t)
except json.JSONDecodeError as e:
if "Invalid \\escape" in str(e):
s = s.replace("\\", "\\\\")
try:
return json.loads(s)
except json.JSONDecodeError:
pass
logger.error(f"[ImageParser] Failed to parse JSON: {e}\nResponse: {response_text}")
return {}

def _create_memory_item(
    self,
    value: str,
    info: dict[str, Any],
    memory_type: str,
    tags: list[str],
    key: str,
    sources: list[SourceMessage],
    background: str = "",
) -> TextualMemoryItem:
    """
    Build a TextualMemoryItem whose memory text is `value`.

    Args:
        value: Memory text; it is also the text that gets embedded.
        info: Request info. "user_id" and "session_id" are lifted out of a
            copy of it into the metadata; whatever remains is stored as
            the metadata's `info`. The caller's dict is not modified.
        memory_type: Stored as the metadata's memory_type.
        tags: Stored as the metadata's tags.
        key: Stored as the metadata's key.
        sources: Source messages attached to the metadata.
        background: Optional background text attached to the metadata.

    Returns:
        A TextualMemoryItem with status "activated", type "fact",
        confidence 0.99 and an empty usage list.
    """
    # Copy first so popping the ids does not mutate the caller's dict.
    remaining_info = info.copy()
    owner_id = remaining_info.pop("user_id", "")
    conversation_id = remaining_info.pop("session_id", "")

    # The embedder takes a batch; embed the single value and unwrap it.
    vector = self.embedder.embed([value])[0]

    node_metadata = TreeNodeTextualMemoryMetadata(
        user_id=owner_id,
        session_id=conversation_id,
        memory_type=memory_type,
        status="activated",
        tags=tags,
        key=key,
        embedding=vector,
        usage=[],
        sources=sources,
        background=background,
        confidence=0.99,
        type="fact",
        info=remaining_info,
    )
    return TextualMemoryItem(memory=value, metadata=node_metadata)
2 changes: 2 additions & 0 deletions src/memos/mem_reader/read_multi_modal/multi_modal_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,8 @@ def process_transfer(
parser = self.file_content_parser
elif source.type == "text":
parser = self.text_content_parser
elif source.type in ["image", "image_url"]:
parser = self.image_parser
elif source.role:
# Chat message, use role parser
parser = self.role_parsers.get(source.role)
Expand Down
14 changes: 13 additions & 1 deletion src/memos/mem_reader/read_multi_modal/user_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,20 @@ def create_source(
original_part=part,
)
)
elif part_type == "image_url":
image_info = part.get("image_url", {})
sources.append(
SourceMessage(
type="image",
role=role,
chat_time=chat_time,
message_id=message_id,
image_path=image_info.get("url"),
original_part=part,
)
)
else:
# image_url, input_audio, etc.
# input_audio, etc.
sources.append(
SourceMessage(
type=part_type,
Expand Down
Loading
Loading