Skip to content

Commit d7aaf57

Browse files
authored
fix: parse in sim-struct chat-data; unify image analysis prompt with context injection & improve LLM robustness (#913)
* fix: parse in sim-struct chat-data
* fix: parse error in image
* feat: modify figure mem-reader with more context consideration
1 parent 0ab1389 commit d7aaf57

4 files changed

Lines changed: 105 additions & 34 deletions

File tree

src/memos/mem_reader/read_multi_modal/image_parser.py

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,17 @@ def parse_fine(
151151
IMAGE_ANALYSIS_PROMPT_ZH if lang == "zh" else IMAGE_ANALYSIS_PROMPT_EN
152152
)
153153

154+
# Add context if available
155+
context_text = ""
156+
if context_items:
157+
for item in context_items:
158+
if hasattr(item, "memory") and item.memory:
159+
context_text += f"{item.memory}\n"
160+
context_text = context_text.strip()
161+
162+
# Inject context into prompt when possible
163+
image_analysis_prompt = image_analysis_prompt.replace("{context}", context_text)
164+
154165
# Build messages with image content
155166
messages = [
156167
{
@@ -168,21 +179,6 @@ def parse_fine(
168179
}
169180
]
170181

171-
# Add context if available
172-
if context_items:
173-
context_text = ""
174-
for item in context_items:
175-
if hasattr(item, "memory") and item.memory:
176-
context_text += f"{item.memory}\n"
177-
if context_text:
178-
messages.insert(
179-
0,
180-
{
181-
"role": "system",
182-
"content": f"Context from previous conversation:\n{context_text}",
183-
},
184-
)
185-
186182
try:
187183
# Call LLM with vision model
188184
response_text = self.llm.generate(messages)
@@ -192,6 +188,9 @@ def parse_fine(
192188

193189
# Parse JSON response
194190
response_json = self._parse_json_result(response_text)
191+
if not response_json:
192+
logger.warning(f"[ImageParser] Fail to parse response from LLM: {response_text}")
193+
return []
195194

196195
# Extract memory items from response
197196
memory_items = []
@@ -323,8 +322,7 @@ def _cheap_close(t: str) -> str:
323322
return json.loads(s)
324323
except json.JSONDecodeError:
325324
pass
326-
logger.error(f"[ImageParser] Failed to parse JSON: {e}\nResponse: {response_text}")
327-
return {}
325+
logger.warning(f"[ImageParser] Failed to parse JSON: {e}\nResponse: {response_text}")
328326

329327
def _create_memory_item(
330328
self,

src/memos/mem_reader/simple_struct.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,22 @@ def _make_memory_item(
224224
),
225225
)
226226

227+
def _safe_generate(self, messages: list[dict]) -> str | None:
    """Call ``self.llm.generate`` on ``messages`` without letting errors escape.

    Args:
        messages: Chat messages handed unchanged to ``self.llm.generate``.

    Returns:
        The value returned by ``self.llm.generate``, or ``None`` if the call
        raised any ``Exception``.
    """
    generated = None
    try:
        generated = self.llm.generate(messages)
    except Exception:
        # Deliberately broad: any failure is logged with its traceback and
        # reported to the caller as None instead of propagating.
        logger.exception("[LLM] Generation failed")
    return generated
233+
234+
def _safe_parse(self, text: str | None) -> dict | None:
235+
if not text:
236+
return None
237+
try:
238+
return parse_json_result(text)
239+
except Exception:
240+
logger.warning("[LLM] JSON parse failed")
241+
return None
242+
227243
def _get_llm_response(self, mem_str: str, custom_tags: list[str] | None) -> dict:
228244
lang = detect_lang(mem_str)
229245
template = PROMPT_DICT["chat"][lang]
@@ -240,13 +256,13 @@ def _get_llm_response(self, mem_str: str, custom_tags: list[str] | None) -> dict
240256
if self.config.remove_prompt_example:
241257
prompt = prompt.replace(examples, "")
242258
messages = [{"role": "user", "content": prompt}]
243-
try:
244-
response_text = self.llm.generate(messages)
245-
response_json = parse_json_result(response_text)
246-
except Exception as e:
247-
logger.error(f"[LLM] Exception during chat generation: {e}")
248-
response_json = {
249-
"memory list": [
259+
260+
response_text = self._safe_generate(messages)
261+
response_json = self._safe_parse(response_text)
262+
263+
if not response_json:
264+
return {
265+
"memory_list": [
250266
{
251267
"key": mem_str[:10],
252268
"memory_type": "UserMemory",
@@ -256,6 +272,7 @@ def _get_llm_response(self, mem_str: str, custom_tags: list[str] | None) -> dict
256272
],
257273
"summary": mem_str,
258274
}
275+
259276
return response_json
260277

261278
def _iter_chat_windows(self, scene_data_info, max_tokens=None, overlap=200):

src/memos/mem_reader/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def _cheap_close(t: str) -> str:
7070
if "Invalid \\escape" in str(e):
7171
s = s.replace("\\", "\\\\")
7272
return json.loads(s)
73-
logger.error(
73+
logger.warning(
7474
f"[JSONParse] Failed to decode JSON: {e}\nTail: Raw {response_text} \
7575
json: {s}"
7676
)

src/memos/templates/mem_reader_prompts.py

Lines changed: 65 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -604,13 +604,12 @@
604604
你可以选择与memory相关的在上述列表中可以加入tags,同时你可以根据memory的内容自由添加tags。"""
605605

606606

607-
IMAGE_ANALYSIS_PROMPT_EN = """You are an intelligent memory assistant. Analyze the provided image and extract meaningful information that should be remembered.
607+
IMAGE_ANALYSIS_PROMPT_EN = """You are an intelligent memory assistant. Please analyze the provided image based on the contextual information (if any) and extract meaningful information that should be remembered.
608608
609609
Please extract:
610610
1. **Visual Content**: What objects, people, scenes, or text are visible in the image?
611-
2. **Context**: What is the context or situation depicted?
612-
3. **Key Information**: What important details, facts, or information can be extracted?
613-
4. **User Relevance**: What aspects of this image might be relevant to the user's memory?
611+
2. **Key Information**: What important details, facts, or information can be extracted?
612+
3. **User Relevance**: What aspects of this image might be relevant to the user's memory?
614613
615614
Return a valid JSON object with the following structure:
616615
{
@@ -630,16 +629,44 @@
630629
- The `key`, `value`, `tags`, `summary` and `memory_type` fields should match the language of the user's context if available, otherwise use English.
631630
- Keep `memory_type` in English.
632631
632+
Example:
633+
Reference context:
634+
role-user: I plan to carry this for hiking at Mount Siguniang
635+
role-Bob: Me too
636+
637+
Image URL to be analyzed: https://xxxxxx.jpg
638+
{
639+
"memory list": [
640+
{
641+
"key": "Cylindrical Carry-On Item Attached to Hiking Backpack",
642+
"memory_type": "LongTermMemory",
643+
"value": "An outdoor hiking backpack has a black cylindrical carry-on item secured to its side with webbing straps. The cylinder is positioned vertically, with a length close to the height of the backpack’s side pocket. The exterior is dark-colored with a textured or perforated surface, clearly designed for outdoor use and convenient access while walking.",
644+
"tags": ["outdoor", "hiking", "backpack", "side-mounted", "carry-on item"]
645+
},
646+
{
647+
"key": "Mount Siguniang Hiking Equipment Plan",
648+
"memory_type": "UserMemory",
649+
"value": "Both the user and Bob explicitly plan to carry this outdoor backpack during their hiking trip to Mount Siguniang, indicating that this carrying setup has been included in their preparation for a high-altitude hiking journey.",
650+
"tags": ["user plan", "Mount Siguniang", "hiking", "trekking trip"]
651+
}
652+
],
653+
"summary": "The image presents a typical hiking setup in an outdoor context. A hiking or travel backpack has a black cylindrical carry-on item attached to its side, suggesting a lightweight and practical configuration for long-distance walking. The overall visual tone emphasizes mobility and convenience. The accompanying text highlights ease of travel, no installation required, and suitability for carrying while on the move. Clear specifications for the cylindrical item are also shown, including its width (approximately 2.56 inches), height (approximately 9.76 inches), and net weight (about 1.45 pounds), underscoring its compact size and manageable weight. Combined with the provided context, this setup is planned for a hiking trip to Mount Siguniang, giving the image a clear personal usage scenario and long-term memory relevance."
654+
}
655+
656+
If context is provided, incorporate it into the extraction. If no context is given, extract only the key information from the image.
657+
658+
Reference context:
659+
{context}
660+
633661
Focus on extracting factual, observable information from the image. Avoid speculation unless clearly relevant to user memory."""
634662

635663

636-
IMAGE_ANALYSIS_PROMPT_ZH = """您是一个智能记忆助手。请分析提供的图像并提取应该被记住的有意义信息
664+
IMAGE_ANALYSIS_PROMPT_ZH = """您是一个智能记忆助手。请根据上下文信息(如有)分析提供的图像并提取应该被记住的有意义信息
637665
638666
请提取:
639667
1. **视觉内容**:图像中可见的物体、人物、场景或文字是什么?
640-
2. **上下文**:图像描绘了什么情境或情况?
641-
3. **关键信息**:可以提取哪些重要的细节、事实或信息?
642-
4. **用户相关性**:图像的哪些方面可能与用户的记忆相关?
668+
2. **关键信息**:可以提取哪些重要的细节、事实或信息?
669+
3. **用户相关性**:图像的哪些方面可能与用户的记忆相关?
643670
644671
返回一个有效的 JSON 对象,格式如下:
645672
{
@@ -659,7 +686,36 @@
659686
- `key`、`value`、`tags`、`summary` 和 `memory_type` 字段应该与用户上下文的语言匹配(如果可用),否则使用中文。
660687
- `memory_type` 保持英文。
661688
662-
专注于从图像中提取事实性、可观察的信息。除非与用户记忆明显相关,否则避免推测。"""
689+
例子:
690+
参考的上下文:
691+
role-user: 我打算背这个去四姑娘山徒步
692+
role-bob: 我也是
693+
694+
待解析的url:https://xxxxxx.jpg
695+
{
696+
"memory list": [
697+
{
698+
"key": "徒步背包侧挂圆柱形随行物品",
699+
"memory_type": "LongTermMemory",
700+
"value": "一只户外徒步背包侧面通过织带固定了一件黑色圆柱形随行物品。圆柱体纵向放置,长度接近背包侧袋高度,外壳为深色并带有防滑或透气纹理,整体外观明显为户外使用设计,方便在行走过程中快速取放。",
701+
"tags": ["户外", "徒步", "背包", "侧挂", "随行物品"]
702+
},
703+
{
704+
"key": "四姑娘山徒步随身装备计划",
705+
"memory_type": "UserMemory",
706+
"value": "用户和Bob明确计划在四姑娘山徒步行程中背负该款户外背包,说明这套背负方式已被纳入他们高海拔徒步行程的装备准备中。",
707+
"tags": ["用户计划", "四姑娘山", "徒步", "登山行程"]
708+
}
709+
],
710+
"summary": "画面展示了一种典型的徒步出行配置:一只登山或旅行背包侧边固定着一件黑色圆柱形随行物品,整体氛围明显指向户外行走和轻量化携带场景。画面中的文字强调轻便、无需安装、适合随身携带的使用理念,并直接给出了随行物品的尺寸与重量信息(宽度约2.56英寸、高度约9.76英寸、净重约1.45磅),突出了便于背负和长时间携行的特点。结合用户给出的背景,这套装备被计划用于四姑娘山徒步,具备清晰的个人使用情境和长期记忆价值。"
711+
}
712+
713+
如果给定了上下文,就结合上下文信息进行提取,如果没有给定上下文,请直接提取图片的关键信息。
714+
参考的上下文:
715+
{context}
716+
717+
专注于从图像中提取事实性、可观察的信息。除非与用户记忆明显相关,否则避免推测。
718+
"""
663719

664720

665721
SIMPLE_STRUCT_REWRITE_MEMORY_PROMPT_BACKUP = """

0 commit comments

Comments
 (0)