From 66aa231c7cfc3967296fd354dfd5675e00a6066e Mon Sep 17 00:00:00 2001 From: fridayL Date: Tue, 30 Dec 2025 15:21:12 +0800 Subject: [PATCH 1/2] feat: update source return and chunk settings --- src/memos/chunkers/markdown_chunker.py | 2 +- src/memos/mem_reader/read_multi_modal/base.py | 2 +- .../mem_reader/read_multi_modal/file_content_parser.py | 1 + src/memos/mem_reader/read_multi_modal/utils.py | 2 +- src/memos/reranker/strategies/concat_docsource.py | 7 ++++++- 5 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/memos/chunkers/markdown_chunker.py b/src/memos/chunkers/markdown_chunker.py index de375a4dc..b7771ac35 100644 --- a/src/memos/chunkers/markdown_chunker.py +++ b/src/memos/chunkers/markdown_chunker.py @@ -57,6 +57,6 @@ def chunk(self, text: str, **kwargs) -> list[str] | list[Chunk]: except Exception as e: logger.warning(f"warning chunking document: {e}") chunks.append(doc.page_content) - + logger.info(f"Generated chunks: {chunks[:5]}") logger.debug(f"Generated {len(chunks)} chunks from input text") return chunks diff --git a/src/memos/mem_reader/read_multi_modal/base.py b/src/memos/mem_reader/read_multi_modal/base.py index 7664f4d7f..1a756c5d0 100644 --- a/src/memos/mem_reader/read_multi_modal/base.py +++ b/src/memos/mem_reader/read_multi_modal/base.py @@ -258,7 +258,7 @@ def _split_text(self, text: str, is_markdown: bool = False) -> list[str]: if not text or not text.strip(): return [] - splitter = get_text_splitter() + splitter = get_text_splitter(is_markdown=is_markdown) if not splitter: # If text splitter is not available, return text as single chunk return [text] if text.strip() else [] diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index 8fa0f2454..8808167eb 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -94,6 +94,7 @@ def _handle_url(self, url_str: str, filename: str) -> tuple[str, str | None, boo response = requests.get(url_str, timeout=30) response.raise_for_status() + response.encoding = 'utf-8' if not filename: filename = os.path.basename(parsed_url.path) or "downloaded_file" diff --git a/src/memos/mem_reader/read_multi_modal/utils.py b/src/memos/mem_reader/read_multi_modal/utils.py index cba8ddeda..d3d97b4e6 100644 --- a/src/memos/mem_reader/read_multi_modal/utils.py +++ b/src/memos/mem_reader/read_multi_modal/utils.py @@ -107,7 +107,7 @@ def _cheap_close(t: str) -> str: "config": {}, } -DEFAULT_CHUNK_SIZE = int(os.getenv("FILE_PARSER_CHUNK_SIZE", "1000")) +DEFAULT_CHUNK_SIZE = int(os.getenv("FILE_PARSER_CHUNK_SIZE", "1280")) DEFAULT_CHUNK_OVERLAP = int(os.getenv("FILE_PARSER_CHUNK_OVERLAP", "200")) diff --git a/src/memos/reranker/strategies/concat_docsource.py b/src/memos/reranker/strategies/concat_docsource.py index 0fb471218..d90452995 100644 --- a/src/memos/reranker/strategies/concat_docsource.py +++ b/src/memos/reranker/strategies/concat_docsource.py @@ -54,6 +54,7 @@ def prepare_documents( original_items = {} tracker = DialogueRankingTracker() documents = [] + documents_set = set() for item in graph_results: memory = getattr(item, "memory", None) if isinstance(memory, str): @@ -66,7 +67,11 @@ def prepare_documents( if source.type == "file": chunk_text += source.content if chunk_text: - documents.append(f"{memory}\n\n[Sources]:\n{chunk_text}") + if chunk_text in documents_set: + continue + else: + documents_set.add(chunk_text) + documents.append(f"{memory}\n\n[Sources]:\n{chunk_text}") else: documents.append(memory) return tracker, original_items, documents From dfaa1ed527babc16393f933f34aec86aef8f49f4 Mon Sep 17 00:00:00 2001 From: fridayL Date: Tue, 30 Dec 2025 15:35:11 +0800 Subject: [PATCH 2/2] feat: update code format --- docker/Dockerfile | 2 +- docker/requirements-full.txt | 2 +- docker/requirements.txt | 2 +- src/memos/mem_reader/read_multi_modal/file_content_parser.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 13fb477d9..76be1709d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -32,4 +32,4 @@ ENV PYTHONPATH=/app/src EXPOSE 8000 # Start the docker -CMD ["uvicorn", "memos.api.server_api:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] \ No newline at end of file +CMD ["uvicorn", "memos.api.server_api:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] diff --git a/docker/requirements-full.txt b/docker/requirements-full.txt index 538f5e578..57c26067f 100644 --- a/docker/requirements-full.txt +++ b/docker/requirements-full.txt @@ -183,4 +183,4 @@ psycopg2-binary==2.9.9 py-key-value-aio==0.2.8 py-key-value-shared==0.2.8 PyJWT==2.10.1 -pytest==9.0.2 \ No newline at end of file +pytest==9.0.2 diff --git a/docker/requirements.txt b/docker/requirements.txt index 738a53920..aa01fa626 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -125,4 +125,4 @@ urllib3==2.5.0 uvicorn==0.38.0 uvloop==0.22.1 watchfiles==1.1.1 -websockets==15.0.1 \ No newline at end of file +websockets==15.0.1 diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index 8808167eb..fbc704d0b 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -94,7 +94,7 @@ def _handle_url(self, url_str: str, filename: str) -> tuple[str, str | None, boo response = requests.get(url_str, timeout=30) response.raise_for_status() - response.encoding = 'utf-8' + response.encoding = "utf-8" if not filename: filename = os.path.basename(parsed_url.path) or "downloaded_file"