Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
587e5a7
feat: update memos headers
fridayL Nov 19, 2025
5637c9d
feat: headers add
fridayL Nov 19, 2025
68831c0
feat: update search agent
fridayL Nov 20, 2025
58c512d
feat: update mem story
fridayL Nov 21, 2025
a497d46
feat: update mem scheduler
fridayL Nov 21, 2025
bd72e9b
feat: update deepsearch mem code
fridayL Nov 21, 2025
91664dc
Merge branch 'dev_new' into feat/deep-search
fridayL Nov 22, 2025
f332ef2
feat: update deepsearch agent
fridayL Nov 22, 2025
c21fc58
feat: update test code
fridayL Nov 22, 2025
fca3776
Merge branch 'dev_new' into feat/deep-search
fridayL Nov 24, 2025
0f62af8
fix: remove dup config
fridayL Nov 24, 2025
5f0a97c
Merge branch 'dev' into feat/deep-search
fridayL Nov 24, 2025
dac3394
feat: dock search pipeline
fridayL Nov 25, 2025
f38115c
Merge branch 'feat/deep-search' of https://github.com/fridayL/MemOS i…
fridayL Nov 25, 2025
696692d
Merge branch 'dev_new' into feat/deep-search
fridayL Nov 25, 2025
9489d54
fix: code test
fridayL Nov 25, 2025
e43e5db
feat: add test scripts
fridayL Nov 25, 2025
ecd4508
feat: add test
fridayL Nov 25, 2025
6e21032
feat: update need_raw process
fridayL Nov 25, 2025
fac355d
Merge branch 'dev_new' into feat/deep-search
fridayL Nov 25, 2025
592f637
fix: add initter
fridayL Nov 25, 2025
df4a66f
Merge branch 'dev_new' into feat/deep-search
fridayL Nov 25, 2025
fbdd07a
Merge branch 'dev_new' into feat/deep-search
fridayL Nov 27, 2025
ad99745
fix: change agent search func name
fridayL Nov 27, 2025
e203755
Merge branch 'dev' into feat/deep-search
fridayL Nov 27, 2025
ca780ea
Merge branch 'dev_new' into feat/deep-search
fridayL Nov 27, 2025
1b67652
Merge branch 'feat/deep-search' of https://github.com/fridayL/MemOS i…
fridayL Nov 27, 2025
94dba83
feat: update logs and defined
fridayL Nov 28, 2025
64414ea
Merge branch 'dev' into feat/deep-search
fridayL Nov 28, 2025
34e9ea4
Merge branch 'dev_new' into feat/deep-search
fridayL Nov 28, 2025
f361d1f
Merge branch 'feat/deep-search' of https://github.com/fridayL/MemOS i…
fridayL Nov 28, 2025
b3acc98
Merge branch 'dev_new' into feat/deep-search
fridayL Dec 1, 2025
953872e
feat: update full text mem search
fridayL Dec 1, 2025
20438e9
Merge branch 'dev_new' into feat/deep-search
fridayL Dec 1, 2025
2591c10
feat: cp plugin to dev
fridayL Dec 1, 2025
4836670
Merge branch 'dev_new' into feat/deep-search
fridayL Dec 1, 2025
383eaaa
feat: add one recall for fulltext retrieval
fridayL Dec 1, 2025
502e15e
fix: set default for fulltext search
fridayL Dec 2, 2025
f33aa47
Merge branch 'dev_new' into feat/deep-search
fridayL Dec 2, 2025
861e489
feat: add langchain chunk
fridayL Dec 2, 2025
10293bf
Merge branch 'dev_new' into feat/deep-search
fridayL Dec 2, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions examples/mem_chunk/markdown_chunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from memos.chunkers import ChunkerFactory
from memos.configs.chunker import ChunkerConfigFactory


# Example: chunk a small markdown document with the "markdown" chunker backend.

# Backend-specific settings; "recursive" additionally runs the recursive
# character splitter over each header section.
markdown_settings = {
    "chunk_size": 1000,
    "chunk_overlap": 100,
    "recursive": True,
}
config = ChunkerConfigFactory.model_validate({"backend": "markdown", "config": markdown_settings})

# Instantiate the chunker that matches the validated configuration.
chunker = ChunkerFactory.from_config(config)

# Sample input: three top-level header sections.
text = """
# Header 1
This is the first sentence. This is the second sentence.
And here's a third one with some additional context.

# Header 2
This is the fourth sentence. This is the fifth sentence.
And here's a sixth one with some additional context.

# Header 3
This is the seventh sentence. This is the eighth sentence.
And here's a ninth one with some additional context.
"""

# Print every produced chunk on its own line.
chunks = chunker.chunk(text)
for piece in chunks:
    print("doc:", piece)
324 changes: 56 additions & 268 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ mem-user = [
mem-reader = [
"chonkie (>=1.0.7,<2.0.0)", # Sentence chunking library
"markitdown[docx,pdf,pptx,xls,xlsx] (>=0.1.1,<0.2.0)", # Markdown parser for various file formats
"langchain-text-splitters (>=1.0.0,<2.0.0)", # LangChain text splitters, used by the markdown chunker
]

# PreferenceTextMemory
Expand All @@ -105,6 +106,7 @@ all = [
"pika (>=1.3.2,<2.0.0)",
"pymysql (>=1.1.0,<2.0.0)",
"chonkie (>=1.0.7,<2.0.0)",
"langchain-text-splitters (>=1.0.0,<2.0.0)",
"markitdown[docx,pdf,pptx,xls,xlsx] (>=0.1.1,<0.2.0)",
"pymilvus (>=2.6.1,<3.0.0)",
"datasketch (>=1.6.5,<2.0.0)",
Expand Down Expand Up @@ -174,7 +176,6 @@ bert-score = "^0.3.13"
scipy = "^1.10.1"
python-dotenv = "^1.1.1"
langgraph = "^0.5.1"
langmem = "^0.0.27"


[tool.poetry.group.mem-user.dependencies]
Expand Down
2 changes: 2 additions & 0 deletions src/memos/chunkers/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from memos.configs.chunker import ChunkerConfigFactory

from .base import BaseChunker
from .markdown_chunker import MarkdownChunker
from .sentence_chunker import SentenceChunker


Expand All @@ -11,6 +12,7 @@ class ChunkerFactory:

backend_to_class: ClassVar[dict[str, Any]] = {
"sentence": SentenceChunker,
"markdown": MarkdownChunker,
}

@classmethod
Expand Down
53 changes: 53 additions & 0 deletions src/memos/chunkers/markdown_chunker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from memos.configs.chunker import MarkdownChunkerConfig
from memos.dependency import require_python_package
from memos.log import get_logger

from .base import BaseChunker, Chunk


logger = get_logger(__name__)


class MarkdownChunker(BaseChunker):
    """Markdown-based text chunker.

    Text is first split with ``MarkdownHeaderTextSplitter`` using the headers
    listed in the config. When ``config.recursive`` is set, every header
    section is split again with ``RecursiveCharacterTextSplitter`` using
    ``config.chunk_size`` and ``config.chunk_overlap``.
    """

    @require_python_package(
        import_name="langchain_text_splitters",
        install_command="pip install langchain_text_splitters==1.0.0",
        install_link="https://github.com/langchain-ai/langchain-text-splitters",
    )
    def __init__(self, config: MarkdownChunkerConfig):
        """Build the header splitter and, optionally, the recursive splitter.

        Args:
            config: Markdown chunker configuration. Reads
                ``headers_to_split_on``, ``strip_headers``, ``recursive``,
                ``chunk_size`` and ``chunk_overlap``.
        """
        # Imported lazily so the package is only required when this backend is used.
        from langchain_text_splitters import (
            MarkdownHeaderTextSplitter,
            RecursiveCharacterTextSplitter,
        )

        self.config = config
        self.chunker = MarkdownHeaderTextSplitter(
            headers_to_split_on=config.headers_to_split_on,
            strip_headers=config.strip_headers,
        )
        # Stays None unless recursive splitting is requested.
        self.chunker_recursive = None
        logger.info(f"Initialized MarkdownHeaderTextSplitter with config: {config}")
        if config.recursive:
            self.chunker_recursive = RecursiveCharacterTextSplitter(
                chunk_size=config.chunk_size,
                chunk_overlap=config.chunk_overlap,
            )

    def chunk(self, text: str) -> list[str] | list[Chunk]:
        """Split markdown text into chunks along its headers.

        Each returned chunk is the section's header metadata values joined by
        spaces, a newline, and then the section content. Sections without any
        header metadata are returned as their content only.

        Args:
            text: Markdown text to split.

        Returns:
            A list of chunk strings, in the order produced by the splitters.
        """
        sections = self.chunker.split_text(text)
        if self.chunker_recursive is not None:
            # Further split each header section by character count.
            sections = self.chunker_recursive.split_documents(sections)

        chunks = []
        for doc in sections:
            # Tolerate a missing/None metadata mapping and non-string values
            # instead of raising and dropping the header context.
            metadata = getattr(doc, "metadata", None) or {}
            header = " ".join(str(value) for value in metadata.values())
            # Only prefix a header line when there is one; avoids a stray
            # leading newline on sections that precede the first header.
            chunks.append(f"{header}\n{doc.page_content}" if header else doc.page_content)

        logger.debug(f"Generated {len(chunks)} chunks from input text")
        return chunks
14 changes: 14 additions & 0 deletions src/memos/configs/chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,19 @@ class SentenceChunkerConfig(BaseChunkerConfig):
"""Configuration for sentence-based text chunker."""


class MarkdownChunkerConfig(BaseChunkerConfig):
    """Settings for the markdown chunker backend."""

    # (marker, name) pairs handed to the markdown header splitter.
    headers_to_split_on: list[tuple[str, str]] = Field(
        description="Headers to split on",
        default=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")],
    )
    # Forwarded as the header splitter's strip_headers option.
    strip_headers: bool = Field(
        description="Strip headers from the text",
        default=True,
    )
    # When True, MarkdownChunker also builds a recursive character splitter.
    recursive: bool = Field(
        description="Whether to use recursive character text splitter",
        default=False,
    )


class ChunkerConfigFactory(BaseConfig):
"""Factory class for creating chunker configurations."""

Expand All @@ -28,6 +41,7 @@ class ChunkerConfigFactory(BaseConfig):

backend_to_class: ClassVar[dict[str, Any]] = {
"sentence": SentenceChunkerConfig,
"markdown": MarkdownChunkerConfig,
}

@field_validator("backend")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -507,7 +507,10 @@ def _retrieve_simple(
user_name: str | None = None,
**kwargs,
):
"""Retrieve from by keywords and embedding"""
"""
Retrieve from by keywords and embedding, this func is hotfix for sources=plugin mode
will merge with fulltext retrieval in the future
"""
query_words = []
if self.tokenizer:
query_words = self.tokenizer.tokenize_mixed(query)
Expand Down
Loading