diff --git a/examples/mem_chunk/markdown_chunk.py b/examples/mem_chunk/markdown_chunk.py new file mode 100644 index 000000000..ce7d2b9ae --- /dev/null +++ b/examples/mem_chunk/markdown_chunk.py @@ -0,0 +1,33 @@ +from memos.chunkers import ChunkerFactory +from memos.configs.chunker import ChunkerConfigFactory + + +config = ChunkerConfigFactory.model_validate( + { + "backend": "markdown", + "config": { + "chunk_size": 1000, + "chunk_overlap": 100, + "recursive": True, + }, + } +) + +chunker = ChunkerFactory.from_config(config) + +text = """ +# Header 1 +This is the first sentence. This is the second sentence. +And here's a third one with some additional context. + +# Header 2 +This is the fourth sentence. This is the fifth sentence. +And here's a sixth one with some additional context. + +# Header 3 +This is the seventh sentence. This is the eighth sentence. +And here's a ninth one with some additional context. +""" +chunks = chunker.chunk(text) +for chunk in chunks: + print("doc:", chunk) diff --git a/poetry.lock b/poetry.lock index 40d0f6210..940697b1c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. [[package]] name = "absl-py" @@ -24,32 +24,6 @@ files = [ {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, ] -[[package]] -name = "anthropic" -version = "0.57.1" -description = "The official Python library for the anthropic API" -optional = false -python-versions = ">=3.8" -groups = ["eval"] -files = [ - {file = "anthropic-0.57.1-py3-none-any.whl", hash = "sha256:33afc1f395af207d07ff1bffc0a3d1caac53c371793792569c5d2f09283ea306"}, - {file = "anthropic-0.57.1.tar.gz", hash = "sha256:7815dd92245a70d21f65f356f33fc80c5072eada87fb49437767ea2918b2c4b0"}, -] - -[package.dependencies] -anyio = ">=3.5.0,<5" -distro = ">=1.7.0,<2" -httpx = ">=0.25.0,<1" -jiter = ">=0.4.0,<1" -pydantic = ">=1.9.0,<3" -sniffio = "*" -typing-extensions = ">=4.10,<5" - -[package.extras] -aiohttp = ["aiohttp", "httpx-aiohttp (>=0.1.6)"] -bedrock = ["boto3 (>=1.28.57)", "botocore (>=1.31.57)"] -vertex = ["google-auth[requests] (>=2,<3)"] - [[package]] name = "anyio" version = "4.9.0" @@ -73,19 +47,6 @@ doc = ["Sphinx (>=8.2,<9.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", test = ["anyio[trio]", "blockbuster (>=1.5.23)", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "trustme", "truststore (>=0.9.1) ; python_version >= \"3.10\"", "uvloop (>=0.21) ; platform_python_implementation == \"CPython\" and platform_system != \"Windows\" and python_version < \"3.14\""] trio = ["trio (>=0.26.1)"] -[[package]] -name = "async-timeout" -version = "4.0.3" -description = "Timeout context manager for asyncio programs" -optional = false -python-versions = ">=3.7" -groups = ["main", "eval"] -files = [ - {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, - {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, -] -markers = {main = "(extra == \"mem-scheduler\" or extra == \"all\") and python_version == \"3.10\"", eval = "python_version == \"3.10\""} - [[package]] name = "async-timeout" version = "5.0.1" @@ -93,7 +54,7 @@ description = "Timeout context manager for asyncio programs" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "(extra == \"mem-scheduler\" or extra == \"all\") and python_full_version < \"3.11.3\" and python_version == \"3.11\"" +markers = "(python_version == \"3.10\" or python_version == \"3.11\") and python_full_version < \"3.11.3\" and (extra == \"mem-scheduler\" or extra == \"all\")" files = [ {file = "async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c"}, {file = "async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3"}, @@ -293,7 +254,7 @@ files = [ {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, ] -markers = {main = "platform_python_implementation != \"PyPy\"", eval = "platform_python_implementation == \"PyPy\""} +markers = {main = "extra == \"mem-reader\" or extra == \"all\" or platform_python_implementation != \"PyPy\"", eval = "platform_python_implementation == \"PyPy\""} [package.dependencies] pycparser = "*" @@ -823,24 +784,6 @@ files = [ [package.dependencies] python-dotenv = "*" -[[package]] -name = "dydantic" -version = "0.0.8" -description = "Dynamically generate pydantic models from JSON schema." -optional = false -python-versions = "<4.0,>=3.9" -groups = ["eval"] -files = [ - {file = "dydantic-0.0.8-py3-none-any.whl", hash = "sha256:cd0a991f523bd8632699872f1c0c4278415dd04783e36adec5428defa0afb721"}, - {file = "dydantic-0.0.8.tar.gz", hash = "sha256:14a31d4cdfce314ce3e69e8f8c7c46cbc26ce3ce4485de0832260386c612942f"}, -] - -[package.dependencies] -pydantic = ">=2,<3" - -[package.extras] -email = ["email-validator (>=2.1,<3.0)"] - [[package]] name = "email-validator" version = "2.2.0" @@ -1137,7 +1080,7 @@ description = "Lightweight in-process concurrent programming" optional = false python-versions = ">=3.9" groups = ["main", "eval"] -markers = "(platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\") and python_version < \"3.14\"" +markers = "(python_version == \"3.10\" or python_version == \"3.11\" or python_version == \"3.12\" or python_version == \"3.13\") and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")" files = [ {file = "greenlet-3.2.3-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:1afd685acd5597349ee6d7a88a8bec83ce13c106ac78c196ee9dde7c04fe87be"}, {file = "greenlet-3.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:761917cac215c61e9dc7324b2606107b3b292a8349bdebb31503ab4de3f559ac"}, @@ -1701,11 +1644,12 @@ version = "1.33" description = "Apply JSON-Patches (RFC 6902)" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" -groups = ["eval"] +groups = ["main", "eval"] files = [ {file = "jsonpatch-1.33-py2.py3-none-any.whl", hash = "sha256:0ae28c0cd062bbd8b8ecc26d7d164fbbea9652a1a3693f3b956c1eae5145dade"}, {file = "jsonpatch-1.33.tar.gz", hash = "sha256:9fcd4009c41e6d12348b4a0ff2563ba56a2923a7dfee731d004e212e1ee5030c"}, ] +markers = {main = "extra == \"mem-reader\" or extra == \"all\""} [package.dependencies] jsonpointer = ">=1.9" @@ -1716,11 +1660,12 @@ version = "3.0.0" description = "Identify specific nodes in a JSON document (RFC 6901)" optional = false python-versions = ">=3.7" -groups = ["eval"] +groups = ["main", "eval"] files = [ {file = "jsonpointer-3.0.0-py2.py3-none-any.whl", hash = "sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942"}, {file = "jsonpointer-3.0.0.tar.gz", hash = "sha256:2b2d729f2091522d61c3b31f82e11870f60b68f43fbc705cb76bf4b832af59ef"}, ] +markers = {main = "extra == \"mem-reader\" or extra == \"all\""} [[package]] name = "jsonschema" @@ -1849,116 +1794,43 @@ files = [ {file = "kiwisolver-1.4.8.tar.gz", hash = "sha256:23d5f023bdc8c7e54eb65f03ca5d5bb25b601eac4d7f1a042888a1f45237987e"}, ] -[[package]] -name = "langchain" -version = "0.3.26" -description = "Building applications with LLMs through composability" -optional = false -python-versions = ">=3.9" -groups = ["eval"] -files = [ - {file = "langchain-0.3.26-py3-none-any.whl", hash = "sha256:361bb2e61371024a8c473da9f9c55f4ee50f269c5ab43afdb2b1309cb7ac36cf"}, - {file = "langchain-0.3.26.tar.gz", hash = "sha256:8ff034ee0556d3e45eff1f1e96d0d745ced57858414dba7171c8ebdbeb5580c9"}, -] - -[package.dependencies] -async-timeout = {version = ">=4.0.0,<5.0.0", markers = "python_version < \"3.11\""} -langchain-core = ">=0.3.66,<1.0.0" -langchain-text-splitters = ">=0.3.8,<1.0.0" -langsmith = ">=0.1.17" -pydantic = ">=2.7.4,<3.0.0" -PyYAML = ">=5.3" -requests = ">=2,<3" -SQLAlchemy = ">=1.4,<3" - -[package.extras] -anthropic = ["langchain-anthropic"] -aws = ["langchain-aws"] -azure-ai = ["langchain-azure-ai"] -cohere = ["langchain-cohere"] -community = ["langchain-community"] -deepseek = ["langchain-deepseek"] -fireworks = ["langchain-fireworks"] -google-genai = ["langchain-google-genai"] -google-vertexai = ["langchain-google-vertexai"] -groq = ["langchain-groq"] -huggingface = ["langchain-huggingface"] -mistralai = ["langchain-mistralai"] -ollama = ["langchain-ollama"] -openai = ["langchain-openai"] -perplexity = ["langchain-perplexity"] -together = ["langchain-together"] -xai = ["langchain-xai"] - -[[package]] -name = "langchain-anthropic" -version = "0.3.17" -description = "An integration package connecting AnthropicMessages and LangChain" -optional = false -python-versions = ">=3.9" -groups = ["eval"] -files = [ - {file = "langchain_anthropic-0.3.17-py3-none-any.whl", hash = "sha256:6df784615b93aab0336fbd6a50ca2bd16a704ef01c9488c36a4fa7aad2faf2d6"}, - {file = "langchain_anthropic-0.3.17.tar.gz", hash = "sha256:f2c2a0382ed7992204d790ff8538448f5243f4dbb1e798256ef790c9a69033e4"}, -] - -[package.dependencies] -anthropic = ">=0.57.0,<1" -langchain-core = ">=0.3.68,<1.0.0" -pydantic = ">=2.7.4,<3.0.0" - [[package]] name = "langchain-core" -version = "0.3.69" +version = "1.1.0" description = "Building applications with LLMs through composability" optional = false -python-versions = ">=3.9" -groups = ["eval"] +python-versions = "<4.0.0,>=3.10.0" +groups = ["main", "eval"] files = [ - {file = "langchain_core-0.3.69-py3-none-any.whl", hash = "sha256:383e9cb4919f7ef4b24bf8552ef42e4323c064924fea88b28dd5d7ddb740d3b8"}, - {file = "langchain_core-0.3.69.tar.gz", hash = "sha256:c132961117cc7f0227a4c58dd3e209674a6dd5b7e74abc61a0df93b0d736e283"}, + {file = "langchain_core-1.1.0-py3-none-any.whl", hash = "sha256:2c9f27dadc6d21ed4aa46506a37a56e6a7e2d2f9141922dc5c251ba921822ee6"}, + {file = "langchain_core-1.1.0.tar.gz", hash = "sha256:2b76a82d427922c8bc51c08404af4fc2a29e9f161dfe2297cb05091e810201e7"}, ] +markers = {main = "extra == \"mem-reader\" or extra == \"all\""} [package.dependencies] -jsonpatch = ">=1.33,<2.0" -langsmith = ">=0.3.45" -packaging = ">=23.2" -pydantic = ">=2.7.4" -PyYAML = ">=5.3" +jsonpatch = ">=1.33.0,<2.0.0" +langsmith = ">=0.3.45,<1.0.0" +packaging = ">=23.2.0,<26.0.0" +pydantic = ">=2.7.4,<3.0.0" +pyyaml = ">=5.3.0,<7.0.0" tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<10.0.0" -typing-extensions = ">=4.7" - -[[package]] -name = "langchain-openai" -version = "0.3.28" -description = "An integration package connecting OpenAI and LangChain" -optional = false -python-versions = ">=3.9" -groups = ["eval"] -files = [ - {file = "langchain_openai-0.3.28-py3-none-any.whl", hash = "sha256:4cd6d80a5b2ae471a168017bc01b2e0f01548328d83532400a001623624ede67"}, - {file = "langchain_openai-0.3.28.tar.gz", hash = "sha256:6c669548dbdea325c034ae5ef699710e2abd054c7354fdb3ef7bf909dc739d9e"}, -] - -[package.dependencies] -langchain-core = ">=0.3.68,<1.0.0" -openai = ">=1.86.0,<2.0.0" -tiktoken = ">=0.7,<1" +typing-extensions = ">=4.7.0,<5.0.0" [[package]] name = "langchain-text-splitters" -version = "0.3.8" +version = "1.0.0" description = "LangChain text splitting utilities" -optional = false -python-versions = "<4.0,>=3.9" -groups = ["eval"] +optional = true +python-versions = "<4.0.0,>=3.10.0" +groups = ["main"] +markers = "extra == \"mem-reader\" or extra == \"all\"" files = [ - {file = "langchain_text_splitters-0.3.8-py3-none-any.whl", hash = "sha256:e75cc0f4ae58dcf07d9f18776400cf8ade27fadd4ff6d264df6278bb302f6f02"}, - {file = "langchain_text_splitters-0.3.8.tar.gz", hash = "sha256:116d4b9f2a22dda357d0b79e30acf005c5518177971c66a9f1ab0edfdb0f912e"}, + {file = "langchain_text_splitters-1.0.0-py3-none-any.whl", hash = "sha256:f00c8219d3468f2c5bd951b708b6a7dd9bc3c62d0cfb83124c377f7170f33b2e"}, + {file = "langchain_text_splitters-1.0.0.tar.gz", hash = "sha256:d8580a20ad7ed10b432feb273e5758b2cc0902d094919629cec0e1ad691a6744"}, ] [package.dependencies] -langchain-core = ">=0.3.51,<1.0.0" +langchain-core = ">=1.0.0,<2.0.0" [[package]] name = "langgraph" @@ -2028,39 +1900,18 @@ files = [ httpx = ">=0.25.2" orjson = ">=3.10.1" -[[package]] -name = "langmem" -version = "0.0.27" -description = "Prebuilt utilities for memory management and retrieval." -optional = false -python-versions = ">=3.10" -groups = ["eval"] -files = [ - {file = "langmem-0.0.27-py3-none-any.whl", hash = "sha256:25e9f06ad7c420442cf4b62caff6f805b124dfb2e2cc9cacc464d7a455fbafda"}, - {file = "langmem-0.0.27.tar.gz", hash = "sha256:729c1eb77c4cd8d9f2285f908a68a1e622ef01f074eeeb8cbbc7343f296efc53"}, -] - -[package.dependencies] -langchain = ">=0.3.15" -langchain-anthropic = ">=0.3.3" -langchain-core = ">=0.3.46" -langchain-openai = ">=0.3.1" -langgraph = ">=0.3.23" -langgraph-checkpoint = ">=2.0.12" -langsmith = ">=0.3.8" -trustcall = ">=0.0.39" - [[package]] name = "langsmith" version = "0.4.7" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." optional = false python-versions = ">=3.9" -groups = ["eval"] +groups = ["main", "eval"] files = [ {file = "langsmith-0.4.7-py3-none-any.whl", hash = "sha256:de91f1abdd65da369996f8eedb5201f442110c9c3bde5babc6f5300f07da65df"}, {file = "langsmith-0.4.7.tar.gz", hash = "sha256:3864cf29295c2565c578e93d1533f5b39e2b4af616545ace30f069635a319890"}, ] +markers = {main = "extra == \"mem-reader\" or extra == \"all\""} [package.dependencies] httpx = ">=0.23.0,<1" @@ -2772,7 +2623,7 @@ files = [ {file = "nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:235f728d6e2a409eddf1df58d5b0921cf80cfa9e72b9f2775ccb7b4a87984668"}, {file = "nvidia_cublas_cu12-12.6.4.1-py3-none-win_amd64.whl", hash = "sha256:9e4fa264f4d8a4eb0cdbd34beadc029f453b3bafae02401e999cf3d5a5af75f8"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [[package]] name = "nvidia-cuda-cupti-cu12" @@ -2788,7 +2639,7 @@ files = [ {file = "nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a3eff6cdfcc6a4c35db968a06fcadb061cbc7d6dde548609a941ff8701b98b73"}, {file = "nvidia_cuda_cupti_cu12-12.6.80-py3-none-win_amd64.whl", hash = "sha256:bbe6ae76e83ce5251b56e8c8e61a964f757175682bbad058b170b136266ab00a"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [[package]] name = "nvidia-cuda-nvrtc-cu12" @@ -2802,7 +2653,7 @@ files = [ {file = "nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:35b0cc6ee3a9636d5409133e79273ce1f3fd087abb0532d2d2e8fff1fe9efc53"}, {file = "nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:f7007dbd914c56bd80ea31bc43e8e149da38f68158f423ba845fc3292684e45a"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [[package]] name = "nvidia-cuda-runtime-cu12" @@ -2818,7 +2669,7 @@ files = [ {file = "nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a84d15d5e1da416dd4774cb42edf5e954a3e60cc945698dc1d5be02321c44dc8"}, {file = "nvidia_cuda_runtime_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:86c58044c824bf3c173c49a2dbc7a6c8b53cb4e4dca50068be0bf64e9dab3f7f"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [[package]] name = "nvidia-cudnn-cu12" @@ -2832,7 +2683,7 @@ files = [ {file = "nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:30ac3869f6db17d170e0e556dd6cc5eee02647abc31ca856634d5a40f82c15b2"}, {file = "nvidia_cudnn_cu12-9.5.1.17-py3-none-win_amd64.whl", hash = "sha256:d7af0f8a4f3b4b9dbb3122f2ef553b45694ed9c384d5a75bab197b8eefb79ab8"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [package.dependencies] nvidia-cublas-cu12 = "*" @@ -2851,7 +2702,7 @@ files = [ {file = "nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.whl", hash = "sha256:768160ac89f6f7b459bee747e8d175dbf53619cfe74b2a5636264163138013ca"}, {file = "nvidia_cufft_cu12-11.3.0.4-py3-none-win_amd64.whl", hash = "sha256:6048ebddfb90d09d2707efb1fd78d4e3a77cb3ae4dc60e19aab6be0ece2ae464"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [package.dependencies] nvidia-nvjitlink-cu12 = "*" @@ -2867,7 +2718,7 @@ files = [ {file = "nvidia_cufile_cu12-1.11.1.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc23469d1c7e52ce6c1d55253273d32c565dd22068647f3aa59b3c6b005bf159"}, {file = "nvidia_cufile_cu12-1.11.1.6-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:8f57a0051dcf2543f6dc2b98a98cb2719c37d3cee1baba8965d57f3bbc90d4db"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [[package]] name = "nvidia-curand-cu12" @@ -2883,7 +2734,7 @@ files = [ {file = "nvidia_curand_cu12-10.3.7.77-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:7b2ed8e95595c3591d984ea3603dd66fe6ce6812b886d59049988a712ed06b6e"}, {file = "nvidia_curand_cu12-10.3.7.77-py3-none-win_amd64.whl", hash = "sha256:6d6d935ffba0f3d439b7cd968192ff068fafd9018dbf1b85b37261b13cfc9905"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [[package]] name = "nvidia-cusolver-cu12" @@ -2899,7 +2750,7 @@ files = [ {file = "nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dbbe4fc38ec1289c7e5230e16248365e375c3673c9c8bac5796e2e20db07f56e"}, {file = "nvidia_cusolver_cu12-11.7.1.2-py3-none-win_amd64.whl", hash = "sha256:6813f9d8073f555444a8705f3ab0296d3e1cb37a16d694c5fc8b862a0d8706d7"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [package.dependencies] nvidia-cublas-cu12 = "*" @@ -2920,7 +2771,7 @@ files = [ {file = "nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:23749a6571191a215cb74d1cdbff4a86e7b19f1200c071b3fcf844a5bea23a2f"}, {file = "nvidia_cusparse_cu12-12.5.4.2-py3-none-win_amd64.whl", hash = "sha256:4acb8c08855a26d737398cba8fb6f8f5045d93f82612b4cfd84645a2332ccf20"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [package.dependencies] nvidia-nvjitlink-cu12 = "*" @@ -2937,7 +2788,7 @@ files = [ {file = "nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e5c8a26c36445dd2e6812f1177978a24e2d37cacce7e090f297a688d1ec44f46"}, {file = "nvidia_cusparselt_cu12-0.6.3-py3-none-win_amd64.whl", hash = "sha256:3b325bcbd9b754ba43df5a311488fca11a6b5dc3d11df4d190c000cf1a0765c7"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [[package]] name = "nvidia-nccl-cu12" @@ -2950,7 +2801,7 @@ files = [ {file = "nvidia_nccl_cu12-2.26.2-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5c196e95e832ad30fbbb50381eb3cbd1fadd5675e587a548563993609af19522"}, {file = "nvidia_nccl_cu12-2.26.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:694cf3879a206553cc9d7dbda76b13efaf610fdb70a50cba303de1b0d1530ac6"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [[package]] name = "nvidia-nvjitlink-cu12" @@ -2964,7 +2815,7 @@ files = [ {file = "nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cf4eaa7d4b6b543ffd69d6abfb11efdeb2db48270d94dfd3a452c24150829e41"}, {file = "nvidia_nvjitlink_cu12-12.6.85-py3-none-win_amd64.whl", hash = "sha256:e61120e52ed675747825cdd16febc6a0730537451d867ee58bee3853b1b13d1c"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [[package]] name = "nvidia-nvtx-cu12" @@ -2980,7 +2831,7 @@ files = [ {file = "nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:6574241a3ec5fdc9334353ab8c479fe75841dbe8f4532a8fc97ce63503330ba1"}, {file = "nvidia_nvtx_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:2fb11a4af04a5e6c84073e6404d26588a34afd35379f0855a99797897efa75c0"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [[package]] name = "ollama" @@ -3637,7 +3488,7 @@ files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] -markers = {main = "platform_python_implementation != \"PyPy\"", eval = "platform_python_implementation == \"PyPy\""} +markers = {main = "extra == \"mem-reader\" or extra == \"all\" or platform_python_implementation != \"PyPy\"", eval = "platform_python_implementation == \"PyPy\""} [[package]] name = "pydantic" @@ -4068,7 +3919,7 @@ files = [ {file = "pywin32-311-cp39-cp39-win_amd64.whl", hash = "sha256:e0c4cfb0621281fe40387df582097fd796e80430597cb9944f0ae70447bacd91"}, {file = "pywin32-311-cp39-cp39-win_arm64.whl", hash = "sha256:62ea666235135fee79bb154e695f3ff67370afefd71bd7fea7512fc70ef31e3d"}, ] -markers = {main = "platform_system == \"Windows\" and extra == \"all\" or sys_platform == \"win32\"", eval = "platform_system == \"Windows\""} +markers = {main = "extra == \"all\" and platform_system == \"Windows\" or sys_platform == \"win32\"", eval = "platform_system == \"Windows\""} [[package]] name = "pyyaml" @@ -4352,11 +4203,12 @@ version = "1.0.0" description = "A utility belt for advanced users of python-requests" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -groups = ["eval"] +groups = ["main", "eval"] files = [ {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, ] +markers = {main = "extra == \"mem-reader\" or extra == \"all\""} [package.dependencies] requests = ">=2.0.1,<3.0.0" @@ -5065,7 +4917,7 @@ files = [ {file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"}, {file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"}, ] -markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (extra == \"all\" or extra == \"pref-mem\") or extra == \"pref-mem\" or extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\" or python_version >= \"3.12\""} +markers = {main = "extra == \"all\" or extra == \"pref-mem\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\" or python_version >= \"3.12\""} [package.extras] check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""] @@ -5307,54 +5159,6 @@ files = [ {file = "threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e"}, ] -[[package]] -name = "tiktoken" -version = "0.9.0" -description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" -optional = false -python-versions = ">=3.9" -groups = ["eval"] -files = [ - {file = "tiktoken-0.9.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:586c16358138b96ea804c034b8acf3f5d3f0258bd2bc3b0227af4af5d622e382"}, - {file = "tiktoken-0.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d9c59ccc528c6c5dd51820b3474402f69d9a9e1d656226848ad68a8d5b2e5108"}, - {file = "tiktoken-0.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0968d5beeafbca2a72c595e8385a1a1f8af58feaebb02b227229b69ca5357fd"}, - {file = "tiktoken-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:92a5fb085a6a3b7350b8fc838baf493317ca0e17bd95e8642f95fc69ecfed1de"}, - {file = "tiktoken-0.9.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:15a2752dea63d93b0332fb0ddb05dd909371ededa145fe6a3242f46724fa7990"}, - {file = "tiktoken-0.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:26113fec3bd7a352e4b33dbaf1bd8948de2507e30bd95a44e2b1156647bc01b4"}, - {file = "tiktoken-0.9.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:f32cc56168eac4851109e9b5d327637f15fd662aa30dd79f964b7c39fbadd26e"}, - {file = "tiktoken-0.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:45556bc41241e5294063508caf901bf92ba52d8ef9222023f83d2483a3055348"}, - {file = "tiktoken-0.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03935988a91d6d3216e2ec7c645afbb3d870b37bcb67ada1943ec48678e7ee33"}, - {file = "tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b3d80aad8d2c6b9238fc1a5524542087c52b860b10cbf952429ffb714bc1136"}, - {file = "tiktoken-0.9.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b2a21133be05dc116b1d0372af051cd2c6aa1d2188250c9b553f9fa49301b336"}, - {file = "tiktoken-0.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:11a20e67fdf58b0e2dea7b8654a288e481bb4fc0289d3ad21291f8d0849915fb"}, - {file = "tiktoken-0.9.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e88f121c1c22b726649ce67c089b90ddda8b9662545a8aeb03cfef15967ddd03"}, - {file = "tiktoken-0.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a6600660f2f72369acb13a57fb3e212434ed38b045fd8cc6cdd74947b4b5d210"}, - {file = "tiktoken-0.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:95e811743b5dfa74f4b227927ed86cbc57cad4df859cb3b643be797914e41794"}, - {file = "tiktoken-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99376e1370d59bcf6935c933cb9ba64adc29033b7e73f5f7569f3aad86552b22"}, - {file = "tiktoken-0.9.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:badb947c32739fb6ddde173e14885fb3de4d32ab9d8c591cbd013c22b4c31dd2"}, - {file = "tiktoken-0.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:5a62d7a25225bafed786a524c1b9f0910a1128f4232615bf3f8257a73aaa3b16"}, - {file = "tiktoken-0.9.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2b0e8e05a26eda1249e824156d537015480af7ae222ccb798e5234ae0285dbdb"}, - {file = "tiktoken-0.9.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:27d457f096f87685195eea0165a1807fae87b97b2161fe8c9b1df5bd74ca6f63"}, - {file = "tiktoken-0.9.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cf8ded49cddf825390e36dd1ad35cd49589e8161fdcb52aa25f0583e90a3e01"}, - {file = "tiktoken-0.9.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc156cb314119a8bb9748257a2eaebd5cc0753b6cb491d26694ed42fc7cb3139"}, - {file = "tiktoken-0.9.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:cd69372e8c9dd761f0ab873112aba55a0e3e506332dd9f7522ca466e817b1b7a"}, - {file = "tiktoken-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:5ea0edb6f83dc56d794723286215918c1cde03712cbbafa0348b33448faf5b95"}, - {file = "tiktoken-0.9.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:c6386ca815e7d96ef5b4ac61e0048cd32ca5a92d5781255e13b31381d28667dc"}, - {file = "tiktoken-0.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:75f6d5db5bc2c6274b674ceab1615c1778e6416b14705827d19b40e6355f03e0"}, - {file = "tiktoken-0.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e15b16f61e6f4625a57a36496d28dd182a8a60ec20a534c5343ba3cafa156ac7"}, - {file = "tiktoken-0.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ebcec91babf21297022882344c3f7d9eed855931466c3311b1ad6b64befb3df"}, - {file = "tiktoken-0.9.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e5fd49e7799579240f03913447c0cdfa1129625ebd5ac440787afc4345990427"}, - {file = "tiktoken-0.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:26242ca9dc8b58e875ff4ca078b9a94d2f0813e6a535dcd2205df5d49d927cc7"}, - {file = "tiktoken-0.9.0.tar.gz", hash = "sha256:d02a5ca6a938e0490e1ff957bc48c8b078c88cb83977be1625b1fd8aac792c5d"}, -] - -[package.dependencies] -regex = ">=2022.1.18" -requests = ">=2.26.0" - -[package.extras] -blobfile = ["blobfile (>=2)"] - [[package]] name = "tokenizers" version = "0.21.2" @@ -5604,7 +5408,7 @@ files = [ {file = "triton-3.3.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3198adb9d78b77818a5388bff89fa72ff36f9da0bc689db2f0a651a67ce6a42"}, {file = "triton-3.3.1-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f6139aeb04a146b0b8e0fbbd89ad1e65861c57cfed881f21d62d3cb94a36bab7"}, ] -markers = {main = "platform_machine == \"x86_64\" and extra == \"all\" and platform_system == \"Linux\"", eval = "platform_machine == \"x86_64\" and platform_system == \"Linux\""} +markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"all\"", eval = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} [package.dependencies] setuptools = ">=40.8.0" @@ -5614,23 +5418,6 @@ build = ["cmake (>=3.20)", "lit"] tests = ["autopep8", "isort", "llnl-hatchet", "numpy", "pytest", "pytest-forked", "pytest-xdist", "scipy (>=1.7.1)"] tutorials = ["matplotlib", "pandas", "tabulate"] -[[package]] -name = "trustcall" -version = "0.0.39" -description = "Tenacious & trustworthy tool calling built on LangGraph." -optional = false -python-versions = "<4.0,>=3.10" -groups = ["eval"] -files = [ - {file = "trustcall-0.0.39-py3-none-any.whl", hash = "sha256:d7da42e0bba816c0539b2936dfed90ffb3ea8d789e548e73865d416f8ac4ee64"}, - {file = "trustcall-0.0.39.tar.gz", hash = "sha256:ec315818224501b9537ce6b7618dbc21be41210c6e8f2e239169a5a00912cd6e"}, -] - -[package.dependencies] -dydantic = ">=0.0.8,<1.0.0" -jsonpatch = ">=1.33,<2.0" -langgraph = ">=0.2.25" - [[package]] name = "typer" version = "0.16.0" @@ -5830,7 +5617,7 @@ description = "Fast implementation of asyncio event loop on top of libuv" optional = false python-versions = ">=3.8.0" groups = ["main"] -markers = "platform_python_implementation != \"PyPy\" and sys_platform != \"win32\" and sys_platform != \"cygwin\"" +markers = "sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\"" files = [ {file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f"}, {file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d"}, @@ -6303,7 +6090,7 @@ version = "0.23.0" description = "Zstandard bindings for Python" optional = false python-versions = ">=3.8" -groups = ["eval"] +groups = ["main", "eval"] files = [ {file = "zstandard-0.23.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bf0a05b6059c0528477fba9054d09179beb63744355cab9f38059548fedd46a9"}, {file = "zstandard-0.23.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fc9ca1c9718cb3b06634c7c8dec57d24e9438b2aa9a0f02b8bb36bf478538880"}, @@ -6403,6 +6190,7 @@ files = [ {file = "zstandard-0.23.0-cp39-cp39-win_amd64.whl", hash = "sha256:f8346bfa098532bc1fb6c7ef06783e969d87a99dd1d2a5a18a892c1d7a643c58"}, {file = "zstandard-0.23.0.tar.gz", hash = "sha256:b2d8c62d08e7255f68f7a740bae85b3c9b8e5466baa9cbf7f57f1cde0ac6bc09"}, ] +markers = {main = "extra == \"mem-reader\" or extra == \"all\""} [package.dependencies] cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\""} @@ -6411,8 +6199,8 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["cachetools", "chonkie", "datasketch", "jieba", "markitdown", "neo4j", "pika", "pymilvus", "pymysql", "qdrant-client", "rank-bm25", "redis", "schedule", "sentence-transformers", "torch", "volcengine-python-sdk"] -mem-reader = ["chonkie", "markitdown"] +all = ["cachetools", "chonkie", "datasketch", "jieba", "langchain-text-splitters", "markitdown", "neo4j", "pika", "pymilvus", "pymysql", "qdrant-client", "rank-bm25", "redis", "schedule", "sentence-transformers", "torch", "volcengine-python-sdk"] +mem-reader = ["chonkie", "langchain-text-splitters", "markitdown"] mem-scheduler = ["pika", "redis"] mem-user = ["pymysql"] pref-mem = ["datasketch", "pymilvus"] @@ -6421,4 +6209,4 @@ tree-mem = ["neo4j", "schedule"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4.0" -content-hash = "95e737a53fed62215bcb523c162e19ed67ffc745e27fa081bc3da5e356eba086" +content-hash = "1eae4dc9df321c2e5157497c7ce6fb2b1248cb1d4cf7d57e3d38710be977e07b" diff --git a/pyproject.toml b/pyproject.toml index 9a8db2694..265a5ae5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,6 +87,7 @@ mem-user = [ mem-reader = [ "chonkie (>=1.0.7,<2.0.0)", # Sentence chunking library "markitdown[docx,pdf,pptx,xls,xlsx] (>=0.1.1,<0.2.0)", # Markdown parser for various file formats + "langchain-text-splitters (>=1.0.0,<2.0.0)", # markdown chunk for langchain ] # PreferenceTextMemory @@ -105,6 +106,7 @@ all = [ "pika (>=1.3.2,<2.0.0)", "pymysql (>=1.1.0,<2.0.0)", "chonkie (>=1.0.7,<2.0.0)", + "langchain-text-splitters (>=1.0.0,<2.0.0)", "markitdown[docx,pdf,pptx,xls,xlsx] (>=0.1.1,<0.2.0)", "pymilvus (>=2.6.1,<3.0.0)", "datasketch (>=1.6.5,<2.0.0)", @@ -174,7 +176,6 @@ bert-score = "^0.3.13" scipy = "^1.10.1" python-dotenv = "^1.1.1" langgraph = "^0.5.1" -langmem = "^0.0.27" [tool.poetry.group.mem-user.dependencies] diff --git a/src/memos/chunkers/factory.py b/src/memos/chunkers/factory.py index 95b306aae..47c8fc71b 100644 --- a/src/memos/chunkers/factory.py +++ b/src/memos/chunkers/factory.py @@ -3,6 +3,7 @@ from memos.configs.chunker import ChunkerConfigFactory from .base import BaseChunker +from .markdown_chunker import MarkdownChunker from .sentence_chunker import SentenceChunker @@ -11,6 +12,7 @@ class ChunkerFactory: backend_to_class: ClassVar[dict[str, Any]] = { "sentence": SentenceChunker, + "markdown": MarkdownChunker, } @classmethod diff --git a/src/memos/chunkers/markdown_chunker.py b/src/memos/chunkers/markdown_chunker.py new file mode 100644 index 000000000..477e96b8d --- /dev/null +++ b/src/memos/chunkers/markdown_chunker.py @@ -0,0 +1,53 @@ +from memos.configs.chunker import MarkdownChunkerConfig +from memos.dependency import require_python_package +from memos.log import get_logger + +from .base import BaseChunker, Chunk + + +logger = get_logger(__name__) + + +class MarkdownChunker(BaseChunker): + """Markdown-based text chunker.""" + + @require_python_package( + import_name="langchain_text_splitters", + install_command="pip install langchain_text_splitters==1.0.0", + install_link="https://github.com/langchain-ai/langchain-text-splitters", + ) + def __init__(self, config: MarkdownChunkerConfig): + from langchain_text_splitters import ( + MarkdownHeaderTextSplitter, + RecursiveCharacterTextSplitter, + ) + + self.config = config + self.chunker = MarkdownHeaderTextSplitter( + headers_to_split_on=config.headers_to_split_on, + strip_headers=config.strip_headers, + ) + self.chunker_recursive = None + logger.info(f"Initialized MarkdownHeaderTextSplitter with config: {config}") + if config.recursive: + self.chunker_recursive = RecursiveCharacterTextSplitter( + chunk_size=config.chunk_size, + chunk_overlap=config.chunk_overlap, + ) + + def chunk(self, text: str) -> list[str] | list[Chunk]: + """Chunk the given text into smaller chunks based on sentences.""" + md_header_splits = self.chunker.split_text(text) + chunks = [] + if self.chunker_recursive: + md_header_splits = self.chunker_recursive.split_documents(md_header_splits) + for doc in md_header_splits: + try: + chunk = " ".join(list(doc.metadata.values())) + "\n" + doc.page_content + chunks.append(chunk) + except Exception as e: + logger.warning(f"warning chunking document: {e}") + chunks.append(doc.page_content) + + logger.debug(f"Generated {len(chunks)} chunks from input text") + return chunks diff --git a/src/memos/configs/chunker.py b/src/memos/configs/chunker.py index cb4f0e06d..c2af012f0 100644 --- a/src/memos/configs/chunker.py +++ b/src/memos/configs/chunker.py @@ -20,6 +20,19 @@ class SentenceChunkerConfig(BaseChunkerConfig): """Configuration for sentence-based text chunker.""" +class MarkdownChunkerConfig(BaseChunkerConfig): + """Configuration for markdown-based text chunker.""" + + headers_to_split_on: list[tuple[str, str]] = Field( + default=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")], + description="Headers to split on", + ) + strip_headers: bool = Field(default=True, description="Strip headers from the text") + recursive: bool = Field( + default=False, description="Whether to use recursive character text splitter" + ) + + class ChunkerConfigFactory(BaseConfig): """Factory class for creating chunker configurations.""" @@ -28,6 +41,7 @@ class ChunkerConfigFactory(BaseConfig): backend_to_class: ClassVar[dict[str, Any]] = { "sentence": SentenceChunkerConfig, + "markdown": MarkdownChunkerConfig, } @field_validator("backend") diff --git a/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py b/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py index f428bf5c0..830b915c1 100644 --- a/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py +++ b/src/memos/memories/textual/tree_text_memory/retrieve/searcher.py @@ -507,7 +507,10 @@ def _retrieve_simple( user_name: str | None = None, **kwargs, ): - """Retrieve from by keywords and embedding""" + """ + Retrieve from by keywords and embedding, this func is hotfix for sources=plugin mode + will merge with fulltext retrieval in the future + """ query_words = [] if self.tokenizer: query_words = self.tokenizer.tokenize_mixed(query)