@@ -111,48 +111,6 @@ def _cheap_close(t: str) -> str:
111111DEFAULT_CHUNK_OVERLAP = int (os .getenv ("FILE_PARSER_CHUNK_OVERLAP" , "200" ))
112112
113113
114- def _simple_split_text (text : str , chunk_size : int , chunk_overlap : int ) -> list [str ]:
115- """
116- Simple text splitter as fallback when langchain is not available.
117-
118- Args:
119- text: Text to split
120- chunk_size: Maximum size of chunks
121- chunk_overlap: Overlap between chunks
122-
123- Returns:
124- List of text chunks
125- """
126- if not text or len (text ) <= chunk_size :
127- return [text ] if text .strip () else []
128-
129- chunks = []
130- start = 0
131- text_len = len (text )
132-
133- while start < text_len :
134- # Calculate end position
135- end = min (start + chunk_size , text_len )
136-
137- # If not the last chunk, try to break at a good position
138- if end < text_len :
139- # Try to break at newline, sentence end, or space
140- for separator in ["\n \n " , "\n " , "。" , "!" , "?" , ". " , "! " , "? " , " " ]:
141- last_sep = text .rfind (separator , start , end )
142- if last_sep != - 1 :
143- end = last_sep + len (separator )
144- break
145-
146- chunk = text [start :end ].strip ()
147- if chunk :
148- chunks .append (chunk )
149-
150- # Move start position with overlap
151- start = max (start + 1 , end - chunk_overlap )
152-
153- return chunks
154-
155-
156114# Initialize parser instance
157115file_parser = None
158116try :
@@ -163,51 +121,27 @@ def _simple_split_text(text: str, chunk_size: int, chunk_overlap: int) -> list[s
163121 logger .error (f"[FileContentParser] Failed to create parser: { e } " )
164122 file_parser = None
165123
# Module-level splitter singletons, built once at import time from the project
# chunkers. If anything in the setup fails, both are left as None and the
# SimpleTextSplitter class is imported so a fallback splitter can be built on demand.
markdown_text_splitter = None

try:
    from memos.chunkers.charactertext_chunker import CharacterTextChunker
    from memos.chunkers.markdown_chunker import MarkdownChunker

    markdown_text_splitter = MarkdownChunker(
        chunk_size=DEFAULT_CHUNK_SIZE,
        chunk_overlap=DEFAULT_CHUNK_OVERLAP,
        recursive=True,
    )
    text_splitter = CharacterTextChunker(
        chunk_size=DEFAULT_CHUNK_SIZE,
        chunk_overlap=DEFAULT_CHUNK_OVERLAP,
    )
    logger.info("[FileContentParser] Initialized text splitter instances by lancga")
except Exception as e:
    logger.warning(
        f"[FileContentParser] Failed to create text splitter: {e} will use simple splitter fallback"
    )
    # Imported only on the failure path; the fallback branch needs this class.
    from memos.chunkers.simple_chunker import SimpleTextSplitter

    # Reset both names: the markdown splitter may already have been assigned
    # before the failure, and a half-initialised pair must not be exposed.
    markdown_text_splitter = None
    text_splitter = None
211145
212146
213147def get_parser () -> Any :
@@ -220,7 +154,9 @@ def get_parser() -> Any:
220154 return file_parser
221155
222156
def get_text_splitter(
    chunk_size: int | None = None, chunk_overlap: int | None = None, is_markdown: bool = False
) -> Any:
    """
    Return the splitter to use for chunking text.

    The module-level splitters created at import time are preferred. A simple
    fallback splitter is built only when no module-level splitter is available.

    Args:
        chunk_size: Chunk size for the fallback splitter. ``None`` (or 0) selects
            ``DEFAULT_CHUNK_SIZE``. Ignored when a module-level splitter is
            returned, since those are already configured.
        chunk_overlap: Chunk overlap for the fallback splitter. ``None`` selects
            ``DEFAULT_CHUNK_OVERLAP``; an explicit 0 is honoured. Ignored when a
            module-level splitter is returned.
        is_markdown: When True, prefer the markdown splitter if it is available.

    Returns:
        ``markdown_text_splitter`` when ``is_markdown`` is set and it exists,
        otherwise ``text_splitter`` when it exists, otherwise a new
        ``SimpleTextSplitter`` built from the resolved size and overlap.
    """
    if is_markdown and markdown_text_splitter is not None:
        return markdown_text_splitter
    if text_splitter is not None:
        return text_splitter

    # Imported here so the fallback does not depend on the module-level import
    # that only runs when splitter initialisation failed.
    from memos.chunkers.simple_chunker import SimpleTextSplitter

    # A chunk size of 0 is not usable, so falsy values fall back to the default.
    actual_chunk_size = chunk_size or DEFAULT_CHUNK_SIZE
    # 0 is a legitimate overlap; only a missing value selects the default.
    actual_chunk_overlap = DEFAULT_CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
    return SimpleTextSplitter(actual_chunk_size, actual_chunk_overlap)
255-
256179
257180def extract_role (message : dict [str , Any ]) -> str :
258181 """Extract role from message."""
0 commit comments