1+ import fs from 'fs/promises'
2+ import path from 'path'
3+ import { createLogger } from '@/lib/logs/console-logger'
4+ import { TextChunker } from './chunker'
5+ import { generateEmbeddings } from '@/app/api/knowledge/utils'
6+ import type { DocChunk , DocsChunkerOptions , HeaderInfo } from './types'
7+
/**
 * Minimal shape of the YAML frontmatter parsed from an .mdx file.
 * Arbitrary extra keys are permitted via the index signature; only
 * `title` and `description` are consumed by DocsChunker.
 */
interface Frontmatter {
  /** Document title; used as fallback header text for chunks with no preceding heading. */
  title?: string
  /** Short document summary, copied into each chunk's metadata. */
  description?: string
  // parseFrontmatter stores every `key: value` pair it finds, not just the two above.
  [key: string]: any
}
13+
// Module-scoped logger; tags all output from this file with "DocsChunker".
const logger = createLogger('DocsChunker')
15+
16+ /**
17+ * Docs-specific chunker that processes .mdx files and tracks header context
18+ */
19+ export class DocsChunker {
20+ private readonly textChunker : TextChunker
21+ private readonly baseUrl : string
22+
23+ constructor ( options : DocsChunkerOptions = { } ) {
24+ // Use the existing TextChunker for chunking logic
25+ this . textChunker = new TextChunker ( {
26+ chunkSize : options . chunkSize ?? 1024 ,
27+ minChunkSize : options . minChunkSize ?? 100 ,
28+ overlap : options . overlap ?? 200 ,
29+ } )
30+ this . baseUrl = options . baseUrl ?? 'https://docs.simstudio.ai'
31+ }
32+
33+ /**
34+ * Process all .mdx files in the docs directory
35+ */
36+ async chunkAllDocs ( docsPath : string ) : Promise < DocChunk [ ] > {
37+ const allChunks : DocChunk [ ] = [ ]
38+
39+ try {
40+ const mdxFiles = await this . findMdxFiles ( docsPath )
41+ logger . info ( `Found ${ mdxFiles . length } .mdx files to process` )
42+
43+ for ( const filePath of mdxFiles ) {
44+ try {
45+ const chunks = await this . chunkMdxFile ( filePath , docsPath )
46+ allChunks . push ( ...chunks )
47+ logger . info ( `Processed ${ filePath } : ${ chunks . length } chunks` )
48+ } catch ( error ) {
49+ logger . error ( `Error processing ${ filePath } :` , error )
50+ }
51+ }
52+
53+ logger . info ( `Total chunks generated: ${ allChunks . length } ` )
54+ return allChunks
55+ } catch ( error ) {
56+ logger . error ( 'Error processing docs:' , error )
57+ throw error
58+ }
59+ }
60+
61+ /**
62+ * Process a single .mdx file
63+ */
64+ async chunkMdxFile ( filePath : string , basePath : string ) : Promise < DocChunk [ ] > {
65+ const content = await fs . readFile ( filePath , 'utf-8' )
66+ const relativePath = path . relative ( basePath , filePath )
67+
68+ // Parse frontmatter and content
69+ const { data : frontmatter , content : markdownContent } = this . parseFrontmatter ( content )
70+
71+ // Extract headers from the content
72+ const headers = this . extractHeaders ( markdownContent )
73+
74+ // Generate document URL
75+ const documentUrl = this . generateDocumentUrl ( relativePath )
76+
77+ // Split content into chunks
78+ const textChunks = await this . splitContent ( markdownContent )
79+
80+ // Generate embeddings for all chunks at once (batch processing)
81+ logger . info ( `Generating embeddings for ${ textChunks . length } chunks in ${ relativePath } ` )
82+ const embeddings = textChunks . length > 0 ? await generateEmbeddings ( textChunks ) : [ ]
83+ const embeddingModel = 'text-embedding-3-small'
84+
85+ // Convert to DocChunk objects with header context and embeddings
86+ const chunks : DocChunk [ ] = [ ]
87+ let currentPosition = 0
88+
89+ for ( let i = 0 ; i < textChunks . length ; i ++ ) {
90+ const chunkText = textChunks [ i ]
91+ const chunkStart = currentPosition
92+ const chunkEnd = currentPosition + chunkText . length
93+
94+ // Find the most relevant header for this chunk
95+ const relevantHeader = this . findRelevantHeader ( headers , chunkStart )
96+
97+ const chunk : DocChunk = {
98+ text : chunkText ,
99+ tokenCount : Math . ceil ( chunkText . length / 4 ) , // Simple token estimation
100+ sourceDocument : relativePath ,
101+ headerLink : relevantHeader ? `${ documentUrl } #${ relevantHeader . anchor } ` : documentUrl ,
102+ headerText : relevantHeader ?. text || frontmatter . title || 'Document Root' ,
103+ headerLevel : relevantHeader ?. level || 1 ,
104+ embedding : embeddings [ i ] || [ ] ,
105+ embeddingModel,
106+ metadata : {
107+ startIndex : chunkStart ,
108+ endIndex : chunkEnd ,
109+ hasFrontmatter : i === 0 && content . startsWith ( '---' ) ,
110+ documentTitle : frontmatter . title ,
111+ documentDescription : frontmatter . description ,
112+ } ,
113+ }
114+
115+ chunks . push ( chunk )
116+ currentPosition = chunkEnd
117+ }
118+
119+ return chunks
120+ }
121+
122+ /**
123+ * Find all .mdx files recursively
124+ */
125+ private async findMdxFiles ( dirPath : string ) : Promise < string [ ] > {
126+ const files : string [ ] = [ ]
127+
128+ const entries = await fs . readdir ( dirPath , { withFileTypes : true } )
129+
130+ for ( const entry of entries ) {
131+ const fullPath = path . join ( dirPath , entry . name )
132+
133+ if ( entry . isDirectory ( ) ) {
134+ const subFiles = await this . findMdxFiles ( fullPath )
135+ files . push ( ...subFiles )
136+ } else if ( entry . isFile ( ) && entry . name . endsWith ( '.mdx' ) ) {
137+ files . push ( fullPath )
138+ }
139+ }
140+
141+ return files
142+ }
143+
144+ /**
145+ * Extract headers and their positions from markdown content
146+ */
147+ private extractHeaders ( content : string ) : HeaderInfo [ ] {
148+ const headers : HeaderInfo [ ] = [ ]
149+ const headerRegex = / ^ ( # { 1 , 6 } ) \s + ( .+ ) $ / gm
150+ let match
151+
152+ while ( ( match = headerRegex . exec ( content ) ) !== null ) {
153+ const level = match [ 1 ] . length
154+ const text = match [ 2 ] . trim ( )
155+ const anchor = this . generateAnchor ( text )
156+
157+ headers . push ( {
158+ text,
159+ level,
160+ anchor,
161+ position : match . index ,
162+ } )
163+ }
164+
165+ return headers
166+ }
167+
168+ /**
169+ * Generate URL-safe anchor from header text
170+ */
171+ private generateAnchor ( headerText : string ) : string {
172+ return headerText
173+ . toLowerCase ( )
174+ . replace ( / [ ^ \w \s - ] / g, '' ) // Remove special characters except hyphens
175+ . replace ( / \s + / g, '-' ) // Replace spaces with hyphens
176+ . replace ( / - + / g, '-' ) // Replace multiple hyphens with single
177+ . replace ( / ^ - | - $ / g, '' ) // Remove leading/trailing hyphens
178+ }
179+
180+ /**
181+ * Generate document URL from relative path
182+ */
183+ private generateDocumentUrl ( relativePath : string ) : string {
184+ // Convert file path to URL path
185+ // e.g., "tools/knowledge.mdx" -> "/tools/knowledge"
186+ const urlPath = relativePath
187+ . replace ( / \. m d x $ / , '' )
188+ . replace ( / \\ / g, '/' ) // Handle Windows paths
189+
190+ return `${ this . baseUrl } /${ urlPath } `
191+ }
192+
193+ /**
194+ * Find the most relevant header for a given position
195+ */
196+ private findRelevantHeader ( headers : HeaderInfo [ ] , position : number ) : HeaderInfo | null {
197+ if ( headers . length === 0 ) return null
198+
199+ // Find the last header that comes before this position
200+ let relevantHeader : HeaderInfo | null = null
201+
202+ for ( const header of headers ) {
203+ if ( header . position <= position ) {
204+ relevantHeader = header
205+ } else {
206+ break
207+ }
208+ }
209+
210+ return relevantHeader
211+ }
212+
213+ /**
214+ * Split content into chunks using the existing TextChunker
215+ */
216+ private async splitContent ( content : string ) : Promise < string [ ] > {
217+ // Clean the content first
218+ const cleanedContent = this . cleanContent ( content )
219+
220+ // Use the existing TextChunker
221+ const chunks = await this . textChunker . chunk ( cleanedContent )
222+
223+ return chunks . map ( chunk => chunk . text )
224+ }
225+
226+ /**
227+ * Clean content by removing MDX-specific elements and excessive whitespace
228+ */
229+ private cleanContent ( content : string ) : string {
230+ return content
231+ // Remove import statements
232+ . replace ( / ^ i m p o r t \s + .* $ / gm, '' )
233+ // Remove JSX components and React-style comments
234+ . replace ( / < [ ^ > ] + > / g, ' ' )
235+ . replace ( / \{ \/ \* [ \s \S ] * ?\* \/ \} / g, ' ' )
236+ // Remove excessive whitespace
237+ . replace ( / \n { 3 , } / g, '\n\n' )
238+ . replace ( / [ \t ] { 2 , } / g, ' ' )
239+ . trim ( )
240+ }
241+
242+
243+
244+ /**
245+ * Parse frontmatter from MDX content
246+ */
247+ private parseFrontmatter ( content : string ) : { data : Frontmatter ; content : string } {
248+ const frontmatterRegex = / ^ - - - \r ? \n ( [ \s \S ] * ?) \r ? \n - - - \r ? \n ( [ \s \S ] * ) $ /
249+ const match = content . match ( frontmatterRegex )
250+
251+ if ( ! match ) {
252+ return { data : { } , content }
253+ }
254+
255+ const [ , frontmatterText , markdownContent ] = match
256+ const data : Frontmatter = { }
257+
258+ // Simple YAML parsing for title and description
259+ const lines = frontmatterText . split ( '\n' )
260+ for ( const line of lines ) {
261+ const colonIndex = line . indexOf ( ':' )
262+ if ( colonIndex > 0 ) {
263+ const key = line . slice ( 0 , colonIndex ) . trim ( )
264+ const value = line . slice ( colonIndex + 1 ) . trim ( ) . replace ( / ^ [ ' " ] | [ ' " ] $ / g, '' )
265+ data [ key ] = value
266+ }
267+ }
268+
269+ return { data, content : markdownContent }
270+ }
271+
272+ }