11import fs from 'fs/promises'
22import path from 'path'
33import { createLogger } from '@/lib/logs/console-logger'
4- import { TextChunker } from './chunker'
54import { generateEmbeddings } from '@/app/api/knowledge/utils'
5+ import { TextChunker } from './chunker'
66import type { DocChunk , DocsChunkerOptions , HeaderInfo } from './types'
77
88interface Frontmatter {
@@ -35,7 +35,7 @@ export class DocsChunker {
3535 */
3636 async chunkAllDocs ( docsPath : string ) : Promise < DocChunk [ ] > {
3737 const allChunks : DocChunk [ ] = [ ]
38-
38+
3939 try {
4040 const mdxFiles = await this . findMdxFiles ( docsPath )
4141 logger . info ( `Found ${ mdxFiles . length } .mdx files to process` )
@@ -64,36 +64,36 @@ export class DocsChunker {
6464 async chunkMdxFile ( filePath : string , basePath : string ) : Promise < DocChunk [ ] > {
6565 const content = await fs . readFile ( filePath , 'utf-8' )
6666 const relativePath = path . relative ( basePath , filePath )
67-
67+
6868 // Parse frontmatter and content
6969 const { data : frontmatter , content : markdownContent } = this . parseFrontmatter ( content )
70-
70+
7171 // Extract headers from the content
7272 const headers = this . extractHeaders ( markdownContent )
73-
73+
7474 // Generate document URL
7575 const documentUrl = this . generateDocumentUrl ( relativePath )
76-
76+
7777 // Split content into chunks
7878 const textChunks = await this . splitContent ( markdownContent )
79-
79+
8080 // Generate embeddings for all chunks at once (batch processing)
8181 logger . info ( `Generating embeddings for ${ textChunks . length } chunks in ${ relativePath } ` )
8282 const embeddings = textChunks . length > 0 ? await generateEmbeddings ( textChunks ) : [ ]
8383 const embeddingModel = 'text-embedding-3-small'
84-
84+
8585 // Convert to DocChunk objects with header context and embeddings
8686 const chunks : DocChunk [ ] = [ ]
8787 let currentPosition = 0
88-
88+
8989 for ( let i = 0 ; i < textChunks . length ; i ++ ) {
9090 const chunkText = textChunks [ i ]
9191 const chunkStart = currentPosition
9292 const chunkEnd = currentPosition + chunkText . length
93-
93+
9494 // Find the most relevant header for this chunk
9595 const relevantHeader = this . findRelevantHeader ( headers , chunkStart )
96-
96+
9797 const chunk : DocChunk = {
9898 text : chunkText ,
9999 tokenCount : Math . ceil ( chunkText . length / 4 ) , // Simple token estimation
@@ -111,11 +111,11 @@ export class DocsChunker {
111111 documentDescription : frontmatter . description ,
112112 } ,
113113 }
114-
114+
115115 chunks . push ( chunk )
116116 currentPosition = chunkEnd
117117 }
118-
118+
119119 return chunks
120120 }
121121
@@ -124,20 +124,20 @@ export class DocsChunker {
124124 */
private async findMdxFiles(dirPath: string): Promise<string[]> {
  // Depth-first walk: each sub-directory is fully expanded before the next
  // sibling entry is examined, so results keep the readdir order per level.
  const collected: string[] = []
  const dirents = await fs.readdir(dirPath, { withFileTypes: true })

  for (const dirent of dirents) {
    const entryPath = path.join(dirPath, dirent.name)

    if (dirent.isDirectory()) {
      // Recurse and splice the nested results in place.
      collected.push(...(await this.findMdxFiles(entryPath)))
      continue
    }

    // Only regular files whose name ends in ".mdx" are kept.
    if (dirent.isFile() && dirent.name.endsWith('.mdx')) {
      collected.push(entryPath)
    }
  }

  return collected
}
143143
@@ -148,20 +148,20 @@ export class DocsChunker {
148148 const headers : HeaderInfo [ ] = [ ]
149149 const headerRegex = / ^ ( # { 1 , 6 } ) \s + ( .+ ) $ / gm
150150 let match
151-
151+
152152 while ( ( match = headerRegex . exec ( content ) ) !== null ) {
153153 const level = match [ 1 ] . length
154154 const text = match [ 2 ] . trim ( )
155155 const anchor = this . generateAnchor ( text )
156-
156+
157157 headers . push ( {
158158 text,
159159 level,
160160 anchor,
161161 position : match . index ,
162162 } )
163163 }
164-
164+
165165 return headers
166166 }
167167
@@ -183,10 +183,8 @@ export class DocsChunker {
private generateDocumentUrl(relativePath: string): string {
  // Convert a docs-relative file path to a URL path,
  // e.g. "tools/knowledge.mdx" -> "<baseUrl>/tools/knowledge".
  const urlPath = relativePath
    .replace(/\.mdx$/, '') // drop the file extension
    .replace(/\\/g, '/') // Windows separators -> URL separators

  // Drop any trailing slashes on the base so the join below never yields "//".
  // A base without a trailing slash is left untouched.
  const base = `${this.baseUrl}`.replace(/\/+$/, '')

  return `${base}/${urlPath}`
}
192190
@@ -195,18 +193,18 @@ export class DocsChunker {
195193 */
private findRelevantHeader(headers: HeaderInfo[], position: number): HeaderInfo | null {
  // Scan headers in the order given, remembering the most recent one whose
  // position is at or before `position`. The scan stops at the first header
  // that is past `position`, so the result is only the "last header before"
  // when `headers` is ordered by position.
  let latest: HeaderInfo | null = null

  for (const candidate of headers) {
    const isAtOrBefore = candidate.position <= position
    if (!isAtOrBefore) {
      return latest
    }
    latest = candidate
  }

  // Empty list, or every header was at or before `position`.
  return latest
}
212210
@@ -216,57 +214,59 @@ export class DocsChunker {
private async splitContent(content: string): Promise<string[]> {
  // Strip MDX-only syntax first, then hand the cleaned text to the shared
  // TextChunker instance and keep just the text of each resulting chunk.
  const pieces = await this.textChunker.chunk(this.cleanContent(content))

  return pieces.map(({ text }) => text)
}
225223
/**
 * Clean content by removing MDX-specific elements and excessive whitespace.
 *
 * Steps, in order:
 *  1. blank out lines that start with `import `,
 *  2. replace anything of the form `<...>` and `{/* ... *\/}` comments with a space,
 *  3. strip trailing spaces/tabs from every line,
 *  4. collapse runs of 3+ newlines to a single blank line,
 *  5. collapse runs of 2+ spaces/tabs to one space, then trim the result.
 *
 * @param content Raw markdown/MDX body (frontmatter already removed by the caller).
 * @returns The cleaned text.
 */
private cleanContent(content: string): string {
  return (
    content
      // Remove import statements
      .replace(/^import\s+.*$/gm, '')
      // Remove JSX components and React-style comments
      .replace(/<[^>]+>/g, ' ')
      .replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
      // The replacements above leave lines holding only spaces; strip trailing
      // horizontal whitespace so those lines are truly empty and the newline
      // collapse below can actually see consecutive newlines.
      .replace(/[ \t]+$/gm, '')
      // Remove excessive whitespace
      .replace(/\n{3,}/g, '\n\n')
      .replace(/[ \t]{2,}/g, ' ')
      .trim()
  )
}
241241
242-
243-
/**
 * Parse frontmatter from MDX content.
 *
 * Recognises a leading block delimited by `---` lines (LF or CRLF). Each
 * line inside it that contains a colon after at least one character is
 * read as `key: value`; this is a minimal line-based reader, not a YAML parser.
 *
 * @param content Full file contents.
 * @returns `data` with the parsed key/value pairs and `content` with the
 *          text after the closing delimiter. When no frontmatter block is
 *          found, `data` is empty and `content` is returned unchanged.
 */
private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
  const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
  const match = content.match(frontmatterRegex)

  if (!match) {
    return { data: {}, content }
  }

  const [, frontmatterText, markdownContent] = match
  const data: Frontmatter = {}

  // Simple YAML parsing for title and description
  const lines = frontmatterText.split('\n')
  for (const line of lines) {
    const colonIndex = line.indexOf(':')
    if (colonIndex > 0) {
      const key = line.slice(0, colonIndex).trim()
      // trim() also removes the trailing "\r" left by CRLF files.
      const raw = line.slice(colonIndex + 1).trim()

      // Strip quotes only when the value is wrapped in a matching pair.
      // Stripping each end independently would corrupt values such as
      // `Say "hi"` (-> `Say "hi`) that merely end or start with a quote.
      const isQuoted =
        raw.length >= 2 &&
        ((raw.startsWith('"') && raw.endsWith('"')) ||
          (raw.startsWith("'") && raw.endsWith("'")))

      data[key] = isQuoted ? raw.slice(1, -1) : raw
    }
  }

  return { data, content: markdownContent }
}
271-
272- }
272+ }
0 commit comments