refactor(chunker): replace chonkie with custom TextChunker (#479)

adiologydev · web-flow · commit 3a0e389883f6 · 2025-06-13T00:26:17.000-07:00
* refactor(chunker): replace chonkie with custom TextChunker implementation and update document processing logic

* chore: cleanup unimplemented types
diff --git a/apps/sim/app/api/knowledge/utils.ts b/apps/sim/app/api/knowledge/utils.ts
@@ -179,7 +179,11 @@ export async function checkDocumentAccess(
     .limit(1)
 
   if (kb.length === 0) {
-    return { hasAccess: false, notFound: true, reason: 'Knowledge base not found' }
+    return {
+      hasAccess: false,
+      notFound: true,
+      reason: 'Knowledge base not found',
+    }
   }
 
   const kbData = kb[0]
@@ -204,7 +208,11 @@ export async function checkDocumentAccess(
     return { hasAccess: false, notFound: true, reason: 'Document not found' }
   }
 
-  return { hasAccess: true, document: doc[0] as DocumentData, knowledgeBase: kbData }
+  return {
+    hasAccess: true,
+    document: doc[0] as DocumentData,
+    knowledgeBase: kbData,
+  }
 }
 
 /**
@@ -226,7 +234,11 @@ export async function checkChunkAccess(
     .limit(1)
 
   if (kb.length === 0) {
-    return { hasAccess: false, notFound: true, reason: 'Knowledge base not found' }
+    return {
+      hasAccess: false,
+      notFound: true,
+      reason: 'Knowledge base not found',
+    }
   }
 
   const kbData = kb[0]
@@ -425,8 +437,8 @@ export async function processDocumentAsync(
       tokenCount: Math.ceil(chunk.text.length / 4),
       embedding: embeddings[chunkIndex] || null,
       embeddingModel: 'text-embedding-3-small',
-      startOffset: chunk.startIndex || 0,
-      endOffset: chunk.endIndex || chunk.text.length,
+      startOffset: chunk.metadata.startIndex,
+      endOffset: chunk.metadata.endIndex,
       overlapTokens: 0,
       metadata: {},
       searchRank: '1.0',
diff --git a/apps/sim/lib/documents/chunker.ts b/apps/sim/lib/documents/chunker.ts
@@ -0,0 +1,306 @@
+/**
+ * Character span and token estimate for a chunk.
+ * Not referenced by TextChunker in this file; kept because it is exported.
+ */
+export interface ChunkMetadata {
+  startIndex: number
+  endIndex: number
+  tokenCount: number
+}
+
+/**
+ * Chunk shape pairing text with a ChunkMetadata record.
+ * Not referenced by TextChunker in this file; kept because it is exported.
+ */
+export interface TextChunk {
+  text: string
+  metadata: ChunkMetadata
+}
+
+/** Tuning options for TextChunker; every field is optional. */
+export interface ChunkerOptions {
+  /** Maximum estimated tokens per chunk, before overlap is added (default 512). */
+  chunkSize?: number
+  /** Minimum chunk length in characters; shorter pieces are dropped (default 50). */
+  minChunkSize?: number
+  /** Number of trailing words of the previous chunk to prepend (default 0). */
+  overlap?: number
+}
+
+/** A chunk produced by TextChunker.chunk(). */
+export interface Chunk {
+  /** Chunk text, including any overlap prefix. */
+  text: string
+  /** Estimated token count of `text`. */
+  tokenCount: number
+  metadata: {
+    /** Offset in the cleaned text where the chunk, overlap included, starts. */
+    startIndex: number
+    /** Offset in the cleaned text just past the end of the chunk. */
+    endIndex: number
+  }
+}
+
+/**
+ * Lightweight text chunker optimized for RAG applications
+ * Uses hierarchical splitting with smart token estimation
+ *
+ * Units differ per option: `chunkSize` is in estimated tokens, `minChunkSize`
+ * in characters and `overlap` in words.
+ */
+export class TextChunker {
+  private readonly chunkSize: number
+  private readonly minChunkSize: number
+  private readonly overlap: number
+
+  // Hierarchical separators ordered from largest to smallest semantic units
+  private readonly separators = [
+    '\n\n\n', // Document sections (absent once cleanText has collapsed 3+ newlines)
+    '\n---\n', // Markdown horizontal rules
+    '\n***\n', // Markdown horizontal rules (alternative)
+    '\n___\n', // Markdown horizontal rules (alternative)
+    '\n# ', // Markdown H1 headings
+    '\n## ', // Markdown H2 headings
+    '\n### ', // Markdown H3 headings
+    '\n#### ', // Markdown H4 headings
+    '\n##### ', // Markdown H5 headings
+    '\n###### ', // Markdown H6 headings
+    '\n\n', // Paragraphs
+    '\n', // Lines
+    '. ', // Sentences
+    '! ', // Exclamations
+    '? ', // Questions
+    '; ', // Semicolons
+    ', ', // Commas
+    ' ', // Words
+  ]
+
+  /**
+   * @param options - Optional limits. Missing or non-finite values fall back to
+   *   the defaults (512 / 50 / 0). Values are floored; `chunkSize` is raised to
+   *   at least 1 and the other two to at least 0.
+   */
+  constructor(options: ChunkerOptions = {}) {
+    this.chunkSize = TextChunker.normalizeOption(options.chunkSize, 512, 1)
+    this.minChunkSize = TextChunker.normalizeOption(options.minChunkSize, 50, 0)
+    this.overlap = TextChunker.normalizeOption(options.overlap, 0, 0)
+  }
+
+  /**
+   * Coerce a numeric option to a usable integer.
+   * A chunkSize of zero or below would make the forced character split advance
+   * by a non-positive step and never terminate, so a lower bound is enforced.
+   */
+  private static normalizeOption(
+    value: number | undefined,
+    fallback: number,
+    minimum: number
+  ): number {
+    if (value == null || !Number.isFinite(value)) return fallback
+    return Math.max(minimum, Math.floor(value))
+  }
+
+  /**
+   * Estimate token count - optimized for common tokenizers
+   *
+   * @param text - Text to measure.
+   * @returns A heuristic count based on whitespace-separated word lengths;
+   *   0 for empty or whitespace-only text.
+   */
+  private estimateTokens(text: string): number {
+    // Handle empty or whitespace-only text
+    if (!text?.trim()) return 0
+
+    const words = text.trim().split(/\s+/)
+    let tokenCount = 0
+
+    for (const word of words) {
+      if (word.length === 0) continue
+
+      // Short words (1-4 chars) are usually 1 token
+      if (word.length <= 4) {
+        tokenCount += 1
+      }
+      // Medium words (5-8 chars) are usually 1-2 tokens
+      else if (word.length <= 8) {
+        tokenCount += Math.ceil(word.length / 5)
+      }
+      // Long words get split more by subword tokenization
+      else {
+        tokenCount += Math.ceil(word.length / 4)
+      }
+    }
+
+    return tokenCount
+  }
+
+  /**
+   * Split text recursively using hierarchical separators
+   *
+   * Pieces shorter than `minChunkSize` characters are discarded, and the
+   * separator between two emitted chunks belongs to neither of them.
+   *
+   * @param text - Text to split.
+   * @param separatorIndex - Index into `separators` to try first.
+   * @returns Chunk strings in document order.
+   */
+  private splitRecursively(text: string, separatorIndex = 0): string[] {
+    const tokenCount = this.estimateTokens(text)
+
+    // If chunk is small enough, return it
+    if (tokenCount <= this.chunkSize) {
+      return text.length >= this.minChunkSize ? [text] : []
+    }
+
+    // If we've run out of separators, force split by character count
+    if (separatorIndex >= this.separators.length) {
+      const chunks: string[] = []
+      // Scale the character length by the token budget; never step by less than 1
+      const targetLength = Math.max(1, Math.ceil((text.length * this.chunkSize) / tokenCount))
+
+      for (let i = 0; i < text.length; i += targetLength) {
+        const chunk = text.slice(i, i + targetLength).trim()
+        if (chunk.length > 0 && chunk.length >= this.minChunkSize) {
+          chunks.push(chunk)
+        }
+      }
+      return chunks
+    }
+
+    const separator = this.separators[separatorIndex]
+    const parts = text.split(separator).filter((part) => part.trim())
+
+    // If no split occurred, try next separator
+    if (parts.length <= 1) {
+      return this.splitRecursively(text, separatorIndex + 1)
+    }
+
+    const chunks: string[] = []
+    let currentChunk = ''
+
+    for (const part of parts) {
+      const testChunk = currentChunk + (currentChunk ? separator : '') + part
+
+      if (this.estimateTokens(testChunk) <= this.chunkSize) {
+        currentChunk = testChunk
+      } else {
+        // Save current chunk if it meets minimum size
+        if (currentChunk.trim() && currentChunk.length >= this.minChunkSize) {
+          chunks.push(currentChunk.trim())
+        }
+
+        // Start new chunk with current part
+        // If part itself is too large, split it further
+        if (this.estimateTokens(part) > this.chunkSize) {
+          chunks.push(...this.splitRecursively(part, separatorIndex + 1))
+          currentChunk = ''
+        } else {
+          currentChunk = part
+        }
+      }
+    }
+
+    // Add final chunk if it exists and meets minimum size
+    if (currentChunk.trim() && currentChunk.length >= this.minChunkSize) {
+      chunks.push(currentChunk.trim())
+    }
+
+    return chunks
+  }
+
+  /**
+   * Clean and normalize text
+   *
+   * @param text - Raw input.
+   * @returns Trimmed text with `\n` line endings, at most two consecutive
+   *   newlines, tabs turned into spaces and runs of spaces collapsed.
+   */
+  private cleanText(text: string): string {
+    return text
+      .replace(/\r\n/g, '\n') // Normalize Windows line endings
+      .replace(/\r/g, '\n') // Normalize old Mac line endings
+      .replace(/\n{3,}/g, '\n\n') // Limit consecutive newlines
+      .replace(/\t/g, ' ') // Convert tabs to spaces
+      .replace(/ {2,}/g, ' ') // Collapse multiple spaces
+      .trim()
+  }
+
+  /**
+   * Find the last `count` whitespace-separated words of `text`.
+   *
+   * @param text - Text to scan.
+   * @param count - Maximum number of words to return.
+   * @returns The words joined by single spaces plus the offset in `text` where
+   *   the first of them begins, or null when `text` contains no words.
+   */
+  private trailingWords(text: string, count: number): { text: string; offset: number } | null {
+    const words: string[] = []
+    const offsets: number[] = []
+    const wordPattern = /\S+/g
+
+    for (let match = wordPattern.exec(text); match !== null; match = wordPattern.exec(text)) {
+      words.push(match[0])
+      offsets.push(match.index)
+    }
+
+    if (words.length === 0) return null
+
+    const first = Math.max(0, words.length - count)
+    return { text: words.slice(first).join(' '), offset: offsets[first] ?? 0 }
+  }
+
+  /**
+   * Main chunking method
+   *
+   * Offsets in the returned metadata refer to the cleaned text (line endings
+   * normalized, whitespace collapsed, trimmed), not to the raw input.
+   *
+   * @param text - Raw document text.
+   * @returns Chunks in document order; an empty array for blank input.
+   */
+  async chunk(text: string): Promise<Chunk[]> {
+    if (!text?.trim()) {
+      return []
+    }
+
+    const cleanedText = this.cleanText(text)
+    const pieces = this.splitRecursively(cleanedText)
+
+    const result: Chunk[] = []
+    let cursor = 0
+    let previous: { text: string; start: number } | null = null
+
+    for (const piece of pieces) {
+      // Locate the piece in the cleaned text instead of summing chunk lengths:
+      // separators and discarded fragments sit between consecutive pieces.
+      const found = cleanedText.indexOf(piece, cursor)
+      // A piece rebuilt around a dropped empty part may not occur verbatim;
+      // fall back to the position right after the previous piece.
+      const pieceStart = found === -1 ? cursor : found
+      const pieceEnd = Math.min(cleanedText.length, pieceStart + piece.length)
+
+      let chunkText = piece
+      let startIndex = pieceStart
+
+      // Prepend the tail of the previous piece (not of the previous overlapped chunk)
+      if (this.overlap > 0 && previous !== null) {
+        const tail = this.trailingWords(previous.text, this.overlap)
+        if (tail !== null) {
+          chunkText = `${tail.text} ${piece}`
+          startIndex = previous.start + tail.offset
+        }
+      }
+
+      result.push({
+        text: chunkText,
+        tokenCount: this.estimateTokens(chunkText),
+        metadata: { startIndex, endIndex: pieceEnd },
+      })
+
+      cursor = pieceEnd
+      previous = { text: piece, start: pieceStart }
+    }
+
+    return result
+  }
+}
diff --git a/apps/sim/lib/documents/document-processor.ts b/apps/sim/lib/documents/document-processor.ts
@@ -1,5 +1,4 @@
-import { RecursiveChunker } from 'chonkie'
-import type { RecursiveChunk } from 'chonkie/types'
+import { type Chunk, TextChunker } from '@/lib/documents/chunker'
 import { env } from '@/lib/env'
 import { isSupportedFileType, parseBuffer, parseFile } from '@/lib/file-parsers'
 import { createLogger } from '@/lib/logs/console-logger'
@@ -26,7 +25,7 @@ class APIError extends Error {
 
 export interface ProcessedDocument {
   content: string
-  chunks: RecursiveChunk[]
+  chunks: Chunk[]
   metadata: {
     filename: string
     fileSize: number
@@ -235,40 +234,31 @@ async function parseDocument(
 }
 
 /**
- * Chunk text content using RecursiveChunker
+ * Chunk text content using TextChunker
  */
-async function chunkContent(
-  content: string,
-  options: DocumentProcessingOptions
-): Promise<RecursiveChunk[]> {
-  const chunker = await RecursiveChunker.create({
+async function chunkContent(content: string, options: DocumentProcessingOptions): Promise<Chunk[]> {
+  const chunker = new TextChunker({
     chunkSize: options.chunkSize || 512,
-    minCharactersPerChunk: options.minCharactersPerChunk || 24,
+    minChunkSize: options.minCharactersPerChunk || 24,
   })
 
   try {
-    logger.info('Chunking content with RecursiveChunker', {
+    logger.info('Chunking content with TextChunker', {
       contentLength: content.length,
       chunkSize: options.chunkSize || 512,
     })
 
     const chunks = await chunker.chunk(content)
 
     logger.info(`Successfully created ${chunks.length} chunks`)
-    return chunks as RecursiveChunk[]
+    return chunks
   } catch (error) {
     logger.error('Chunking failed:', error)
     throw new Error(
       `Text chunking failed: ${error instanceof Error ? error.message : 'Unknown error'}`
     )
   }
 }
-/**
- * Calculate token count estimation (rough approximation: 4 chars per token)
- */
-function estimateTokenCount(text: string): number {
-  return Math.ceil(text.length / 4)
-}
 
 /**
  * Process a single document: parse content and create chunks
@@ -300,7 +290,7 @@ export async function processDocument(
 
     // Step 3: Calculate metadata
     const characterCount = content.length
-    const tokenCount = estimateTokenCount(content)
+    const tokenCount = chunks.reduce((acc, chunk) => acc + chunk.tokenCount, 0)
     const chunkCount = chunks.length
 
     const processedDocument: ProcessedDocument = {
diff --git a/apps/sim/package.json b/apps/sim/package.json
@@ -68,7 +68,6 @@
     "ai": "^4.3.2",
     "better-auth": "^1.2.9",
     "browser-image-compression": "^2.0.2",
-    "chonkie": "^0.2.5",
     "class-variance-authority": "^0.7.1",
     "clsx": "^2.1.1",
     "cmdk": "^1.0.0",
diff --git a/bun.lock b/bun.lock