Skip to content

Commit 3a0e389

Browse files
authored
refactor(chunker): replace chonkie with custom TextChunker (#479)
* refactor(chunker): replace chonkie with custom TextChunker implementation and update document processing logic
* chore: cleanup unimplemented types
1 parent 187af53 commit 3a0e389

File tree

5 files changed

+295
-171
lines changed

5 files changed

+295
-171
lines changed

apps/sim/app/api/knowledge/utils.ts

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,11 @@ export async function checkDocumentAccess(
179179
.limit(1)
180180

181181
if (kb.length === 0) {
182-
return { hasAccess: false, notFound: true, reason: 'Knowledge base not found' }
182+
return {
183+
hasAccess: false,
184+
notFound: true,
185+
reason: 'Knowledge base not found',
186+
}
183187
}
184188

185189
const kbData = kb[0]
@@ -204,7 +208,11 @@ export async function checkDocumentAccess(
204208
return { hasAccess: false, notFound: true, reason: 'Document not found' }
205209
}
206210

207-
return { hasAccess: true, document: doc[0] as DocumentData, knowledgeBase: kbData }
211+
return {
212+
hasAccess: true,
213+
document: doc[0] as DocumentData,
214+
knowledgeBase: kbData,
215+
}
208216
}
209217

210218
/**
@@ -226,7 +234,11 @@ export async function checkChunkAccess(
226234
.limit(1)
227235

228236
if (kb.length === 0) {
229-
return { hasAccess: false, notFound: true, reason: 'Knowledge base not found' }
237+
return {
238+
hasAccess: false,
239+
notFound: true,
240+
reason: 'Knowledge base not found',
241+
}
230242
}
231243

232244
const kbData = kb[0]
@@ -425,8 +437,8 @@ export async function processDocumentAsync(
425437
tokenCount: Math.ceil(chunk.text.length / 4),
426438
embedding: embeddings[chunkIndex] || null,
427439
embeddingModel: 'text-embedding-3-small',
428-
startOffset: chunk.startIndex || 0,
429-
endOffset: chunk.endIndex || chunk.text.length,
440+
startOffset: chunk.metadata.startIndex,
441+
endOffset: chunk.metadata.endIndex,
430442
overlapTokens: 0,
431443
metadata: {},
432444
searchRank: '1.0',

apps/sim/lib/documents/chunker.ts

Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
export interface ChunkMetadata {
2+
startIndex: number
3+
endIndex: number
4+
tokenCount: number
5+
}
6+
7+
export interface TextChunk {
8+
text: string
9+
metadata: ChunkMetadata
10+
}
11+
12+
export interface ChunkerOptions {
13+
chunkSize?: number
14+
minChunkSize?: number
15+
overlap?: number
16+
}
17+
18+
export interface Chunk {
19+
text: string
20+
tokenCount: number
21+
metadata: {
22+
startIndex: number
23+
endIndex: number
24+
}
25+
}
26+
27+
/**
28+
* Lightweight text chunker optimized for RAG applications
29+
* Uses hierarchical splitting with smart token estimation
30+
*/
31+
export class TextChunker {
32+
private readonly chunkSize: number
33+
private readonly minChunkSize: number
34+
private readonly overlap: number
35+
36+
// Hierarchical separators ordered from largest to smallest semantic units
37+
private readonly separators = [
38+
'\n\n\n', // Document sections
39+
'\n---\n', // Markdown horizontal rules
40+
'\n***\n', // Markdown horizontal rules (alternative)
41+
'\n___\n', // Markdown horizontal rules (alternative)
42+
'\n# ', // Markdown H1 headings
43+
'\n## ', // Markdown H2 headings
44+
'\n### ', // Markdown H3 headings
45+
'\n#### ', // Markdown H4 headings
46+
'\n##### ', // Markdown H5 headings
47+
'\n###### ', // Markdown H6 headings
48+
'\n\n', // Paragraphs
49+
'\n', // Lines
50+
'. ', // Sentences
51+
'! ', // Exclamations
52+
'? ', // Questions
53+
'; ', // Semicolons
54+
', ', // Commas
55+
' ', // Words
56+
]
57+
58+
constructor(options: ChunkerOptions = {}) {
59+
this.chunkSize = options.chunkSize ?? 512
60+
this.minChunkSize = options.minChunkSize ?? 50
61+
this.overlap = options.overlap ?? 0
62+
}
63+
64+
/**
65+
* Estimate token count - optimized for common tokenizers
66+
*/
67+
private estimateTokens(text: string): number {
68+
// Handle empty or whitespace-only text
69+
if (!text?.trim()) return 0
70+
71+
const words = text.trim().split(/\s+/)
72+
let tokenCount = 0
73+
74+
for (const word of words) {
75+
if (word.length === 0) continue
76+
77+
// Short words (1-4 chars) are usually 1 token
78+
if (word.length <= 4) {
79+
tokenCount += 1
80+
}
81+
// Medium words (5-8 chars) are usually 1-2 tokens
82+
else if (word.length <= 8) {
83+
tokenCount += Math.ceil(word.length / 5)
84+
}
85+
// Long words get split more by subword tokenization
86+
else {
87+
tokenCount += Math.ceil(word.length / 4)
88+
}
89+
}
90+
91+
return tokenCount
92+
}
93+
94+
/**
95+
* Split text recursively using hierarchical separators
96+
*/
97+
private splitRecursively(text: string, separatorIndex = 0): string[] {
98+
const tokenCount = this.estimateTokens(text)
99+
100+
// If chunk is small enough, return it
101+
if (tokenCount <= this.chunkSize) {
102+
return text.length >= this.minChunkSize ? [text] : []
103+
}
104+
105+
// If we've run out of separators, force split by character count
106+
if (separatorIndex >= this.separators.length) {
107+
const chunks: string[] = []
108+
const targetLength = Math.ceil((text.length * this.chunkSize) / tokenCount)
109+
110+
for (let i = 0; i < text.length; i += targetLength) {
111+
const chunk = text.slice(i, i + targetLength).trim()
112+
if (chunk.length >= this.minChunkSize) {
113+
chunks.push(chunk)
114+
}
115+
}
116+
return chunks
117+
}
118+
119+
const separator = this.separators[separatorIndex]
120+
const parts = text.split(separator).filter((part) => part.trim())
121+
122+
// If no split occurred, try next separator
123+
if (parts.length <= 1) {
124+
return this.splitRecursively(text, separatorIndex + 1)
125+
}
126+
127+
const chunks: string[] = []
128+
let currentChunk = ''
129+
130+
for (const part of parts) {
131+
const testChunk = currentChunk + (currentChunk ? separator : '') + part
132+
133+
if (this.estimateTokens(testChunk) <= this.chunkSize) {
134+
currentChunk = testChunk
135+
} else {
136+
// Save current chunk if it meets minimum size
137+
if (currentChunk.trim() && currentChunk.length >= this.minChunkSize) {
138+
chunks.push(currentChunk.trim())
139+
}
140+
141+
// Start new chunk with current part
142+
// If part itself is too large, split it further
143+
if (this.estimateTokens(part) > this.chunkSize) {
144+
chunks.push(...this.splitRecursively(part, separatorIndex + 1))
145+
currentChunk = ''
146+
} else {
147+
currentChunk = part
148+
}
149+
}
150+
}
151+
152+
// Add final chunk if it exists and meets minimum size
153+
if (currentChunk.trim() && currentChunk.length >= this.minChunkSize) {
154+
chunks.push(currentChunk.trim())
155+
}
156+
157+
return chunks
158+
}
159+
160+
/**
161+
* Add overlap between chunks if specified
162+
*/
163+
private addOverlap(chunks: string[]): string[] {
164+
if (this.overlap <= 0 || chunks.length <= 1) {
165+
return chunks
166+
}
167+
168+
const overlappedChunks: string[] = []
169+
170+
for (let i = 0; i < chunks.length; i++) {
171+
let chunk = chunks[i]
172+
173+
// Add overlap from previous chunk
174+
if (i > 0) {
175+
const prevChunk = chunks[i - 1]
176+
const words = prevChunk.split(/\s+/)
177+
const overlapWords = words.slice(-Math.min(this.overlap, words.length))
178+
179+
if (overlapWords.length > 0) {
180+
chunk = `${overlapWords.join(' ')} ${chunk}`
181+
}
182+
}
183+
184+
overlappedChunks.push(chunk)
185+
}
186+
187+
return overlappedChunks
188+
}
189+
190+
/**
191+
* Clean and normalize text
192+
*/
193+
private cleanText(text: string): string {
194+
return text
195+
.replace(/\r\n/g, '\n') // Normalize Windows line endings
196+
.replace(/\r/g, '\n') // Normalize old Mac line endings
197+
.replace(/\n{3,}/g, '\n\n') // Limit consecutive newlines
198+
.replace(/\t/g, ' ') // Convert tabs to spaces
199+
.replace(/ {2,}/g, ' ') // Collapse multiple spaces
200+
.trim()
201+
}
202+
203+
/**
204+
* Main chunking method
205+
*/
206+
async chunk(text: string): Promise<Chunk[]> {
207+
if (!text?.trim()) {
208+
return []
209+
}
210+
211+
// Clean the text
212+
const cleanedText = this.cleanText(text)
213+
214+
// Split into chunks
215+
let chunks = this.splitRecursively(cleanedText)
216+
217+
// Add overlap if configured
218+
chunks = this.addOverlap(chunks)
219+
220+
// Convert to Chunk objects with metadata
221+
let previousEndIndex = 0
222+
return chunks.map((chunkText, index) => {
223+
let startIndex: number
224+
let actualContentLength: number
225+
226+
if (index === 0 || this.overlap <= 0) {
227+
// First chunk or no overlap - start from previous end
228+
startIndex = previousEndIndex
229+
actualContentLength = chunkText.length
230+
} else {
231+
// Calculate overlap length in characters
232+
const prevChunk = chunks[index - 1]
233+
const prevWords = prevChunk.split(/\s+/)
234+
const overlapWords = prevWords.slice(-Math.min(this.overlap, prevWords.length))
235+
const overlapLength = Math.min(
236+
chunkText.length,
237+
overlapWords.length > 0 ? overlapWords.join(' ').length + 1 : 0 // +1 for space
238+
)
239+
240+
startIndex = previousEndIndex - overlapLength
241+
actualContentLength = chunkText.length - overlapLength
242+
}
243+
244+
const safeStart = Math.max(0, startIndex)
245+
const endIndexSafe = safeStart + actualContentLength
246+
247+
const chunk: Chunk = {
248+
text: chunkText,
249+
tokenCount: this.estimateTokens(chunkText),
250+
metadata: {
251+
startIndex: safeStart,
252+
endIndex: endIndexSafe,
253+
},
254+
}
255+
256+
previousEndIndex = endIndexSafe
257+
return chunk
258+
})
259+
}
260+
}

apps/sim/lib/documents/document-processor.ts

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
import { RecursiveChunker } from 'chonkie'
2-
import type { RecursiveChunk } from 'chonkie/types'
1+
import { type Chunk, TextChunker } from '@/lib/documents/chunker'
32
import { env } from '@/lib/env'
43
import { isSupportedFileType, parseBuffer, parseFile } from '@/lib/file-parsers'
54
import { createLogger } from '@/lib/logs/console-logger'
@@ -26,7 +25,7 @@ class APIError extends Error {
2625

2726
export interface ProcessedDocument {
2827
content: string
29-
chunks: RecursiveChunk[]
28+
chunks: Chunk[]
3029
metadata: {
3130
filename: string
3231
fileSize: number
@@ -235,40 +234,31 @@ async function parseDocument(
235234
}
236235

237236
/**
238-
* Chunk text content using RecursiveChunker
237+
* Chunk text content using TextChunker
239238
*/
240-
async function chunkContent(
241-
content: string,
242-
options: DocumentProcessingOptions
243-
): Promise<RecursiveChunk[]> {
244-
const chunker = await RecursiveChunker.create({
239+
async function chunkContent(content: string, options: DocumentProcessingOptions): Promise<Chunk[]> {
240+
const chunker = new TextChunker({
245241
chunkSize: options.chunkSize || 512,
246-
minCharactersPerChunk: options.minCharactersPerChunk || 24,
242+
minChunkSize: options.minCharactersPerChunk || 24,
247243
})
248244

249245
try {
250-
logger.info('Chunking content with RecursiveChunker', {
246+
logger.info('Chunking content with TextChunker', {
251247
contentLength: content.length,
252248
chunkSize: options.chunkSize || 512,
253249
})
254250

255251
const chunks = await chunker.chunk(content)
256252

257253
logger.info(`Successfully created ${chunks.length} chunks`)
258-
return chunks as RecursiveChunk[]
254+
return chunks
259255
} catch (error) {
260256
logger.error('Chunking failed:', error)
261257
throw new Error(
262258
`Text chunking failed: ${error instanceof Error ? error.message : 'Unknown error'}`
263259
)
264260
}
265261
}
266-
/**
267-
* Calculate token count estimation (rough approximation: 4 chars per token)
268-
*/
269-
function estimateTokenCount(text: string): number {
270-
return Math.ceil(text.length / 4)
271-
}
272262

273263
/**
274264
* Process a single document: parse content and create chunks
@@ -300,7 +290,7 @@ export async function processDocument(
300290

301291
// Step 3: Calculate metadata
302292
const characterCount = content.length
303-
const tokenCount = estimateTokenCount(content)
293+
const tokenCount = chunks.reduce((acc, chunk) => acc + chunk.tokenCount, 0)
304294
const chunkCount = chunks.length
305295

306296
const processedDocument: ProcessedDocument = {

apps/sim/package.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@
6868
"ai": "^4.3.2",
6969
"better-auth": "^1.2.9",
7070
"browser-image-compression": "^2.0.2",
71-
"chonkie": "^0.2.5",
7271
"class-variance-authority": "^0.7.1",
7372
"clsx": "^2.1.1",
7473
"cmdk": "^1.0.0",

0 commit comments

Comments (0)