|
/**
 * Positional and size information for a piece of chunked text.
 *
 * NOTE(review): nothing in the visible file produces or consumes this
 * interface (`TextChunker` returns `Chunk`, which carries its own inline
 * metadata shape), so the field descriptions below are inferred from the
 * names only - confirm against the real callers.
 */
export interface ChunkMetadata {
  /** Presumably the offset at which the chunk starts in its source text. */
  startIndex: number
  /** Presumably the offset at which the chunk ends in its source text. */
  endIndex: number
  /** Presumably the (estimated) number of tokens in the chunk. */
  tokenCount: number
}
| 6 | + |
/**
 * A piece of text paired with a `ChunkMetadata` record.
 *
 * NOTE(review): not referenced by `TextChunker` in the visible file, which
 * returns the separate `Chunk` shape instead - confirm whether both are needed.
 */
export interface TextChunk {
  /** The chunk's text content. */
  text: string
  /** Position / size information for `text`. */
  metadata: ChunkMetadata
}
| 11 | + |
/**
 * Options accepted by the `TextChunker` constructor.
 */
export interface ChunkerOptions {
  /**
   * Upper bound for a chunk, in *estimated tokens*. Defaults to 512.
   * Must be greater than zero.
   */
  chunkSize?: number
  /**
   * Lower bound for a chunk, in *characters* (not tokens). Pieces shorter
   * than this are dropped. Defaults to 50.
   */
  minChunkSize?: number
  /**
   * Number of trailing words of the previous chunk that are prepended to
   * each chunk. Values below 1 disable overlap. Defaults to 0.
   */
  overlap?: number
}

/**
 * One chunk produced by `TextChunker.chunk`.
 */
export interface Chunk {
  /** Chunk text (including any overlap words prepended from the previous chunk). */
  text: string
  /** Estimated token count of `text`. */
  tokenCount: number
  metadata: {
    /**
     * Start of the chunk (inclusive) as a character offset into the
     * *normalized* text, i.e. the input after line-ending and whitespace
     * normalization, not the raw input. With overlap enabled this is where
     * the first overlap word starts.
     */
    startIndex: number
    /** End of the chunk (exclusive), as an offset into the normalized text. */
    endIndex: number
  }
}

/** A piece of text together with the half-open range it occupies in the normalized text. */
interface ChunkSpan {
  text: string
  start: number
  end: number
}

/**
 * Lightweight text chunker optimized for RAG applications.
 * Uses hierarchical splitting with a heuristic token estimate.
 *
 * Things worth knowing when reading the output:
 * - A separator that ends up on a chunk boundary belongs to neither chunk
 *   (e.g. the ". " between two sentences that land in different chunks).
 * - Pieces shorter than `minChunkSize` characters are dropped.
 * - Overlap words are added after splitting, so an overlapped chunk can
 *   exceed `chunkSize` estimated tokens.
 */
export class TextChunker {
  private readonly chunkSize: number
  private readonly minChunkSize: number
  private readonly overlap: number

  // Hierarchical separators ordered from largest to smallest semantic units.
  // Note: cleanText() collapses 3+ newlines to 2 before splitting, so the
  // first entry cannot match text that went through chunk().
  private readonly separators: readonly string[] = [
    '\n\n\n', // Document sections
    '\n---\n', // Markdown horizontal rules
    '\n***\n', // Markdown horizontal rules (alternative)
    '\n___\n', // Markdown horizontal rules (alternative)
    '\n# ', // Markdown H1 headings
    '\n## ', // Markdown H2 headings
    '\n### ', // Markdown H3 headings
    '\n#### ', // Markdown H4 headings
    '\n##### ', // Markdown H5 headings
    '\n###### ', // Markdown H6 headings
    '\n\n', // Paragraphs
    '\n', // Lines
    '. ', // Sentences
    '! ', // Exclamations
    '? ', // Questions
    '; ', // Semicolons
    ', ', // Commas
    ' ', // Words
  ]

  /**
   * @param options - Chunking limits; see `ChunkerOptions` for defaults.
   * @throws RangeError if `chunkSize` is not greater than zero (a
   *   non-positive size can never be satisfied and previously made the
   *   forced character split loop forever).
   */
  constructor(options: ChunkerOptions = {}) {
    const chunkSize = options.chunkSize ?? 512
    // `!(x > 0)` also rejects NaN.
    if (!(chunkSize > 0)) {
      throw new RangeError(`chunkSize must be greater than 0, got ${chunkSize}`)
    }

    this.chunkSize = chunkSize
    this.minChunkSize = options.minChunkSize ?? 50
    this.overlap = options.overlap ?? 0
  }

  /**
   * Estimate the token count of `text` from the length of each
   * whitespace-separated word. This is a heuristic, not a real tokenizer.
   */
  private estimateTokens(text: string): number {
    // Handle empty or whitespace-only text
    if (!text?.trim()) return 0

    let tokenCount = 0
    for (const word of text.trim().split(/\s+/)) {
      if (word.length === 0) continue

      if (word.length <= 4) {
        // Short words (1-4 chars) count as 1 token
        tokenCount += 1
      } else if (word.length <= 8) {
        // Medium words (5-8 chars) count as 1-2 tokens
        tokenCount += Math.ceil(word.length / 5)
      } else {
        // Long words are assumed to be split further by subword tokenization
        tokenCount += Math.ceil(word.length / 4)
      }
    }

    return tokenCount
  }

  /**
   * Strip surrounding whitespace from a span and move its offsets inwards by
   * the amount stripped, so the offsets keep pointing at the remaining text.
   */
  private trimSpan(span: ChunkSpan): ChunkSpan {
    const text = span.text.trim()
    if (!text) {
      return { text, start: span.start, end: span.start }
    }
    const leading = span.text.length - span.text.trimStart().length
    const trailing = span.text.length - span.text.trimEnd().length
    return { text, start: span.start + leading, end: span.end - trailing }
  }

  /**
   * Turn an accumulated span into zero or one output chunks: it is kept only
   * if its untrimmed length reaches `minChunkSize`, and is emitted trimmed.
   */
  private finalize(span: ChunkSpan | null): ChunkSpan[] {
    if (!span || !(span.text.length >= this.minChunkSize)) return []
    const trimmed = this.trimSpan(span)
    return trimmed.text ? [trimmed] : []
  }

  /**
   * Last-resort split for text that contains none of the separators: cut it
   * into fixed-size character windows sized so each window is roughly
   * `chunkSize` estimated tokens.
   */
  private forceSplit(text: string, offset: number, tokenCount: number): ChunkSpan[] {
    // Clamp to 1 so the loop always advances.
    const step = Math.max(1, Math.ceil((text.length * this.chunkSize) / tokenCount))
    const chunks: ChunkSpan[] = []

    let from = 0
    while (from < text.length) {
      let to = Math.min(text.length, from + step)

      // Do not cut between the two halves of a UTF-16 surrogate pair.
      if (to < text.length) {
        const before = text.charCodeAt(to - 1)
        const after = text.charCodeAt(to)
        const isHigh = before >= 0xd800 && before <= 0xdbff
        const isLow = after >= 0xdc00 && after <= 0xdfff
        if (isHigh && isLow) to += 1
      }

      const piece = this.trimSpan({
        text: text.slice(from, to),
        start: offset + from,
        end: offset + to,
      })
      if (piece.text && piece.text.length >= this.minChunkSize) {
        chunks.push(piece)
      }
      from = to
    }

    return chunks
  }

  /**
   * Split `text` recursively using the hierarchical separators, greedily
   * merging neighbouring parts while they fit in `chunkSize`.
   *
   * @param text - Text to split.
   * @param offset - Offset of `text` within the normalized document; used to
   *   report absolute positions.
   * @param separatorIndex - Index of the separator to try first.
   * @returns Chunks in document order with their source ranges.
   */
  private splitRecursively(text: string, offset = 0, separatorIndex = 0): ChunkSpan[] {
    const tokenCount = this.estimateTokens(text)

    // Small enough: keep it whole (or drop it if below the minimum length).
    if (tokenCount <= this.chunkSize) {
      return text.length >= this.minChunkSize
        ? [{ text, start: offset, end: offset + text.length }]
        : []
    }

    // Out of separators: fall back to fixed-size character windows.
    if (separatorIndex >= this.separators.length) {
      return this.forceSplit(text, offset, tokenCount)
    }

    const separator = this.separators[separatorIndex]

    // Split while remembering where each piece sits in the source text.
    // Whitespace-only pieces are discarded.
    const parts: ChunkSpan[] = []
    let position = 0
    for (const piece of text.split(separator)) {
      if (piece.trim()) {
        parts.push({
          text: piece,
          start: offset + position,
          end: offset + position + piece.length,
        })
      }
      position += piece.length + separator.length
    }

    // No useful split at this level: try the next, finer separator.
    if (parts.length <= 1) {
      return this.splitRecursively(text, offset, separatorIndex + 1)
    }

    const chunks: ChunkSpan[] = []
    let current: ChunkSpan | null = null

    for (const part of parts) {
      const candidate: string = current ? current.text + separator + part.text : part.text

      if (this.estimateTokens(candidate) <= this.chunkSize) {
        // Still fits: extend the running chunk up to the end of this part.
        current = {
          text: candidate,
          start: current ? current.start : part.start,
          end: part.end,
        }
        continue
      }

      // Does not fit: emit what we have, then start over from this part.
      chunks.push(...this.finalize(current))

      if (this.estimateTokens(part.text) > this.chunkSize) {
        // The part alone is too large: split it with finer separators.
        chunks.push(...this.splitRecursively(part.text, part.start, separatorIndex + 1))
        current = null
      } else {
        current = part
      }
    }

    chunks.push(...this.finalize(current))
    return chunks
  }

  /**
   * Prepend the last `overlap` words of each chunk's predecessor to it.
   *
   * The words are taken from the predecessor as it was *before* overlap was
   * applied. The returned span starts where the first overlap word starts in
   * the source and ends where the chunk's own content ends.
   */
  private addOverlap(spans: ChunkSpan[]): ChunkSpan[] {
    const wordCount = Math.floor(this.overlap)
    if (!(wordCount > 0) || spans.length <= 1) {
      return spans
    }

    return spans.map((span, i) => {
      if (i === 0) return span

      const prev = spans[i - 1]
      const words = [...prev.text.matchAll(/\S+/g)]
      const tail = words.slice(-Math.min(wordCount, words.length))
      if (tail.length === 0) return span

      // Measure from the end of the previous chunk: the overlap sits at its
      // tail, so this stays accurate even if whitespace-only parts were
      // discarded earlier in that chunk.
      const distanceFromEnd = prev.text.length - (tail[0].index ?? 0)
      const start = Math.max(prev.start, prev.end - distanceFromEnd)
      const prefix = tail.map((match) => match[0]).join(' ')

      return { text: `${prefix} ${span.text}`, start, end: span.end }
    })
  }

  /**
   * Normalize line endings and whitespace. All offsets reported by
   * `chunk()` refer to the string returned here.
   */
  private cleanText(text: string): string {
    return text
      .replace(/\r\n/g, '\n') // Normalize Windows line endings
      .replace(/\r/g, '\n') // Normalize old Mac line endings
      .replace(/\n{3,}/g, '\n\n') // Limit consecutive newlines
      .replace(/\t/g, ' ') // Convert tabs to spaces
      .replace(/ {2,}/g, ' ') // Collapse multiple spaces
      .trim()
  }

  /**
   * Split `text` into chunks of at most `chunkSize` estimated tokens
   * (before overlap is added).
   *
   * @param text - Raw input text.
   * @returns Chunks in document order; empty for empty or whitespace-only
   *   input. `metadata.startIndex` / `metadata.endIndex` are offsets into the
   *   normalized text (see `cleanText`), forming a half-open range.
   */
  async chunk(text: string): Promise<Chunk[]> {
    if (!text?.trim()) {
      return []
    }

    const cleanedText = this.cleanText(text)
    const spans = this.addOverlap(this.splitRecursively(cleanedText))

    return spans.map((span) => ({
      text: span.text,
      tokenCount: this.estimateTokens(span.text),
      metadata: {
        startIndex: span.start,
        endIndex: span.end,
      },
    }))
  }
}
0 commit comments