diff --git a/apps/docs/content/docs/en/knowledgebase/chunking-strategies.mdx b/apps/docs/content/docs/en/knowledgebase/chunking-strategies.mdx new file mode 100644 index 00000000000..159b636edc3 --- /dev/null +++ b/apps/docs/content/docs/en/knowledgebase/chunking-strategies.mdx @@ -0,0 +1,137 @@ +--- +title: Chunking Strategies +description: How Sim splits documents into searchable chunks, and which strategy to pick for your content +--- + +import { FAQ } from '@/components/ui/faq' + +Sim splits every uploaded document into chunks before generating embeddings. The strategy controls *where* those splits happen. + +## How chunking works + +Every chunker follows a two-phase pattern: + +1. **Split** — break the document at boundaries (paragraphs, sentences, tokens, or a custom regex) +2. **Pack** — merge adjacent splits up to the maximum chunk size + +This is documented in [LangChain's text splitter guide](https://python.langchain.com/docs/concepts/text_splitters/), which states the principle: *"no resulting merged split should exceed the designated chunk size."* LlamaIndex, Chonkie, and Unstructured follow the same convention. + +The packing step is what keeps chunks roughly uniform. It also means a chunk usually spans multiple splits — a precise split boundary is not the same as a chunk boundary. Most "why is my regex not producing one chunk per match" surprises trace back to this. + +## Configuration shared by all strategies + +| Setting | Unit | Default | Range | Description | +|---------|------|---------|-------|-------------| +| Max Chunk Size | tokens | 1,024 | 100–4,000 | Upper bound on chunk size. 1 token ≈ 4 characters. | +| Min Chunk Size | characters | 100 | 100–2,000 | Tiny fragments below this are dropped. | +| Overlap | tokens | 200 | 0–500 | Tokens repeated between adjacent chunks to preserve context. | + +[Pinecone's chunking guide](https://www.pinecone.io/learn/chunking-strategies/) covers the tradeoffs in size and overlap. + +## Strategies + +### Auto + +Sim inspects the file and routes to the right chunker: + +- `.json`, `.jsonl`, `.yaml`, `.yml` → structural chunking (records are never split mid-way; small records may still be batched together up to the chunk size) +- `.csv`, `.xlsx`, `.xls`, `.tsv` → grouped by row, with headers preserved +- Everything else (`.pdf`, `.docx`, `.txt`, `.md`, `.html`, `.pptx`, …) → Text strategy + +Routing is based on detected MIME type and content shape, not just the extension — a `.txt` file containing valid JSON is still routed structurally. + +Pick Auto unless you've confirmed it isn't producing the chunks you want. + +### Text + +Hierarchical splitter that walks down a separator list: horizontal rules → markdown headings → paragraphs (`\n\n`) → lines (`\n`) → sentence punctuation (`. ! ?`) → clause punctuation (`; ,`) → spaces. It tries the largest separator first and falls back when a piece is still too large. + +Same algorithm as LangChain's [`RecursiveCharacterTextSplitter`](https://python.langchain.com/docs/concepts/text_splitters/#text-structured-based), the de facto standard for prose. + +Use it for general prose. + +### Recursive + +Same algorithm as Text, but you supply your own separator hierarchy or pick a built-in recipe (`plain`, `markdown`, `code`). + +The recipe pattern comes from [Chonkie](https://github.com/chonkie-inc/chonkie), which ships pre-built separator sets for common content types. 
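+
+Under the hood, Text and Recursive share one split-then-pack loop. Here is a minimal TypeScript sketch of the idea (an illustration of the technique, not Sim's implementation; sizes are in characters rather than tokens for simplicity):
+
+```ts
+function recursiveChunk(text: string, separators: string[], maxChars: number): string[] {
+  if (text.length <= maxChars) return [text]
+  const [sep, ...rest] = separators
+  if (sep === undefined) {
+    // No separators left: hard-split at the size limit.
+    const pieces: string[] = []
+    for (let i = 0; i < text.length; i += maxChars) pieces.push(text.slice(i, i + maxChars))
+    return pieces
+  }
+  // Split on the largest separator first; recurse into pieces that are still too big.
+  const splits = text.split(sep).flatMap((piece) => recursiveChunk(piece, rest, maxChars))
+  // Pack adjacent splits back together while the result still fits.
+  const packed: string[] = []
+  let current = ''
+  for (const piece of splits) {
+    const joined = current === '' ? piece : current + sep + piece
+    if (joined.length <= maxChars) {
+      current = joined
+    } else {
+      if (current !== '') packed.push(current)
+      current = piece
+    }
+  }
+  if (current !== '') packed.push(current)
+  return packed
+}
+
+// e.g. recursiveChunk(source, ['\nclass ', '\nfunction ', '\n\n', '\n', ' '], 4000)
+```
+
+The packing loop is the part that merges several small splits into a single chunk.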
+
+Use Recursive when your content has structural markers the default Text separators miss — splitting code on `\nclass `, `\nfunction `, then `\n\n`, for example.
+
+### Sentence
+
+Splits on sentence boundaries (`. `, `! `, `? `, with abbreviation handling) and packs whole sentences up to the chunk size. A sentence is never split mid-way unless it individually exceeds the limit.
+
+This is the technique behind [LlamaIndex's `SentenceSplitter`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/), which is the recommended default for prose in their stack.
+
+Use it when sentence integrity matters — Q&A, legal text, or anything where mid-sentence cuts hurt comprehension.
+
+### Token
+
+Fixed-size sliding window aligned to word boundaries. No awareness of paragraphs or sentences.
+
+LlamaIndex ships the same algorithm as `TokenTextSplitter`. Useful when downstream processing requires uniform chunk sizes; otherwise prefer Text or Sentence.
+
+### Regex
+
+Splits on every match of a regex pattern you supply, then packs splits up to the chunk size by default — the same merge behavior as every other chunker. A precise boundary regex like `(?=\n\s*\{\s*"id"\s*:)` will still produce chunks containing multiple matches if those matches are small enough to fit together. This is standard across LangChain, LlamaIndex, Chonkie, and [Unstructured](https://docs.unstructured.io/api-reference/partition/chunking-documents).
+
+Use Regex when your content has explicit delimiters that don't fit any other strategy.
+
+#### Strict boundaries
+
+The regex strategy has an opt-in **"Each match is its own chunk (don't merge)"** checkbox. When enabled:
+
+- Every regex match becomes its own chunk
+- Adjacent splits are not packed together
+- Overlap is disabled
+- Splits that exceed the chunk size are still sub-split at word boundaries
+
+This matches the `join=False` knob in [txtai](https://neuml.github.io/txtai/) and the `split_length=1` pattern in Haystack's `DocumentSplitter`. Most libraries don't expose this directly because they expect users to switch to a structural parser instead — see "One record per chunk" below.
+
+Turn it on when each match is a discrete record (one QA pair, one log entry) and you need each match isolated for retrieval.
+
+## How to choose
+
+Pick **Auto** unless you have a reason not to.
+
+If Auto isn't right:
+
+- Sentence integrity matters → **Sentence**
+- Your content has structural markers Text doesn't know about → **Recursive**
+- You need uniform chunk sizes → **Token**
+- You have explicit delimiters → **Regex**
+- Each record must be its own chunk → see below
+
+## One record per chunk
+
+Putting each record (each QA pair, each log line, each row) into its own chunk is structural chunking, not regex chunking. Two paths:
+
+1. **Convert to JSONL** (one record per line) and upload. Sim's Auto strategy treats it as structured data and never splits a record mid-way. Small records may still be batched together up to the chunk size — to force one record per chunk, lower the max chunk size to roughly the size of one record. See [LlamaIndex's `JSONNodeParser`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/) and [Unstructured's element-based chunking](https://docs.unstructured.io/api-reference/partition/chunking-documents).
+
+2. **Use Regex with strict boundaries enabled** when you can't restructure the source.
+
+Prefer option 1. Structural parsers handle nested records, escaped delimiters, and malformed entries that regex won't.
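+
+As a concrete example of option 1, a minimal Node script that converts a JSON array into JSONL before upload might look like this (file names and record shape are illustrative, not a Sim API):
+
+```ts
+import { readFileSync, writeFileSync } from 'node:fs'
+
+// records.json holds a JSON array; records.jsonl gets one record per line.
+const records: unknown[] = JSON.parse(readFileSync('records.json', 'utf8'))
+writeFileSync('records.jsonl', records.map((r) => JSON.stringify(r)).join('\n'))
+```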
+ +## Further reading + +- [LangChain — Text Splitters](https://python.langchain.com/docs/concepts/text_splitters/) +- [LlamaIndex — Node Parsers](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/) +- [Chonkie](https://github.com/chonkie-inc/chonkie) +- [Unstructured — Chunking](https://docs.unstructured.io/api-reference/partition/chunking-documents) +- [Pinecone — Chunking Strategies](https://www.pinecone.io/learn/chunking-strategies/) + +## FAQ + + diff --git a/apps/docs/content/docs/en/knowledgebase/index.mdx b/apps/docs/content/docs/en/knowledgebase/index.mdx index 6213414ed92..45b109349d9 100644 --- a/apps/docs/content/docs/en/knowledgebase/index.mdx +++ b/apps/docs/content/docs/en/knowledgebase/index.mdx @@ -44,6 +44,8 @@ When creating a knowledge base, you can configure how documents are split into c | **Min Chunk Size** | characters | 100 | 100-2,000 | Minimum chunk size to avoid tiny fragments | | **Overlap** | tokens | 200 | 0-500 | Context overlap between consecutive chunks | +You can also pick a chunking strategy (Auto, Text, Recursive, Sentence, Token, or Regex) to control where splits happen. See [Chunking Strategies](/docs/knowledgebase/chunking-strategies) for a breakdown of when to use each. + - **Hierarchical splitting**: Respects document structure (sections, paragraphs, sentences) ### Editing Capabilities diff --git a/apps/docs/content/docs/en/knowledgebase/meta.json b/apps/docs/content/docs/en/knowledgebase/meta.json index e304c09ce7a..6c42e7be3eb 100644 --- a/apps/docs/content/docs/en/knowledgebase/meta.json +++ b/apps/docs/content/docs/en/knowledgebase/meta.json @@ -1,4 +1,4 @@ { "title": "Knowledge Base", - "pages": ["index", "connectors", "tags"] + "pages": ["index", "chunking-strategies", "connectors", "tags"] } diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx index 4aacb8cf0b1..d1f00495a09 100644 --- a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx +++ b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx @@ -9,6 +9,7 @@ import { useForm } from 'react-hook-form' import { z } from 'zod' import { Button, + Checkbox, Combobox, type ComboboxOption, Input, @@ -75,6 +76,7 @@ const FormSchema = z .max(500, 'Overlap must be less than 500 tokens'), strategy: z.enum(['auto', 'text', 'regex', 'recursive', 'sentence', 'token']).default('auto'), regexPattern: z.string().optional(), + regexStrictBoundaries: z.boolean().default(false), customSeparators: z.string().optional(), }) .refine( @@ -175,6 +177,7 @@ export const CreateBaseModal = memo(function CreateBaseModal({ overlapSize: 200, strategy: 'auto', regexPattern: '', + regexStrictBoundaries: false, customSeparators: '', }, mode: 'onSubmit', @@ -182,6 +185,7 @@ export const CreateBaseModal = memo(function CreateBaseModal({ const nameValue = watch('name') const strategyValue = watch('strategy') + const regexStrictBoundariesValue = watch('regexStrictBoundaries') useEffect(() => { if (open) { @@ -199,6 +203,7 @@ export const CreateBaseModal = memo(function CreateBaseModal({ overlapSize: 200, strategy: 'auto', regexPattern: '', + regexStrictBoundaries: false, customSeparators: '', }) } @@ -304,7 +309,10 @@ export const CreateBaseModal = memo(function CreateBaseModal({ try { const strategyOptions: StrategyOptions | undefined = data.strategy === 'regex' 
&& data.regexPattern - ? { pattern: data.regexPattern } + ? { + pattern: data.regexPattern, + ...(data.regexStrictBoundaries && { strictBoundaries: true }), + } : data.strategy === 'recursive' && data.customSeparators?.trim() ? { separators: data.customSeparators @@ -495,6 +503,28 @@ export const CreateBaseModal = memo(function CreateBaseModal({
                    <p className='text-muted-foreground text-xs'>
                      Text will be split at each match of this regex pattern.
                    </p>
+                   <div className='flex items-center gap-2'>
+                     <Checkbox
+                       id='regex-strict-boundaries'
+                       checked={regexStrictBoundariesValue}
+                       onCheckedChange={(checked) =>
+                         setValue('regexStrictBoundaries', checked === true)
+                       }
+                     />
+                     <label
+                       htmlFor='regex-strict-boundaries'
+                       className='text-muted-foreground text-xs'
+                     >
+                       Each match is its own chunk (don't merge)
+                     </label>
+                   </div>
                  )}
diff --git a/apps/sim/lib/api/contracts/knowledge/base.ts b/apps/sim/lib/api/contracts/knowledge/base.ts
index 1705b11800d..86f0438c260 100644
--- a/apps/sim/lib/api/contracts/knowledge/base.ts
+++ b/apps/sim/lib/api/contracts/knowledge/base.ts
@@ -21,6 +21,7 @@ export const chunkingStrategyOptionsSchema = z
     pattern: z.string().max(500).optional(),
     separators: z.array(z.string()).optional(),
     recipe: z.enum(['plain', 'markdown', 'code']).optional(),
+    strictBoundaries: z.boolean().optional(),
   })
   .strict() satisfies z.ZodType<StrategyOptions>
@@ -44,6 +45,9 @@ export const chunkingConfigSchema = z
       message: 'Regex pattern is required when using the regex chunking strategy',
     }
   )
+  .refine((data) => data.strategy === 'regex' || data.strategyOptions?.strictBoundaries !== true, {
+    message: 'strictBoundaries is only valid for the regex chunking strategy',
+  })
 
 export const createKnowledgeBaseBodySchema = z.object({
   name: z.string().min(1, 'Name is required'),
diff --git a/apps/sim/lib/chunkers/regex-chunker.test.ts b/apps/sim/lib/chunkers/regex-chunker.test.ts
index 5716d45b28a..f4f55112e55 100644
--- a/apps/sim/lib/chunkers/regex-chunker.test.ts
+++ b/apps/sim/lib/chunkers/regex-chunker.test.ts
@@ -183,4 +183,209 @@ describe('RegexChunker', () => {
       expect(() => new RegexChunker({ pattern: '[,;]' })).not.toThrow()
     })
   })
+
+  describe('capturing groups', () => {
+    it.concurrent(
+      'should not include delimiter text as a chunk when pattern has capturing groups',
+      async () => {
+        const chunker = new RegexChunker({
+          pattern: '(---)',
+          chunkSize: 1024,
+          strictBoundaries: true,
+        })
+        const text = 'Section one content.---Section two content.---Section three content.'
+        const chunks = await chunker.chunk(text)
+
+        expect(chunks).toHaveLength(3)
+        expect(chunks[0].text).toBe('Section one content.')
+        expect(chunks[1].text).toBe('Section two content.')
+        expect(chunks[2].text).toBe('Section three content.')
+        for (const chunk of chunks) {
+          expect(chunk.text).not.toBe('---')
+        }
+      }
+    )
+
+    it.concurrent(
+      'should not include delimiter text when pattern uses named capture groups',
+      async () => {
+        const chunker = new RegexChunker({
+          pattern: '(?<delim>---)',
+          chunkSize: 1024,
+          strictBoundaries: true,
+        })
+        const text = 'Section one content.---Section two content.---Section three content.'
+        const chunks = await chunker.chunk(text)
+
+        expect(chunks).toHaveLength(3)
+        expect(chunks[0].text).toBe('Section one content.')
+        expect(chunks[1].text).toBe('Section two content.')
+        expect(chunks[2].text).toBe('Section three content.')
+        for (const chunk of chunks) {
+          expect(chunk.text).not.toBe('---')
+        }
+      }
+    )
+
+    it.concurrent('should preserve lookbehind whose body contains a > character', async () => {
+      const chunker = new RegexChunker({
+        pattern: '(?<=</div>)',
+        chunkSize: 1024,
+        strictBoundaries: true,
+      })
+      const text = '<div>one</div><div>two</div><div>three</div>'
+      const chunks = await chunker.chunk(text)
+
+      expect(chunks).toHaveLength(3)
+      expect(chunks[0].text).toBe('<div>one</div>')
+      expect(chunks[1].text).toBe('<div>two</div>')
+      expect(chunks[2].text).toBe('<div>three</div>')
+    })
+
+    it.concurrent('should leave non-capturing groups and lookarounds intact', async () => {
+      const chunker = new RegexChunker({
+        pattern: '(?=\\n\\s*\\{\\s*"id"\\s*:)',
+        chunkSize: 1024,
+        strictBoundaries: true,
+      })
+      const text = '{"id": 1, "v": "a"}\n{"id": 2, "v": "b"}\n{"id": 3, "v": "c"}'
+      const chunks = await chunker.chunk(text)
+
+      expect(chunks).toHaveLength(3)
+    })
+  })
+
+  describe('strictBoundaries mode', () => {
+    it.concurrent(
+      'should produce one chunk per match without merging small adjacent segments',
+      async () => {
+        const chunker = new RegexChunker({
+          pattern: '\\n\\n',
+          chunkSize: 1024,
+          strictBoundaries: true,
+        })
+        const text = 'Short.\n\nAlso short.\n\nTiny.\n\nSmall too.'
+        const chunks = await chunker.chunk(text)
+
+        expect(chunks).toHaveLength(4)
+        expect(chunks[0].text).toBe('Short.')
+        expect(chunks[1].text).toBe('Also short.')
+        expect(chunks[2].text).toBe('Tiny.')
+        expect(chunks[3].text).toBe('Small too.')
+      }
+    )
+
+    it.concurrent('should produce one chunk per QA record using lookahead pattern', async () => {
+      const chunker = new RegexChunker({
+        pattern: '(?=\\n\\s*\\{\\s*"id"\\s*:)',
+        chunkSize: 1024,
+        strictBoundaries: true,
+      })
+      const text =
+        '{"id": 1, "q": "first?", "a": "yes"}\n{"id": 2, "q": "second?", "a": "no"}\n{"id": 3, "q": "third?", "a": "maybe"}'
+      const chunks = await chunker.chunk(text)
+
+      expect(chunks).toHaveLength(3)
+      expect(chunks[0].text).toContain('"id": 1')
+      expect(chunks[0].text).not.toContain('"id": 2')
+      expect(chunks[1].text).toContain('"id": 2')
+      expect(chunks[1].text).not.toContain('"id": 3')
+      expect(chunks[2].text).toContain('"id": 3')
+    })
+
+    it.concurrent('should not apply overlap even when chunkOverlap is set', async () => {
+      const chunker = new RegexChunker({
+        pattern: '\\n\\n',
+        chunkSize: 100,
+        chunkOverlap: 50,
+        strictBoundaries: true,
+      })
+      const text = 'First section content.\n\nSecond section content.\n\nThird section content.'
+      const chunks = await chunker.chunk(text)
+
+      expect(chunks).toHaveLength(3)
+      expect(chunks[0].text).toBe('First section content.')
+      expect(chunks[1].text).toBe('Second section content.')
+      expect(chunks[2].text).toBe('Third section content.')
+    })
+
+    it.concurrent(
+      'should still split when content fits in single chunk if matches exist',
+      async () => {
+        const chunker = new RegexChunker({
+          pattern: '\\n\\n',
+          chunkSize: 1024,
+          strictBoundaries: true,
+        })
+        const text = 'A.\n\nB.\n\nC.'
+        const chunks = await chunker.chunk(text)
+
+        expect(chunks).toHaveLength(3)
+      }
+    )
+
+    it.concurrent('should sub-chunk a single oversized segment at word boundaries', async () => {
+      const chunker = new RegexChunker({
+        pattern: '---',
+        chunkSize: 10,
+        strictBoundaries: true,
+      })
+      const longSegment =
+        'This is a very long segment with many words that exceeds the chunk size limit significantly.'
+      const text = `${longSegment}---short`
+      const chunks = await chunker.chunk(text)
+
+      expect(chunks.length).toBeGreaterThan(2)
+      expect(chunks[chunks.length - 1].text).toBe('short')
+    })
+
+    it.concurrent('should return single chunk when regex finds no matches', async () => {
+      const chunker = new RegexChunker({
+        pattern: '###NOMATCH###',
+        chunkSize: 1024,
+        strictBoundaries: true,
+      })
+      const text = 'Plain text with no delimiter at all.'
+      const chunks = await chunker.chunk(text)
+
+      expect(chunks).toHaveLength(1)
+      expect(chunks[0].text).toBe(text)
+    })
+
+    it.concurrent('should return empty array for empty input', async () => {
+      const chunker = new RegexChunker({
+        pattern: '\\n\\n',
+        strictBoundaries: true,
+      })
+      const chunks = await chunker.chunk('')
+      expect(chunks).toEqual([])
+    })
+
+    it.concurrent(
+      'should default to merging behavior when strictBoundaries is omitted',
+      async () => {
+        const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 100 })
+        const text = 'Short.\n\nAlso short.\n\nTiny.\n\nSmall too.'
+        const chunks = await chunker.chunk(text)
+        expect(chunks).toHaveLength(1)
+      }
+    )
+
+    it.concurrent('should produce non-overlapping startIndex/endIndex metadata', async () => {
+      const chunker = new RegexChunker({
+        pattern: '\\n\\n',
+        chunkSize: 1024,
+        chunkOverlap: 50,
+        strictBoundaries: true,
+      })
+      const text = 'First.\n\nSecond.\n\nThird.'
+      const chunks = await chunker.chunk(text)
+
+      for (let i = 1; i < chunks.length; i++) {
+        expect(chunks[i].metadata.startIndex).toBeGreaterThanOrEqual(
+          chunks[i - 1].metadata.endIndex
+        )
+      }
+    })
+  })
 })
diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts
index 4276287c627..0cafa47b0d3 100644
--- a/apps/sim/lib/chunkers/regex-chunker.ts
+++ b/apps/sim/lib/chunkers/regex-chunker.ts
@@ -15,16 +15,56 @@ const logger = createLogger('RegexChunker')
 
 const MAX_PATTERN_LENGTH = 500
 
+const NAMED_GROUP_PREFIX = /^\(\?<(?![=!])[^>]+>/
+
+/**
+ * Converts unescaped capturing groups `(...)` and named capturing groups
+ * `(?<name>...)` into non-capturing groups `(?:...)`. `String.prototype.split()`
+ * interleaves captured text (named or otherwise) into the result array, which
+ * would surface delimiter text as spurious chunks. Lookarounds (`(?=`, `(?!`,
+ * `(?<=`, `(?<!`) are left intact.
+ */
+function toNonCapturingGroups(source: string): string {
+  let result = ''
+  for (let i = 0; i < source.length; i++) {
+    const char = source[i]
+    if (char === '\\') {
+      // Escaped characters (e.g. `\(`) pass through verbatim.
+      result += char + (source[i + 1] ?? '')
+      i++
+      continue
+    }
+    if (char === '(') {
+      const named = source.slice(i).match(NAMED_GROUP_PREFIX)
+      if (named) {
+        // `(?<name>` becomes `(?:`.
+        result += '(?:'
+        i += named[0].length - 1
+        continue
+      }
+      if (source[i + 1] !== '?') {
+        // Plain `(` becomes `(?:`; `(?:` and lookarounds are kept as-is.
+        result += '(?:'
+        continue
+      }
+    }
+    result += char
+  }
+  return result
+}
     const segments = cleaned
       .split(this.pattern)
       .filter((s) => s.trim().length > 0)
     if (segments.length <= 1) {
+      if (this.strictBoundaries) {
+        logger.info('Regex pattern produced no splits in strict mode, returning single chunk')
+        return buildChunks([cleaned.trim()], 0)
+      }
       logger.warn(
         'Regex pattern did not produce any splits, falling back to word-boundary splitting'
       )
@@ -95,6 +139,12 @@
       return buildChunks(chunks, this.chunkOverlap)
     }
 
+    if (this.strictBoundaries) {
+      const chunks = this.expandOversizedSegments(segments)
+      logger.info(`Chunked into ${chunks.length} strict-boundary regex chunks`)
+      return buildChunks(chunks, 0)
+    }
+
     const merged = this.mergeSegments(segments)
 
     let chunks = merged
@@ -107,6 +157,32 @@
     return buildChunks(chunks, this.chunkOverlap)
   }
 
+  /**
+   * In strict-boundary mode each segment becomes its own chunk. Segments that
+   * exceed chunkSize are still split at word boundaries to preserve the token
+   * limit invariant; this is a safety floor, not a merge.
+ */ + private expandOversizedSegments(segments: string[]): string[] { + const result: string[] = [] + const chunkSizeChars = tokensToChars(this.chunkSize) + + for (const segment of segments) { + const trimmed = segment.trim() + if (!trimmed) continue + + if (estimateTokens(trimmed) <= this.chunkSize) { + result.push(trimmed) + } else { + const subChunks = splitAtWordBoundaries(trimmed, chunkSizeChars) + for (const sub of subChunks) { + if (sub.trim()) result.push(sub) + } + } + } + + return result + } + private mergeSegments(segments: string[]): string[] { const chunks: string[] = [] let current = '' diff --git a/apps/sim/lib/chunkers/types.ts b/apps/sim/lib/chunkers/types.ts index 692e84d12fc..ef38a85b808 100644 --- a/apps/sim/lib/chunkers/types.ts +++ b/apps/sim/lib/chunkers/types.ts @@ -54,6 +54,7 @@ export interface StrategyOptions { pattern?: string separators?: string[] recipe?: RecursiveRecipe + strictBoundaries?: boolean } export interface SentenceChunkerOptions extends ChunkerOptions { @@ -67,4 +68,11 @@ export interface RecursiveChunkerOptions extends ChunkerOptions { export interface RegexChunkerOptions extends ChunkerOptions { pattern: string + /** + * When true, each regex match becomes its own chunk and small adjacent + * segments are not merged together. Overlap is also disabled. Useful for + * structural inputs where boundaries (e.g. one record per match) must be + * preserved exactly. + */ + strictBoundaries?: boolean } diff --git a/apps/sim/lib/knowledge/documents/document-processor.ts b/apps/sim/lib/knowledge/documents/document-processor.ts index 249108205a9..6f3a7d9e7b6 100644 --- a/apps/sim/lib/knowledge/documents/document-processor.ts +++ b/apps/sim/lib/knowledge/documents/document-processor.ts @@ -154,6 +154,7 @@ async function applyStrategy( const chunker = new RegexChunker({ ...baseOptions, pattern: strategyOptions.pattern, + strictBoundaries: strategyOptions.strictBoundaries, }) return chunker.chunk(content) } diff --git a/bun.lock b/bun.lock index f70b8c4e926..b1882bc4eb6 100644 --- a/bun.lock +++ b/bun.lock @@ -1,5 +1,6 @@ { "lockfileVersion": 1, + "configVersion": 0, "workspaces": { "": { "name": "simstudio",