From 19e04fb6046a150ff2c3dba6bd1fa29268628d5f Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 30 Apr 2026 17:32:01 -0700 Subject: [PATCH 1/4] feat(knowledge): add chunking strategies and regex strict boundaries - Add Token, Sentence, Recursive, and Regex chunkers with strategy selection in create-base modal - Add opt-in strict boundaries mode for regex chunker so each match becomes its own chunk - Add chunking strategies docs page with industry references --- .../en/knowledgebase/chunking-strategies.mdx | 137 ++++++++++++++++++ .../content/docs/en/knowledgebase/index.mdx | 2 + .../content/docs/en/knowledgebase/meta.json | 2 +- .../create-base-modal/create-base-modal.tsx | 32 +++- apps/sim/lib/api/contracts/knowledge/base.ts | 1 + apps/sim/lib/chunkers/regex-chunker.test.ts | 134 +++++++++++++++++ apps/sim/lib/chunkers/regex-chunker.ts | 40 ++++- apps/sim/lib/chunkers/types.ts | 8 + .../knowledge/documents/document-processor.ts | 1 + bun.lock | 1 + 10 files changed, 355 insertions(+), 3 deletions(-) create mode 100644 apps/docs/content/docs/en/knowledgebase/chunking-strategies.mdx diff --git a/apps/docs/content/docs/en/knowledgebase/chunking-strategies.mdx b/apps/docs/content/docs/en/knowledgebase/chunking-strategies.mdx new file mode 100644 index 00000000000..159b636edc3 --- /dev/null +++ b/apps/docs/content/docs/en/knowledgebase/chunking-strategies.mdx @@ -0,0 +1,137 @@ +--- +title: Chunking Strategies +description: How Sim splits documents into searchable chunks, and which strategy to pick for your content +--- + +import { FAQ } from '@/components/ui/faq' + +Sim splits every uploaded document into chunks before generating embeddings. The strategy controls *where* those splits happen. + +## How chunking works + +Every chunker follows a two-phase pattern: + +1. **Split** — break the document at boundaries (paragraphs, sentences, tokens, or a custom regex) +2. 
**Pack** — merge adjacent splits up to the maximum chunk size + +This is documented in [LangChain's text splitter guide](https://python.langchain.com/docs/concepts/text_splitters/), which states the principle: *"no resulting merged split should exceed the designated chunk size."* LlamaIndex, Chonkie, and Unstructured follow the same convention. + +The packing step is what keeps chunks roughly uniform. It also means a chunk usually spans multiple splits — a precise split boundary is not the same as a chunk boundary. Most "why is my regex not producing one chunk per match" surprises trace back to this. + +## Configuration shared by all strategies + +| Setting | Unit | Default | Range | Description | +|---------|------|---------|-------|-------------| +| Max Chunk Size | tokens | 1,024 | 100–4,000 | Upper bound on chunk size. 1 token ≈ 4 characters. | +| Min Chunk Size | characters | 100 | 100–2,000 | Tiny fragments below this are dropped. | +| Overlap | tokens | 200 | 0–500 | Tokens repeated between adjacent chunks to preserve context. | + +[Pinecone's chunking guide](https://www.pinecone.io/learn/chunking-strategies/) covers the tradeoffs in size and overlap. + +## Strategies + +### Auto + +Sim inspects the file and routes to the right chunker: + +- `.json`, `.jsonl`, `.yaml`, `.yml` → structural chunking (records are never split mid-way; small records may still be batched together up to the chunk size) +- `.csv`, `.xlsx`, `.xls`, `.tsv` → grouped by row, with headers preserved +- Everything else (`.pdf`, `.docx`, `.txt`, `.md`, `.html`, `.pptx`, …) → Text strategy + +Routing is based on detected MIME type and content shape, not just the extension — a `.txt` file containing valid JSON is still routed structurally. + +Pick Auto unless you've confirmed it isn't producing the chunks you want. + +### Text + +Hierarchical splitter that walks down a separator list: horizontal rules → markdown headings → paragraphs (`\n\n`) → lines (`\n`) → sentence punctuation (`. ! 
?`) → clause punctuation (`; ,`) → spaces. It tries the largest separator first and falls back when a piece is still too large. + +Same algorithm as LangChain's [`RecursiveCharacterTextSplitter`](https://python.langchain.com/docs/concepts/text_splitters/#text-structured-based), the de facto standard for prose. + +Use it for general prose. + +### Recursive + +Same algorithm as Text, but you supply your own separator hierarchy or pick a built-in recipe (`plain`, `markdown`, `code`). + +The recipe pattern comes from [Chonkie](https://github.com/chonkie-inc/chonkie), which ships pre-built separator sets for common content types. + +Use Recursive when your content has structural markers the default Text separators miss — splitting code on `\nclass `, `\nfunction `, then `\n\n`, for example. + +### Sentence + +Splits on sentence boundaries (`. `, `! `, `? `, with abbreviation handling) and packs whole sentences up to the chunk size. A sentence is never split mid-way unless it individually exceeds the limit. + +This is the technique behind [LlamaIndex's `SentenceSplitter`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/), which is the recommended default for prose in their stack. + +Use it when sentence integrity matters — Q&A, legal text, or anything where mid-sentence cuts hurt comprehension. + +### Token + +Fixed-size sliding window aligned to word boundaries. No awareness of paragraphs or sentences. + +LlamaIndex provides the same as `TokenTextSplitter`. Useful when downstream processing requires uniform chunk sizes; otherwise prefer Text or Sentence. + +### Regex + +Splits on every match of a regex pattern you supply, then packs splits up to the chunk size by default — the same merge behavior as every other chunker. A precise boundary regex like `(?=\n\s*\{\s*"id"\s*:)` will still produce chunks containing multiple matches if those matches are small enough to fit together. 
This is standard across LangChain, LlamaIndex, Chonkie, and [Unstructured](https://docs.unstructured.io/api-reference/partition/chunking-documents). + +Use Regex when your content has explicit delimiters that don't fit any other strategy. + +#### Strict boundaries + +The regex strategy has an opt-in **"Each match is its own chunk (don't merge)"** checkbox. When enabled: + +- Every regex match becomes its own chunk +- Adjacent splits are not packed together +- Overlap is disabled +- Splits that exceed the chunk size are still sub-split at word boundaries + +This matches the `join=False` knob in [txtai](https://neuml.github.io/txtai/) and the `split_length=1` pattern in Haystack's `DocumentSplitter`. Most libraries don't expose this directly because they expect users to switch to a structural parser instead — see "One record per chunk" below. + +Turn it on when each match is a discrete record (one QA pair, one log entry) and you need each isolated for retrieval. + +## How to choose + +Pick **Auto** unless you have a reason not to. + +If Auto isn't right: + +- Sentence integrity matters → **Sentence** +- Your content has structural markers Text doesn't know about → **Recursive** +- You need uniform chunk sizes → **Token** +- You have explicit delimiters → **Regex** +- Each record must be its own chunk → see below + +## One record per chunk + +Each record (each QA pair, each log line, each row) as its own chunk is structural chunking, not regex chunking. Two paths: + +1. **Convert to JSONL** (one record per line) and upload. Sim's Auto strategy treats it as structured data and never splits a record mid-way. Small records may still be batched together up to the chunk size — to force one record per chunk, lower the max chunk size to roughly the size of one record. 
See [LlamaIndex's `JSONNodeParser`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/) and [Unstructured's element-based chunking](https://docs.unstructured.io/api-reference/partition/chunking-documents). + +2. **Use Regex with strict boundaries enabled** when you can't restructure the source. + +Prefer option 1. Structural parsers handle nested records, escaped delimiters, and malformed entries that regex won't. + +## Further reading + +- [LangChain — Text Splitters](https://python.langchain.com/docs/concepts/text_splitters/) +- [LlamaIndex — Node Parsers](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/) +- [Chonkie](https://github.com/chonkie-inc/chonkie) +- [Unstructured — Chunking](https://docs.unstructured.io/api-reference/partition/chunking-documents) +- [Pinecone — Chunking Strategies](https://www.pinecone.io/learn/chunking-strategies/) + +## FAQ + + diff --git a/apps/docs/content/docs/en/knowledgebase/index.mdx b/apps/docs/content/docs/en/knowledgebase/index.mdx index 6213414ed92..45b109349d9 100644 --- a/apps/docs/content/docs/en/knowledgebase/index.mdx +++ b/apps/docs/content/docs/en/knowledgebase/index.mdx @@ -44,6 +44,8 @@ When creating a knowledge base, you can configure how documents are split into c | **Min Chunk Size** | characters | 100 | 100-2,000 | Minimum chunk size to avoid tiny fragments | | **Overlap** | tokens | 200 | 0-500 | Context overlap between consecutive chunks | +You can also pick a chunking strategy (Auto, Text, Recursive, Sentence, Token, or Regex) to control where splits happen. See [Chunking Strategies](/docs/knowledgebase/chunking-strategies) for a breakdown of when to use each. 
+ - **Hierarchical splitting**: Respects document structure (sections, paragraphs, sentences) ### Editing Capabilities diff --git a/apps/docs/content/docs/en/knowledgebase/meta.json b/apps/docs/content/docs/en/knowledgebase/meta.json index e304c09ce7a..6c42e7be3eb 100644 --- a/apps/docs/content/docs/en/knowledgebase/meta.json +++ b/apps/docs/content/docs/en/knowledgebase/meta.json @@ -1,4 +1,4 @@ { "title": "Knowledge Base", - "pages": ["index", "connectors", "tags"] + "pages": ["index", "chunking-strategies", "connectors", "tags"] } diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx index 4aacb8cf0b1..d1f00495a09 100644 --- a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx +++ b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx @@ -9,6 +9,7 @@ import { useForm } from 'react-hook-form' import { z } from 'zod' import { Button, + Checkbox, Combobox, type ComboboxOption, Input, @@ -75,6 +76,7 @@ const FormSchema = z .max(500, 'Overlap must be less than 500 tokens'), strategy: z.enum(['auto', 'text', 'regex', 'recursive', 'sentence', 'token']).default('auto'), regexPattern: z.string().optional(), + regexStrictBoundaries: z.boolean().default(false), customSeparators: z.string().optional(), }) .refine( @@ -175,6 +177,7 @@ export const CreateBaseModal = memo(function CreateBaseModal({ overlapSize: 200, strategy: 'auto', regexPattern: '', + regexStrictBoundaries: false, customSeparators: '', }, mode: 'onSubmit', @@ -182,6 +185,7 @@ export const CreateBaseModal = memo(function CreateBaseModal({ const nameValue = watch('name') const strategyValue = watch('strategy') + const regexStrictBoundariesValue = watch('regexStrictBoundaries') useEffect(() => { if (open) { @@ -199,6 +203,7 @@ export const CreateBaseModal = 
memo(function CreateBaseModal({ overlapSize: 200, strategy: 'auto', regexPattern: '', + regexStrictBoundaries: false, customSeparators: '', }) } @@ -304,7 +309,10 @@ export const CreateBaseModal = memo(function CreateBaseModal({ try { const strategyOptions: StrategyOptions | undefined = data.strategy === 'regex' && data.regexPattern - ? { pattern: data.regexPattern } + ? { + pattern: data.regexPattern, + ...(data.regexStrictBoundaries && { strictBoundaries: true }), + } : data.strategy === 'recursive' && data.customSeparators?.trim() ? { separators: data.customSeparators @@ -495,6 +503,28 @@ export const CreateBaseModal = memo(function CreateBaseModal({

Text will be split at each match of this regex pattern.

+ )} diff --git a/apps/sim/lib/api/contracts/knowledge/base.ts b/apps/sim/lib/api/contracts/knowledge/base.ts index 1705b11800d..c9b29dc0698 100644 --- a/apps/sim/lib/api/contracts/knowledge/base.ts +++ b/apps/sim/lib/api/contracts/knowledge/base.ts @@ -21,6 +21,7 @@ export const chunkingStrategyOptionsSchema = z pattern: z.string().max(500).optional(), separators: z.array(z.string()).optional(), recipe: z.enum(['plain', 'markdown', 'code']).optional(), + strictBoundaries: z.boolean().optional(), }) .strict() satisfies z.ZodType diff --git a/apps/sim/lib/chunkers/regex-chunker.test.ts b/apps/sim/lib/chunkers/regex-chunker.test.ts index 5716d45b28a..278339fcd19 100644 --- a/apps/sim/lib/chunkers/regex-chunker.test.ts +++ b/apps/sim/lib/chunkers/regex-chunker.test.ts @@ -183,4 +183,138 @@ describe('RegexChunker', () => { expect(() => new RegexChunker({ pattern: '[,;]' })).not.toThrow() }) }) + + describe('strictBoundaries mode', () => { + it.concurrent( + 'should produce one chunk per match without merging small adjacent segments', + async () => { + const chunker = new RegexChunker({ + pattern: '\\n\\n', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = 'Short.\n\nAlso short.\n\nTiny.\n\nSmall too.' 
+ const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(4) + expect(chunks[0].text).toBe('Short.') + expect(chunks[1].text).toBe('Also short.') + expect(chunks[2].text).toBe('Tiny.') + expect(chunks[3].text).toBe('Small too.') + } + ) + + it.concurrent('should produce one chunk per QA record using lookahead pattern', async () => { + const chunker = new RegexChunker({ + pattern: '(?=\\n\\s*\\{\\s*"id"\\s*:)', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = + '{"id": 1, "q": "first?", "a": "yes"}\n{"id": 2, "q": "second?", "a": "no"}\n{"id": 3, "q": "third?", "a": "maybe"}' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(3) + expect(chunks[0].text).toContain('"id": 1') + expect(chunks[0].text).not.toContain('"id": 2') + expect(chunks[1].text).toContain('"id": 2') + expect(chunks[1].text).not.toContain('"id": 3') + expect(chunks[2].text).toContain('"id": 3') + }) + + it.concurrent('should not apply overlap even when chunkOverlap is set', async () => { + const chunker = new RegexChunker({ + pattern: '\\n\\n', + chunkSize: 100, + chunkOverlap: 50, + strictBoundaries: true, + }) + const text = 'First section content.\n\nSecond section content.\n\nThird section content.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(3) + expect(chunks[0].text).toBe('First section content.') + expect(chunks[1].text).toBe('Second section content.') + expect(chunks[2].text).toBe('Third section content.') + }) + + it.concurrent( + 'should still split when content fits in single chunk if matches exist', + async () => { + const chunker = new RegexChunker({ + pattern: '\\n\\n', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = 'A.\n\nB.\n\nC.' 
+ const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(3) + } + ) + + it.concurrent('should sub-chunk a single oversized segment at word boundaries', async () => { + const chunker = new RegexChunker({ + pattern: '---', + chunkSize: 10, + strictBoundaries: true, + }) + const longSegment = + 'This is a very long segment with many words that exceeds the chunk size limit significantly.' + const text = `${longSegment}---short` + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(2) + expect(chunks[chunks.length - 1].text).toBe('short') + }) + + it.concurrent('should return single chunk when regex finds no matches', async () => { + const chunker = new RegexChunker({ + pattern: '###NOMATCH###', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = 'Plain text with no delimiter at all.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + expect(chunks[0].text).toBe(text) + }) + + it.concurrent('should return empty array for empty input', async () => { + const chunker = new RegexChunker({ + pattern: '\\n\\n', + strictBoundaries: true, + }) + const chunks = await chunker.chunk('') + expect(chunks).toEqual([]) + }) + + it.concurrent( + 'should default to merging behavior when strictBoundaries is omitted', + async () => { + const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 100 }) + const text = 'Short.\n\nAlso short.\n\nTiny.\n\nSmall too.' + const chunks = await chunker.chunk(text) + expect(chunks).toHaveLength(1) + } + ) + + it.concurrent('should produce non-overlapping startIndex/endIndex metadata', async () => { + const chunker = new RegexChunker({ + pattern: '\\n\\n', + chunkSize: 1024, + chunkOverlap: 50, + strictBoundaries: true, + }) + const text = 'First.\n\nSecond.\n\nThird.' 
+ const chunks = await chunker.chunk(text) + + for (let i = 1; i < chunks.length; i++) { + expect(chunks[i].metadata.startIndex).toBeGreaterThanOrEqual( + chunks[i - 1].metadata.endIndex + ) + } + }) + }) }) diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts index 4276287c627..0de118b8de5 100644 --- a/apps/sim/lib/chunkers/regex-chunker.ts +++ b/apps/sim/lib/chunkers/regex-chunker.ts @@ -19,12 +19,14 @@ export class RegexChunker { private readonly chunkSize: number private readonly chunkOverlap: number private readonly regex: RegExp + private readonly strictBoundaries: boolean constructor(options: RegexChunkerOptions) { const resolved = resolveChunkerOptions(options) this.chunkSize = resolved.chunkSize this.chunkOverlap = resolved.chunkOverlap this.regex = this.compilePattern(options.pattern) + this.strictBoundaries = options.strictBoundaries ?? false } private compilePattern(pattern: string): RegExp { @@ -74,7 +76,7 @@ export class RegexChunker { const cleaned = cleanText(content) - if (estimateTokens(cleaned) <= this.chunkSize) { + if (!this.strictBoundaries && estimateTokens(cleaned) <= this.chunkSize) { logger.info('Content fits in single chunk') return buildChunks([cleaned], 0) } @@ -83,6 +85,10 @@ export class RegexChunker { const segments = cleaned.split(this.regex).filter((s) => s.trim().length > 0) if (segments.length <= 1) { + if (this.strictBoundaries) { + logger.info('Regex pattern produced no splits in strict mode, returning single chunk') + return buildChunks([cleaned.trim()], 0) + } logger.warn( 'Regex pattern did not produce any splits, falling back to word-boundary splitting' ) @@ -95,6 +101,12 @@ export class RegexChunker { return buildChunks(chunks, this.chunkOverlap) } + if (this.strictBoundaries) { + const chunks = this.expandOversizedSegments(segments) + logger.info(`Chunked into ${chunks.length} strict-boundary regex chunks`) + return buildChunks(chunks, 0) + } + const merged = 
this.mergeSegments(segments) let chunks = merged @@ -107,6 +119,32 @@ export class RegexChunker { return buildChunks(chunks, this.chunkOverlap) } + /** + * In strict-boundary mode each segment becomes its own chunk. Segments that + * exceed chunkSize are still split at word boundaries to preserve the token + * limit invariant; this is a safety floor, not a merge. + */ + private expandOversizedSegments(segments: string[]): string[] { + const result: string[] = [] + const chunkSizeChars = tokensToChars(this.chunkSize) + + for (const segment of segments) { + const trimmed = segment.trim() + if (!trimmed) continue + + if (estimateTokens(trimmed) <= this.chunkSize) { + result.push(trimmed) + } else { + const subChunks = splitAtWordBoundaries(trimmed, chunkSizeChars) + for (const sub of subChunks) { + if (sub.trim()) result.push(sub) + } + } + } + + return result + } + private mergeSegments(segments: string[]): string[] { const chunks: string[] = [] let current = '' diff --git a/apps/sim/lib/chunkers/types.ts b/apps/sim/lib/chunkers/types.ts index 692e84d12fc..ef38a85b808 100644 --- a/apps/sim/lib/chunkers/types.ts +++ b/apps/sim/lib/chunkers/types.ts @@ -54,6 +54,7 @@ export interface StrategyOptions { pattern?: string separators?: string[] recipe?: RecursiveRecipe + strictBoundaries?: boolean } export interface SentenceChunkerOptions extends ChunkerOptions { @@ -67,4 +68,11 @@ export interface RecursiveChunkerOptions extends ChunkerOptions { export interface RegexChunkerOptions extends ChunkerOptions { pattern: string + /** + * When true, each regex match becomes its own chunk and small adjacent + * segments are not merged together. Overlap is also disabled. Useful for + * structural inputs where boundaries (e.g. one record per match) must be + * preserved exactly. 
+ */ + strictBoundaries?: boolean } diff --git a/apps/sim/lib/knowledge/documents/document-processor.ts b/apps/sim/lib/knowledge/documents/document-processor.ts index 249108205a9..6f3a7d9e7b6 100644 --- a/apps/sim/lib/knowledge/documents/document-processor.ts +++ b/apps/sim/lib/knowledge/documents/document-processor.ts @@ -154,6 +154,7 @@ async function applyStrategy( const chunker = new RegexChunker({ ...baseOptions, pattern: strategyOptions.pattern, + strictBoundaries: strategyOptions.strictBoundaries, }) return chunker.chunk(content) } diff --git a/bun.lock b/bun.lock index f70b8c4e926..b1882bc4eb6 100644 --- a/bun.lock +++ b/bun.lock @@ -1,5 +1,6 @@ { "lockfileVersion": 1, + "configVersion": 0, "workspaces": { "": { "name": "simstudio", From daaadb0f917f01ba1b6c3cff19fa59cf48ab7478 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 30 Apr 2026 17:47:03 -0700 Subject: [PATCH 2/4] fix(chunkers): strip capturing groups and validate strictBoundaries scope - Convert capturing groups to non-capturing in regex chunker so split() doesn't surface delimiter text as spurious chunks - Reject strictBoundaries in chunkingConfigSchema when strategy is not regex --- apps/sim/lib/api/contracts/knowledge/base.ts | 3 ++ apps/sim/lib/chunkers/regex-chunker.test.ts | 35 ++++++++++++++++++++ apps/sim/lib/chunkers/regex-chunker.ts | 29 +++++++++++++++- 3 files changed, 66 insertions(+), 1 deletion(-) diff --git a/apps/sim/lib/api/contracts/knowledge/base.ts b/apps/sim/lib/api/contracts/knowledge/base.ts index c9b29dc0698..86f0438c260 100644 --- a/apps/sim/lib/api/contracts/knowledge/base.ts +++ b/apps/sim/lib/api/contracts/knowledge/base.ts @@ -45,6 +45,9 @@ export const chunkingConfigSchema = z message: 'Regex pattern is required when using the regex chunking strategy', } ) + .refine((data) => data.strategy === 'regex' || data.strategyOptions?.strictBoundaries !== true, { + message: 'strictBoundaries is only valid for the regex chunking strategy', + }) export const 
createKnowledgeBaseBodySchema = z.object({ name: z.string().min(1, 'Name is required'), diff --git a/apps/sim/lib/chunkers/regex-chunker.test.ts b/apps/sim/lib/chunkers/regex-chunker.test.ts index 278339fcd19..e6c8122cafb 100644 --- a/apps/sim/lib/chunkers/regex-chunker.test.ts +++ b/apps/sim/lib/chunkers/regex-chunker.test.ts @@ -184,6 +184,41 @@ describe('RegexChunker', () => { }) }) + describe('capturing groups', () => { + it.concurrent( + 'should not include delimiter text as a chunk when pattern has capturing groups', + async () => { + const chunker = new RegexChunker({ + pattern: '(---)', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = 'Section one content.---Section two content.---Section three content.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(3) + expect(chunks[0].text).toBe('Section one content.') + expect(chunks[1].text).toBe('Section two content.') + expect(chunks[2].text).toBe('Section three content.') + for (const chunk of chunks) { + expect(chunk.text).not.toBe('---') + } + } + ) + + it.concurrent('should leave non-capturing groups and lookarounds intact', async () => { + const chunker = new RegexChunker({ + pattern: '(?=\\n\\s*\\{\\s*"id"\\s*:)', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = '{"id": 1, "v": "a"}\n{"id": 2, "v": "b"}\n{"id": 3, "v": "c"}' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(3) + }) + }) + describe('strictBoundaries mode', () => { it.concurrent( 'should produce one chunk per match without merging small adjacent segments', diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts index 0de118b8de5..e7253ac03ec 100644 --- a/apps/sim/lib/chunkers/regex-chunker.ts +++ b/apps/sim/lib/chunkers/regex-chunker.ts @@ -15,6 +15,33 @@ const logger = createLogger('RegexChunker') const MAX_PATTERN_LENGTH = 500 +/** + * Converts unescaped capturing groups `(...)` into non-capturing groups `(?:...)`. 
+ * `String.prototype.split()` interleaves captured groups into the result array, + * which would surface delimiter text as spurious chunks. Lookarounds, named + * groups, and other `(?...)` constructs are left untouched. + */ +function toNonCapturing(pattern: string): string { + let result = '' + let inClass = false + for (let i = 0; i < pattern.length; i++) { + const c = pattern[i] + if (c === '\\' && i + 1 < pattern.length) { + result += c + pattern[i + 1] + i++ + continue + } + if (c === '[') inClass = true + else if (c === ']') inClass = false + if (!inClass && c === '(' && pattern[i + 1] !== '?') { + result += '(?:' + continue + } + result += c + } + return result +} + export class RegexChunker { private readonly chunkSize: number private readonly chunkOverlap: number @@ -39,7 +66,7 @@ export class RegexChunker { } try { - const regex = new RegExp(pattern, 'g') + const regex = new RegExp(toNonCapturing(pattern), 'g') const testStrings = [ 'a'.repeat(10000), From 236b94823497db4d64c27b073b78770b5fbde55c Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 30 Apr 2026 17:55:16 -0700 Subject: [PATCH 3/4] fix(chunkers): also strip named capture groups in regex patterns Named groups (?...) are still capturing groups so split() interleaves their matched text. Convert them to non-capturing alongside plain ( groups. 
--- apps/sim/lib/chunkers/regex-chunker.test.ts | 21 +++++++++++++++++ apps/sim/lib/chunkers/regex-chunker.ts | 25 +++++++++++++++------ 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/apps/sim/lib/chunkers/regex-chunker.test.ts b/apps/sim/lib/chunkers/regex-chunker.test.ts index e6c8122cafb..db9fdab5d4b 100644 --- a/apps/sim/lib/chunkers/regex-chunker.test.ts +++ b/apps/sim/lib/chunkers/regex-chunker.test.ts @@ -206,6 +206,27 @@ describe('RegexChunker', () => { } ) + it.concurrent( + 'should not include delimiter text when pattern uses named capture groups', + async () => { + const chunker = new RegexChunker({ + pattern: '(?---)', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = 'Section one content.---Section two content.---Section three content.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(3) + expect(chunks[0].text).toBe('Section one content.') + expect(chunks[1].text).toBe('Section two content.') + expect(chunks[2].text).toBe('Section three content.') + for (const chunk of chunks) { + expect(chunk.text).not.toBe('---') + } + } + ) + it.concurrent('should leave non-capturing groups and lookarounds intact', async () => { const chunker = new RegexChunker({ pattern: '(?=\\n\\s*\\{\\s*"id"\\s*:)', diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts index e7253ac03ec..ab8d41e5648 100644 --- a/apps/sim/lib/chunkers/regex-chunker.ts +++ b/apps/sim/lib/chunkers/regex-chunker.ts @@ -15,11 +15,14 @@ const logger = createLogger('RegexChunker') const MAX_PATTERN_LENGTH = 500 +const NAMED_GROUP_PREFIX = /^\(\?<[^>]+>/ + /** - * Converts unescaped capturing groups `(...)` into non-capturing groups `(?:...)`. - * `String.prototype.split()` interleaves captured groups into the result array, - * which would surface delimiter text as spurious chunks. Lookarounds, named - * groups, and other `(?...)` constructs are left untouched. 
+ * Converts unescaped capturing groups `(...)` and named capturing groups + * `(?...)` into non-capturing groups `(?:...)`. `String.prototype.split()` + * interleaves captured text (named or otherwise) into the result array, which + * would surface delimiter text as spurious chunks. Lookarounds (`(?=`, `(?!`, + * `(?<=`, `(? Date: Thu, 30 Apr 2026 18:05:19 -0700 Subject: [PATCH 4/4] fix(chunkers): exclude lookbehind from named-group rewrite Tighten NAMED_GROUP_PREFIX with negative lookahead so patterns like (?<=) are not misidentified as named capture groups. --- apps/sim/lib/chunkers/regex-chunker.test.ts | 15 +++++++++++++++ apps/sim/lib/chunkers/regex-chunker.ts | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/apps/sim/lib/chunkers/regex-chunker.test.ts b/apps/sim/lib/chunkers/regex-chunker.test.ts index db9fdab5d4b..f4f55112e55 100644 --- a/apps/sim/lib/chunkers/regex-chunker.test.ts +++ b/apps/sim/lib/chunkers/regex-chunker.test.ts @@ -227,6 +227,21 @@ describe('RegexChunker', () => { } ) + it.concurrent('should preserve lookbehind whose body contains a > character', async () => { + const chunker = new RegexChunker({ + pattern: '(?<=)', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = '
<p>one</p><p>two</p><p>three</p>' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(3) + expect(chunks[0].text).toBe('<p>one</p>') + expect(chunks[1].text).toBe('<p>two</p>') + expect(chunks[2].text).toBe('<p>three</p>') + }) + it.concurrent('should leave non-capturing groups and lookarounds intact', async () => { const chunker = new RegexChunker({ pattern: '(?=\\n\\s*\\{\\s*"id"\\s*:)', diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts index ab8d41e5648..0cafa47b0d3 100644 --- a/apps/sim/lib/chunkers/regex-chunker.ts +++ b/apps/sim/lib/chunkers/regex-chunker.ts @@ -15,7 +15,7 @@ const logger = createLogger('RegexChunker') const MAX_PATTERN_LENGTH = 500 -const NAMED_GROUP_PREFIX = /^\(\?<[^>]+>/ +const NAMED_GROUP_PREFIX = /^\(\?<(?![=!])[^>]+>/ /** * Converts unescaped capturing groups `(...)` and named capturing groups