From 19e04fb6046a150ff2c3dba6bd1fa29268628d5f Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 30 Apr 2026 17:32:01 -0700 Subject: [PATCH 1/4] feat(knowledge): add chunking strategies and regex strict boundaries - Add Token, Sentence, Recursive, and Regex chunkers with strategy selection in create-base modal - Add opt-in strict boundaries mode for regex chunker so each match becomes its own chunk - Add chunking strategies docs page with industry references --- .../en/knowledgebase/chunking-strategies.mdx | 137 ++++++++++++++++++ .../content/docs/en/knowledgebase/index.mdx | 2 + .../content/docs/en/knowledgebase/meta.json | 2 +- .../create-base-modal/create-base-modal.tsx | 32 +++- apps/sim/lib/api/contracts/knowledge/base.ts | 1 + apps/sim/lib/chunkers/regex-chunker.test.ts | 134 +++++++++++++++++ apps/sim/lib/chunkers/regex-chunker.ts | 40 ++++- apps/sim/lib/chunkers/types.ts | 8 + .../knowledge/documents/document-processor.ts | 1 + bun.lock | 1 + 10 files changed, 355 insertions(+), 3 deletions(-) create mode 100644 apps/docs/content/docs/en/knowledgebase/chunking-strategies.mdx diff --git a/apps/docs/content/docs/en/knowledgebase/chunking-strategies.mdx b/apps/docs/content/docs/en/knowledgebase/chunking-strategies.mdx new file mode 100644 index 00000000000..159b636edc3 --- /dev/null +++ b/apps/docs/content/docs/en/knowledgebase/chunking-strategies.mdx @@ -0,0 +1,137 @@ +--- +title: Chunking Strategies +description: How Sim splits documents into searchable chunks, and which strategy to pick for your content +--- + +import { FAQ } from '@/components/ui/faq' + +Sim splits every uploaded document into chunks before generating embeddings. The strategy controls *where* those splits happen. + +## How chunking works + +Every chunker follows a two-phase pattern: + +1. **Split** — break the document at boundaries (paragraphs, sentences, tokens, or a custom regex) +2. 
**Pack** — merge adjacent splits up to the maximum chunk size + +This is documented in [LangChain's text splitter guide](https://python.langchain.com/docs/concepts/text_splitters/), which states the principle: *"no resulting merged split should exceed the designated chunk size."* LlamaIndex, Chonkie, and Unstructured follow the same convention. + +The packing step is what keeps chunks roughly uniform. It also means a chunk usually spans multiple splits — a precise split boundary is not the same as a chunk boundary. Most "why is my regex not producing one chunk per match" surprises trace back to this. + +## Configuration shared by all strategies + +| Setting | Unit | Default | Range | Description | +|---------|------|---------|-------|-------------| +| Max Chunk Size | tokens | 1,024 | 100–4,000 | Upper bound on chunk size. 1 token ≈ 4 characters. | +| Min Chunk Size | characters | 100 | 100–2,000 | Tiny fragments below this are dropped. | +| Overlap | tokens | 200 | 0–500 | Tokens repeated between adjacent chunks to preserve context. | + +[Pinecone's chunking guide](https://www.pinecone.io/learn/chunking-strategies/) covers the tradeoffs in size and overlap. + +## Strategies + +### Auto + +Sim inspects the file and routes to the right chunker: + +- `.json`, `.jsonl`, `.yaml`, `.yml` → structural chunking (records are never split mid-way; small records may still be batched together up to the chunk size) +- `.csv`, `.xlsx`, `.xls`, `.tsv` → grouped by row, with headers preserved +- Everything else (`.pdf`, `.docx`, `.txt`, `.md`, `.html`, `.pptx`, …) → Text strategy + +Routing is based on detected MIME type and content shape, not just the extension — a `.txt` file containing valid JSON is still routed structurally. + +Pick Auto unless you've confirmed it isn't producing the chunks you want. + +### Text + +Hierarchical splitter that walks down a separator list: horizontal rules → markdown headings → paragraphs (`\n\n`) → lines (`\n`) → sentence punctuation (`. ! 
?`) → clause punctuation (`; ,`) → spaces. It tries the largest separator first and falls back when a piece is still too large. + +Same algorithm as LangChain's [`RecursiveCharacterTextSplitter`](https://python.langchain.com/docs/concepts/text_splitters/#text-structured-based), the de facto standard for prose. + +Use it for general prose. + +### Recursive + +Same algorithm as Text, but you supply your own separator hierarchy or pick a built-in recipe (`plain`, `markdown`, `code`). + +The recipe pattern comes from [Chonkie](https://github.com/chonkie-inc/chonkie), which ships pre-built separator sets for common content types. + +Use Recursive when your content has structural markers the default Text separators miss — splitting code on `\nclass `, `\nfunction `, then `\n\n`, for example. + +### Sentence + +Splits on sentence boundaries (`. `, `! `, `? `, with abbreviation handling) and packs whole sentences up to the chunk size. A sentence is never split mid-way unless it individually exceeds the limit. + +This is the technique behind [LlamaIndex's `SentenceSplitter`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/), which is the recommended default for prose in their stack. + +Use it when sentence integrity matters — Q&A, legal text, or anything where mid-sentence cuts hurt comprehension. + +### Token + +Fixed-size sliding window aligned to word boundaries. No awareness of paragraphs or sentences. + +LlamaIndex provides the same as `TokenTextSplitter`. Useful when downstream processing requires uniform chunk sizes; otherwise prefer Text or Sentence. + +### Regex + +Splits on every match of a regex pattern you supply, then packs splits up to the chunk size by default — the same merge behavior as every other chunker. A precise boundary regex like `(?=\n\s*\{\s*"id"\s*:)` will still produce chunks containing multiple matches if those matches are small enough to fit together. 
This is standard across LangChain, LlamaIndex, Chonkie, and [Unstructured](https://docs.unstructured.io/api-reference/partition/chunking-documents). + +Use Regex when your content has explicit delimiters that don't fit any other strategy. + +#### Strict boundaries + +The regex strategy has an opt-in **"Each match is its own chunk (don't merge)"** checkbox. When enabled: + +- Every regex match becomes its own chunk +- Adjacent splits are not packed together +- Overlap is disabled +- Splits that exceed the chunk size are still sub-split at word boundaries + +This matches the `join=False` knob in [txtai](https://neuml.github.io/txtai/) and the `split_length=1` pattern in Haystack's `DocumentSplitter`. Most libraries don't expose this directly because they expect users to switch to a structural parser instead — see "One record per chunk" below. + +Turn it on when each match is a discrete record (one QA pair, one log entry) and you need each isolated for retrieval. + +## How to choose + +Pick **Auto** unless you have a reason not to. + +If Auto isn't right: + +- Sentence integrity matters → **Sentence** +- Your content has structural markers Text doesn't know about → **Recursive** +- You need uniform chunk sizes → **Token** +- You have explicit delimiters → **Regex** +- Each record must be its own chunk → see below + +## One record per chunk + +Each record (each QA pair, each log line, each row) as its own chunk is structural chunking, not regex chunking. Two paths: + +1. **Convert to JSONL** (one record per line) and upload. Sim's Auto strategy treats it as structured data and never splits a record mid-way. Small records may still be batched together up to the chunk size — to force one record per chunk, lower the max chunk size to roughly the size of one record. 
See [LlamaIndex's `JSONNodeParser`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/) and [Unstructured's element-based chunking](https://docs.unstructured.io/api-reference/partition/chunking-documents). + +2. **Use Regex with strict boundaries enabled** when you can't restructure the source. + +Prefer option 1. Structural parsers handle nested records, escaped delimiters, and malformed entries that regex won't. + +## Further reading + +- [LangChain — Text Splitters](https://python.langchain.com/docs/concepts/text_splitters/) +- [LlamaIndex — Node Parsers](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/) +- [Chonkie](https://github.com/chonkie-inc/chonkie) +- [Unstructured — Chunking](https://docs.unstructured.io/api-reference/partition/chunking-documents) +- [Pinecone — Chunking Strategies](https://www.pinecone.io/learn/chunking-strategies/) + +## FAQ + + diff --git a/apps/docs/content/docs/en/knowledgebase/index.mdx b/apps/docs/content/docs/en/knowledgebase/index.mdx index 6213414ed92..45b109349d9 100644 --- a/apps/docs/content/docs/en/knowledgebase/index.mdx +++ b/apps/docs/content/docs/en/knowledgebase/index.mdx @@ -44,6 +44,8 @@ When creating a knowledge base, you can configure how documents are split into c | **Min Chunk Size** | characters | 100 | 100-2,000 | Minimum chunk size to avoid tiny fragments | | **Overlap** | tokens | 200 | 0-500 | Context overlap between consecutive chunks | +You can also pick a chunking strategy (Auto, Text, Recursive, Sentence, Token, or Regex) to control where splits happen. See [Chunking Strategies](/docs/knowledgebase/chunking-strategies) for a breakdown of when to use each. 
+ - **Hierarchical splitting**: Respects document structure (sections, paragraphs, sentences) ### Editing Capabilities diff --git a/apps/docs/content/docs/en/knowledgebase/meta.json b/apps/docs/content/docs/en/knowledgebase/meta.json index e304c09ce7a..6c42e7be3eb 100644 --- a/apps/docs/content/docs/en/knowledgebase/meta.json +++ b/apps/docs/content/docs/en/knowledgebase/meta.json @@ -1,4 +1,4 @@ { "title": "Knowledge Base", - "pages": ["index", "connectors", "tags"] + "pages": ["index", "chunking-strategies", "connectors", "tags"] } diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx index 4aacb8cf0b1..d1f00495a09 100644 --- a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx +++ b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx @@ -9,6 +9,7 @@ import { useForm } from 'react-hook-form' import { z } from 'zod' import { Button, + Checkbox, Combobox, type ComboboxOption, Input, @@ -75,6 +76,7 @@ const FormSchema = z .max(500, 'Overlap must be less than 500 tokens'), strategy: z.enum(['auto', 'text', 'regex', 'recursive', 'sentence', 'token']).default('auto'), regexPattern: z.string().optional(), + regexStrictBoundaries: z.boolean().default(false), customSeparators: z.string().optional(), }) .refine( @@ -175,6 +177,7 @@ export const CreateBaseModal = memo(function CreateBaseModal({ overlapSize: 200, strategy: 'auto', regexPattern: '', + regexStrictBoundaries: false, customSeparators: '', }, mode: 'onSubmit', @@ -182,6 +185,7 @@ export const CreateBaseModal = memo(function CreateBaseModal({ const nameValue = watch('name') const strategyValue = watch('strategy') + const regexStrictBoundariesValue = watch('regexStrictBoundaries') useEffect(() => { if (open) { @@ -199,6 +203,7 @@ export const CreateBaseModal = 
memo(function CreateBaseModal({ overlapSize: 200, strategy: 'auto', regexPattern: '', + regexStrictBoundaries: false, customSeparators: '', }) } @@ -304,7 +309,10 @@ export const CreateBaseModal = memo(function CreateBaseModal({ try { const strategyOptions: StrategyOptions | undefined = data.strategy === 'regex' && data.regexPattern - ? { pattern: data.regexPattern } + ? { + pattern: data.regexPattern, + ...(data.regexStrictBoundaries && { strictBoundaries: true }), + } : data.strategy === 'recursive' && data.customSeparators?.trim() ? { separators: data.customSeparators @@ -495,6 +503,28 @@ export const CreateBaseModal = memo(function CreateBaseModal({

Text will be split at each match of this regex pattern.

+ )} diff --git a/apps/sim/lib/api/contracts/knowledge/base.ts b/apps/sim/lib/api/contracts/knowledge/base.ts index 1705b11800d..c9b29dc0698 100644 --- a/apps/sim/lib/api/contracts/knowledge/base.ts +++ b/apps/sim/lib/api/contracts/knowledge/base.ts @@ -21,6 +21,7 @@ export const chunkingStrategyOptionsSchema = z pattern: z.string().max(500).optional(), separators: z.array(z.string()).optional(), recipe: z.enum(['plain', 'markdown', 'code']).optional(), + strictBoundaries: z.boolean().optional(), }) .strict() satisfies z.ZodType diff --git a/apps/sim/lib/chunkers/regex-chunker.test.ts b/apps/sim/lib/chunkers/regex-chunker.test.ts index 5716d45b28a..278339fcd19 100644 --- a/apps/sim/lib/chunkers/regex-chunker.test.ts +++ b/apps/sim/lib/chunkers/regex-chunker.test.ts @@ -183,4 +183,138 @@ describe('RegexChunker', () => { expect(() => new RegexChunker({ pattern: '[,;]' })).not.toThrow() }) }) + + describe('strictBoundaries mode', () => { + it.concurrent( + 'should produce one chunk per match without merging small adjacent segments', + async () => { + const chunker = new RegexChunker({ + pattern: '\\n\\n', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = 'Short.\n\nAlso short.\n\nTiny.\n\nSmall too.' 
+ const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(4) + expect(chunks[0].text).toBe('Short.') + expect(chunks[1].text).toBe('Also short.') + expect(chunks[2].text).toBe('Tiny.') + expect(chunks[3].text).toBe('Small too.') + } + ) + + it.concurrent('should produce one chunk per QA record using lookahead pattern', async () => { + const chunker = new RegexChunker({ + pattern: '(?=\\n\\s*\\{\\s*"id"\\s*:)', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = + '{"id": 1, "q": "first?", "a": "yes"}\n{"id": 2, "q": "second?", "a": "no"}\n{"id": 3, "q": "third?", "a": "maybe"}' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(3) + expect(chunks[0].text).toContain('"id": 1') + expect(chunks[0].text).not.toContain('"id": 2') + expect(chunks[1].text).toContain('"id": 2') + expect(chunks[1].text).not.toContain('"id": 3') + expect(chunks[2].text).toContain('"id": 3') + }) + + it.concurrent('should not apply overlap even when chunkOverlap is set', async () => { + const chunker = new RegexChunker({ + pattern: '\\n\\n', + chunkSize: 100, + chunkOverlap: 50, + strictBoundaries: true, + }) + const text = 'First section content.\n\nSecond section content.\n\nThird section content.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(3) + expect(chunks[0].text).toBe('First section content.') + expect(chunks[1].text).toBe('Second section content.') + expect(chunks[2].text).toBe('Third section content.') + }) + + it.concurrent( + 'should still split when content fits in single chunk if matches exist', + async () => { + const chunker = new RegexChunker({ + pattern: '\\n\\n', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = 'A.\n\nB.\n\nC.' 
+ const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(3) + } + ) + + it.concurrent('should sub-chunk a single oversized segment at word boundaries', async () => { + const chunker = new RegexChunker({ + pattern: '---', + chunkSize: 10, + strictBoundaries: true, + }) + const longSegment = + 'This is a very long segment with many words that exceeds the chunk size limit significantly.' + const text = `${longSegment}---short` + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(2) + expect(chunks[chunks.length - 1].text).toBe('short') + }) + + it.concurrent('should return single chunk when regex finds no matches', async () => { + const chunker = new RegexChunker({ + pattern: '###NOMATCH###', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = 'Plain text with no delimiter at all.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + expect(chunks[0].text).toBe(text) + }) + + it.concurrent('should return empty array for empty input', async () => { + const chunker = new RegexChunker({ + pattern: '\\n\\n', + strictBoundaries: true, + }) + const chunks = await chunker.chunk('') + expect(chunks).toEqual([]) + }) + + it.concurrent( + 'should default to merging behavior when strictBoundaries is omitted', + async () => { + const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 100 }) + const text = 'Short.\n\nAlso short.\n\nTiny.\n\nSmall too.' + const chunks = await chunker.chunk(text) + expect(chunks).toHaveLength(1) + } + ) + + it.concurrent('should produce non-overlapping startIndex/endIndex metadata', async () => { + const chunker = new RegexChunker({ + pattern: '\\n\\n', + chunkSize: 1024, + chunkOverlap: 50, + strictBoundaries: true, + }) + const text = 'First.\n\nSecond.\n\nThird.' 
+ const chunks = await chunker.chunk(text) + + for (let i = 1; i < chunks.length; i++) { + expect(chunks[i].metadata.startIndex).toBeGreaterThanOrEqual( + chunks[i - 1].metadata.endIndex + ) + } + }) + }) }) diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts index 4276287c627..0de118b8de5 100644 --- a/apps/sim/lib/chunkers/regex-chunker.ts +++ b/apps/sim/lib/chunkers/regex-chunker.ts @@ -19,12 +19,14 @@ export class RegexChunker { private readonly chunkSize: number private readonly chunkOverlap: number private readonly regex: RegExp + private readonly strictBoundaries: boolean constructor(options: RegexChunkerOptions) { const resolved = resolveChunkerOptions(options) this.chunkSize = resolved.chunkSize this.chunkOverlap = resolved.chunkOverlap this.regex = this.compilePattern(options.pattern) + this.strictBoundaries = options.strictBoundaries ?? false } private compilePattern(pattern: string): RegExp { @@ -74,7 +76,7 @@ export class RegexChunker { const cleaned = cleanText(content) - if (estimateTokens(cleaned) <= this.chunkSize) { + if (!this.strictBoundaries && estimateTokens(cleaned) <= this.chunkSize) { logger.info('Content fits in single chunk') return buildChunks([cleaned], 0) } @@ -83,6 +85,10 @@ export class RegexChunker { const segments = cleaned.split(this.regex).filter((s) => s.trim().length > 0) if (segments.length <= 1) { + if (this.strictBoundaries) { + logger.info('Regex pattern produced no splits in strict mode, returning single chunk') + return buildChunks([cleaned.trim()], 0) + } logger.warn( 'Regex pattern did not produce any splits, falling back to word-boundary splitting' ) @@ -95,6 +101,12 @@ export class RegexChunker { return buildChunks(chunks, this.chunkOverlap) } + if (this.strictBoundaries) { + const chunks = this.expandOversizedSegments(segments) + logger.info(`Chunked into ${chunks.length} strict-boundary regex chunks`) + return buildChunks(chunks, 0) + } + const merged = 
this.mergeSegments(segments) let chunks = merged @@ -107,6 +119,32 @@ export class RegexChunker { return buildChunks(chunks, this.chunkOverlap) } + /** + * In strict-boundary mode each segment becomes its own chunk. Segments that + * exceed chunkSize are still split at word boundaries to preserve the token + * limit invariant; this is a safety floor, not a merge. + */ + private expandOversizedSegments(segments: string[]): string[] { + const result: string[] = [] + const chunkSizeChars = tokensToChars(this.chunkSize) + + for (const segment of segments) { + const trimmed = segment.trim() + if (!trimmed) continue + + if (estimateTokens(trimmed) <= this.chunkSize) { + result.push(trimmed) + } else { + const subChunks = splitAtWordBoundaries(trimmed, chunkSizeChars) + for (const sub of subChunks) { + if (sub.trim()) result.push(sub) + } + } + } + + return result + } + private mergeSegments(segments: string[]): string[] { const chunks: string[] = [] let current = '' diff --git a/apps/sim/lib/chunkers/types.ts b/apps/sim/lib/chunkers/types.ts index 692e84d12fc..ef38a85b808 100644 --- a/apps/sim/lib/chunkers/types.ts +++ b/apps/sim/lib/chunkers/types.ts @@ -54,6 +54,7 @@ export interface StrategyOptions { pattern?: string separators?: string[] recipe?: RecursiveRecipe + strictBoundaries?: boolean } export interface SentenceChunkerOptions extends ChunkerOptions { @@ -67,4 +68,11 @@ export interface RecursiveChunkerOptions extends ChunkerOptions { export interface RegexChunkerOptions extends ChunkerOptions { pattern: string + /** + * When true, each regex match becomes its own chunk and small adjacent + * segments are not merged together. Overlap is also disabled. Useful for + * structural inputs where boundaries (e.g. one record per match) must be + * preserved exactly. 
+ */ + strictBoundaries?: boolean } diff --git a/apps/sim/lib/knowledge/documents/document-processor.ts b/apps/sim/lib/knowledge/documents/document-processor.ts index 249108205a9..6f3a7d9e7b6 100644 --- a/apps/sim/lib/knowledge/documents/document-processor.ts +++ b/apps/sim/lib/knowledge/documents/document-processor.ts @@ -154,6 +154,7 @@ async function applyStrategy( const chunker = new RegexChunker({ ...baseOptions, pattern: strategyOptions.pattern, + strictBoundaries: strategyOptions.strictBoundaries, }) return chunker.chunk(content) } diff --git a/bun.lock b/bun.lock index f70b8c4e926..b1882bc4eb6 100644 --- a/bun.lock +++ b/bun.lock @@ -1,5 +1,6 @@ { "lockfileVersion": 1, + "configVersion": 0, "workspaces": { "": { "name": "simstudio", From daaadb0f917f01ba1b6c3cff19fa59cf48ab7478 Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 30 Apr 2026 17:47:03 -0700 Subject: [PATCH 2/4] fix(chunkers): strip capturing groups and validate strictBoundaries scope - Convert capturing groups to non-capturing in regex chunker so split() doesn't surface delimiter text as spurious chunks - Reject strictBoundaries in chunkingConfigSchema when strategy is not regex --- apps/sim/lib/api/contracts/knowledge/base.ts | 3 ++ apps/sim/lib/chunkers/regex-chunker.test.ts | 35 ++++++++++++++++++++ apps/sim/lib/chunkers/regex-chunker.ts | 29 +++++++++++++++- 3 files changed, 66 insertions(+), 1 deletion(-) diff --git a/apps/sim/lib/api/contracts/knowledge/base.ts b/apps/sim/lib/api/contracts/knowledge/base.ts index c9b29dc0698..86f0438c260 100644 --- a/apps/sim/lib/api/contracts/knowledge/base.ts +++ b/apps/sim/lib/api/contracts/knowledge/base.ts @@ -45,6 +45,9 @@ export const chunkingConfigSchema = z message: 'Regex pattern is required when using the regex chunking strategy', } ) + .refine((data) => data.strategy === 'regex' || data.strategyOptions?.strictBoundaries !== true, { + message: 'strictBoundaries is only valid for the regex chunking strategy', + }) export const 
createKnowledgeBaseBodySchema = z.object({ name: z.string().min(1, 'Name is required'), diff --git a/apps/sim/lib/chunkers/regex-chunker.test.ts b/apps/sim/lib/chunkers/regex-chunker.test.ts index 278339fcd19..e6c8122cafb 100644 --- a/apps/sim/lib/chunkers/regex-chunker.test.ts +++ b/apps/sim/lib/chunkers/regex-chunker.test.ts @@ -184,6 +184,41 @@ describe('RegexChunker', () => { }) }) + describe('capturing groups', () => { + it.concurrent( + 'should not include delimiter text as a chunk when pattern has capturing groups', + async () => { + const chunker = new RegexChunker({ + pattern: '(---)', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = 'Section one content.---Section two content.---Section three content.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(3) + expect(chunks[0].text).toBe('Section one content.') + expect(chunks[1].text).toBe('Section two content.') + expect(chunks[2].text).toBe('Section three content.') + for (const chunk of chunks) { + expect(chunk.text).not.toBe('---') + } + } + ) + + it.concurrent('should leave non-capturing groups and lookarounds intact', async () => { + const chunker = new RegexChunker({ + pattern: '(?=\\n\\s*\\{\\s*"id"\\s*:)', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = '{"id": 1, "v": "a"}\n{"id": 2, "v": "b"}\n{"id": 3, "v": "c"}' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(3) + }) + }) + describe('strictBoundaries mode', () => { it.concurrent( 'should produce one chunk per match without merging small adjacent segments', diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts index 0de118b8de5..e7253ac03ec 100644 --- a/apps/sim/lib/chunkers/regex-chunker.ts +++ b/apps/sim/lib/chunkers/regex-chunker.ts @@ -15,6 +15,33 @@ const logger = createLogger('RegexChunker') const MAX_PATTERN_LENGTH = 500 +/** + * Converts unescaped capturing groups `(...)` into non-capturing groups `(?:...)`. 
+ * `String.prototype.split()` interleaves captured groups into the result array, + * which would surface delimiter text as spurious chunks. Lookarounds, named + * groups, and other `(?...)` constructs are left untouched. + */ +function toNonCapturing(pattern: string): string { + let result = '' + let inClass = false + for (let i = 0; i < pattern.length; i++) { + const c = pattern[i] + if (c === '\\' && i + 1 < pattern.length) { + result += c + pattern[i + 1] + i++ + continue + } + if (c === '[') inClass = true + else if (c === ']') inClass = false + if (!inClass && c === '(' && pattern[i + 1] !== '?') { + result += '(?:' + continue + } + result += c + } + return result +} + export class RegexChunker { private readonly chunkSize: number private readonly chunkOverlap: number @@ -39,7 +66,7 @@ export class RegexChunker { } try { - const regex = new RegExp(pattern, 'g') + const regex = new RegExp(toNonCapturing(pattern), 'g') const testStrings = [ 'a'.repeat(10000), From 236b94823497db4d64c27b073b78770b5fbde55c Mon Sep 17 00:00:00 2001 From: Waleed Latif Date: Thu, 30 Apr 2026 17:55:16 -0700 Subject: [PATCH 3/4] fix(chunkers): also strip named capture groups in regex patterns Named groups (?...) are still capturing groups so split() interleaves their matched text. Convert them to non-capturing alongside plain ( groups. 
--- apps/sim/lib/chunkers/regex-chunker.test.ts | 21 +++++++++++++++++ apps/sim/lib/chunkers/regex-chunker.ts | 25 +++++++++++++++------ 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/apps/sim/lib/chunkers/regex-chunker.test.ts b/apps/sim/lib/chunkers/regex-chunker.test.ts index e6c8122cafb..db9fdab5d4b 100644 --- a/apps/sim/lib/chunkers/regex-chunker.test.ts +++ b/apps/sim/lib/chunkers/regex-chunker.test.ts @@ -206,6 +206,27 @@ describe('RegexChunker', () => { } ) + it.concurrent( + 'should not include delimiter text when pattern uses named capture groups', + async () => { + const chunker = new RegexChunker({ + pattern: '(?---)', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = 'Section one content.---Section two content.---Section three content.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(3) + expect(chunks[0].text).toBe('Section one content.') + expect(chunks[1].text).toBe('Section two content.') + expect(chunks[2].text).toBe('Section three content.') + for (const chunk of chunks) { + expect(chunk.text).not.toBe('---') + } + } + ) + it.concurrent('should leave non-capturing groups and lookarounds intact', async () => { const chunker = new RegexChunker({ pattern: '(?=\\n\\s*\\{\\s*"id"\\s*:)', diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts index e7253ac03ec..ab8d41e5648 100644 --- a/apps/sim/lib/chunkers/regex-chunker.ts +++ b/apps/sim/lib/chunkers/regex-chunker.ts @@ -15,11 +15,14 @@ const logger = createLogger('RegexChunker') const MAX_PATTERN_LENGTH = 500 +const NAMED_GROUP_PREFIX = /^\(\?<[^>]+>/ + /** - * Converts unescaped capturing groups `(...)` into non-capturing groups `(?:...)`. - * `String.prototype.split()` interleaves captured groups into the result array, - * which would surface delimiter text as spurious chunks. Lookarounds, named - * groups, and other `(?...)` constructs are left untouched. 
+ * Converts unescaped capturing groups `(...)` and named capturing groups + * `(?...)` into non-capturing groups `(?:...)`. `String.prototype.split()` + * interleaves captured text (named or otherwise) into the result array, which + * would surface delimiter text as spurious chunks. Lookarounds (`(?=`, `(?!`, + * `(?<=`, `(? Date: Thu, 30 Apr 2026 18:05:19 -0700 Subject: [PATCH 4/4] fix(chunkers): exclude lookbehind from named-group rewrite Tighten NAMED_GROUP_PREFIX with negative lookahead so patterns like (?<=) are not misidentified as named capture groups. --- apps/sim/lib/chunkers/regex-chunker.test.ts | 15 +++++++++++++++ apps/sim/lib/chunkers/regex-chunker.ts | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/apps/sim/lib/chunkers/regex-chunker.test.ts b/apps/sim/lib/chunkers/regex-chunker.test.ts index db9fdab5d4b..f4f55112e55 100644 --- a/apps/sim/lib/chunkers/regex-chunker.test.ts +++ b/apps/sim/lib/chunkers/regex-chunker.test.ts @@ -227,6 +227,21 @@ describe('RegexChunker', () => { } ) + it.concurrent('should preserve lookbehind whose body contains a > character', async () => { + const chunker = new RegexChunker({ + pattern: '(?<=)', + chunkSize: 1024, + strictBoundaries: true, + }) + const text = '
<p>one</p><p>two</p><p>three</p>' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(3) + expect(chunks[0].text).toBe('<p>one</p>') + expect(chunks[1].text).toBe('<p>two</p>') + expect(chunks[2].text).toBe('<p>three</p>') + }) + it.concurrent('should leave non-capturing groups and lookarounds intact', async () => { const chunker = new RegexChunker({ pattern: '(?=\\n\\s*\\{\\s*"id"\\s*:)', diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts index ab8d41e5648..0cafa47b0d3 100644 --- a/apps/sim/lib/chunkers/regex-chunker.ts +++ b/apps/sim/lib/chunkers/regex-chunker.ts @@ -15,7 +15,7 @@ const logger = createLogger('RegexChunker') const MAX_PATTERN_LENGTH = 500 -const NAMED_GROUP_PREFIX = /^\(\?<[^>]+>/ +const NAMED_GROUP_PREFIX = /^\(\?<(?![=!])[^>]+>/ /** * Converts unescaped capturing groups `(...)` and named capturing groups