Skip to content

Commit 4872e75

Browse files
committed
chore(chunkers): lint formatting
1 parent: 211fe90 · commit: 4872e75

8 files changed

Lines changed: 68 additions & 48 deletions

File tree

apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -74,9 +74,7 @@ const FormSchema = z
7474
.min(0, 'Overlap must be non-negative')
7575
.max(500, 'Overlap must be less than 500 tokens'),
7676
/** Chunking strategy */
77-
strategy: z
78-
.enum(['auto', 'text', 'regex', 'recursive', 'sentence', 'token'])
79-
.default('auto'),
77+
strategy: z.enum(['auto', 'text', 'regex', 'recursive', 'sentence', 'token']).default('auto'),
8078
/** Regex pattern (required when strategy is 'regex') */
8179
regexPattern: z.string().optional(),
8280
/** Custom separators for recursive strategy (comma-separated) */
@@ -474,14 +472,17 @@ export const CreateBaseModal = memo(function CreateBaseModal({
474472
<Button
475473
type='button'
476474
variant='default'
477-
className='w-full justify-between border border-[var(--border-1)] !bg-[var(--surface-1)] font-normal'
475+
className='!bg-[var(--surface-1)] w-full justify-between border border-[var(--border-1)] font-normal'
478476
>
479477
{STRATEGY_OPTIONS.find((o) => o.value === strategyValue)?.label ??
480478
'Auto (detect from content)'}
481479
<ChevronDown className='h-[12px] w-[12px] text-[var(--text-icon)]' />
482480
</Button>
483481
</DropdownMenuTrigger>
484-
<DropdownMenuContent align='start' className='w-[var(--radix-dropdown-menu-trigger-width)]'>
482+
<DropdownMenuContent
483+
align='start'
484+
className='w-[var(--radix-dropdown-menu-trigger-width)]'
485+
>
485486
<DropdownMenuRadioGroup
486487
value={strategyValue}
487488
onValueChange={(value) =>

apps/sim/lib/chunkers/docs-chunker.ts

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -218,7 +218,9 @@ export class DocsChunker {
218218
* Returns both the chunks and the cleaned content so header extraction
219219
* operates on the same text that was chunked (aligned positions).
220220
*/
221-
private async splitContent(content: string): Promise<{ chunks: string[]; cleanedContent: string }> {
221+
private async splitContent(
222+
content: string
223+
): Promise<{ chunks: string[]; cleanedContent: string }> {
222224
const cleanedContent = this.cleanContent(content)
223225

224226
const tableBoundaries = this.detectTableBoundaries(cleanedContent)
@@ -240,19 +242,17 @@ export class DocsChunker {
240242
* Clean content by removing MDX-specific elements and excessive whitespace
241243
*/
242244
private cleanContent(content: string): string {
243-
return (
244-
content
245-
.replace(/\r\n/g, '\n')
246-
.replace(/\r/g, '\n')
247-
.replace(/^import\s+.*$/gm, '')
248-
.replace(/^export\s+.*$/gm, '')
249-
.replace(/<\/?[a-zA-Z][^>]*>/g, ' ')
250-
.replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
251-
.replace(/\{[^{}]*\}/g, ' ')
252-
.replace(/\n{3,}/g, '\n\n')
253-
.replace(/[ \t]{2,}/g, ' ')
254-
.trim()
255-
)
245+
return content
246+
.replace(/\r\n/g, '\n')
247+
.replace(/\r/g, '\n')
248+
.replace(/^import\s+.*$/gm, '')
249+
.replace(/^export\s+.*$/gm, '')
250+
.replace(/<\/?[a-zA-Z][^>]*>/g, ' ')
251+
.replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
252+
.replace(/\{[^{}]*\}/g, ' ')
253+
.replace(/\n{3,}/g, '\n\n')
254+
.replace(/[ \t]{2,}/g, ' ')
255+
.trim()
256256
}
257257

258258
/**

apps/sim/lib/chunkers/json-yaml-chunker.ts

Lines changed: 39 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -56,9 +56,7 @@ export class JsonYamlChunker {
5656
const chunks = this.chunkStructuredData(data, [], 0)
5757

5858
const totalTokens = chunks.reduce((sum, c) => sum + c.tokenCount, 0)
59-
logger.info(
60-
`JSON chunking complete: ${chunks.length} chunks, ${totalTokens} total tokens`
61-
)
59+
logger.info(`JSON chunking complete: ${chunks.length} chunks, ${totalTokens} total tokens`)
6260

6361
return chunks
6462
} catch (error) {
@@ -84,11 +82,13 @@ export class JsonYamlChunker {
8482
return []
8583
}
8684

87-
return [{
88-
text: content,
89-
tokenCount: estimateTokens(content),
90-
metadata: { startIndex: 0, endIndex: content.length },
91-
}]
85+
return [
86+
{
87+
text: content,
88+
tokenCount: estimateTokens(content),
89+
metadata: { startIndex: 0, endIndex: content.length },
90+
},
91+
]
9292
}
9393

9494
/**
@@ -108,7 +108,9 @@ export class JsonYamlChunker {
108108

109109
if (itemTokens > this.chunkSize) {
110110
if (currentBatch.length > 0) {
111-
chunks.push(this.buildBatchChunk(contextHeader, currentBatch, i - currentBatch.length, i - 1))
111+
chunks.push(
112+
this.buildBatchChunk(contextHeader, currentBatch, i - currentBatch.length, i - 1)
113+
)
112114
currentBatch = []
113115
currentTokens = 0
114116
}
@@ -123,7 +125,9 @@ export class JsonYamlChunker {
123125
})
124126
}
125127
} else if (currentTokens + itemTokens > this.chunkSize && currentBatch.length > 0) {
126-
chunks.push(this.buildBatchChunk(contextHeader, currentBatch, i - currentBatch.length, i - 1))
128+
chunks.push(
129+
this.buildBatchChunk(contextHeader, currentBatch, i - currentBatch.length, i - 1)
130+
)
127131
currentBatch = [item]
128132
currentTokens = itemTokens
129133
} else {
@@ -133,7 +137,14 @@ export class JsonYamlChunker {
133137
}
134138

135139
if (currentBatch.length > 0) {
136-
chunks.push(this.buildBatchChunk(contextHeader, currentBatch, arr.length - currentBatch.length, arr.length - 1))
140+
chunks.push(
141+
this.buildBatchChunk(
142+
contextHeader,
143+
currentBatch,
144+
arr.length - currentBatch.length,
145+
arr.length - 1
146+
)
147+
)
137148
}
138149

139150
return chunks
@@ -152,11 +163,13 @@ export class JsonYamlChunker {
152163
if (fullTokens <= this.chunkSize) {
153164
const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : ''
154165
const text = contextHeader + fullContent
155-
return [{
156-
text,
157-
tokenCount: estimateTokens(text),
158-
metadata: { startIndex: 0, endIndex: text.length },
159-
}]
166+
return [
167+
{
168+
text,
169+
tokenCount: estimateTokens(text),
170+
metadata: { startIndex: 0, endIndex: text.length },
171+
},
172+
]
160173
}
161174

162175
const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : ''
@@ -188,7 +201,10 @@ export class JsonYamlChunker {
188201
metadata: { startIndex: 0, endIndex: valueStr.length },
189202
})
190203
}
191-
} else if (currentTokens + valueTokens > this.chunkSize && Object.keys(currentObj).length > 0) {
204+
} else if (
205+
currentTokens + valueTokens > this.chunkSize &&
206+
Object.keys(currentObj).length > 0
207+
) {
192208
const objContent = contextHeader + JSON.stringify(currentObj, null, 2)
193209
chunks.push({
194210
text: objContent,
@@ -218,7 +234,12 @@ export class JsonYamlChunker {
218234
/**
219235
* Build a chunk from a batch of array items
220236
*/
221-
private buildBatchChunk(contextHeader: string, batch: JsonValue[], startIdx: number, endIdx: number): Chunk {
237+
private buildBatchChunk(
238+
contextHeader: string,
239+
batch: JsonValue[],
240+
startIdx: number,
241+
endIdx: number
242+
): Chunk {
222243
const batchContent = contextHeader + JSON.stringify(batch, null, 2)
223244
return {
224245
text: batchContent,

apps/sim/lib/chunkers/sentence-chunker.ts

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,9 @@ export class SentenceChunker {
3434
*/
3535
private splitSentences(text: string): string[] {
3636
return text
37-
.split(/(?<!\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|St|Rev|Gen|Sgt|No|Fig|Vol|Ch|vs|etc|Inc|Ltd|Corp|approx|dept|est|govt|Jan|Feb|Mar|Apr|Aug|Sep|Oct|Nov|Dec|i\.e|e\.g))(?<![A-Z])(?<!\.\.)(?<!\d)(?<=[.!?])\s+/)
37+
.split(
38+
/(?<!\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|St|Rev|Gen|Sgt|No|Fig|Vol|Ch|vs|etc|Inc|Ltd|Corp|approx|dept|est|govt|Jan|Feb|Mar|Apr|Aug|Sep|Oct|Nov|Dec|i\.e|e\.g))(?<![A-Z])(?<!\.\.)(?<!\d)(?<=[.!?])\s+/
39+
)
3840
.filter((s) => s.trim().length > 0)
3941
}
4042

apps/sim/lib/chunkers/structured-data-chunker.ts

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -179,9 +179,7 @@ export class StructuredDataChunker {
179179
const delimiters = [',', '\t', '|']
180180
for (const delimiter of delimiters) {
181181
const escaped = delimiter.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
182-
const counts = lines.map(
183-
(line) => (line.match(new RegExp(escaped, 'g')) || []).length
184-
)
182+
const counts = lines.map((line) => (line.match(new RegExp(escaped, 'g')) || []).length)
185183
const avgCount = counts.reduce((a, b) => a + b, 0) / counts.length
186184

187185
const tolerance = Math.max(1, Math.ceil(avgCount * 0.2))

apps/sim/lib/chunkers/utils.ts

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -106,10 +106,7 @@ export function splitAtWordBoundaries(
106106
/**
107107
* Build Chunk objects from text segments with startIndex/endIndex metadata
108108
*/
109-
export function buildChunks(
110-
texts: string[],
111-
overlapTokens: number
112-
): Chunk[] {
109+
export function buildChunks(texts: string[], overlapTokens: number): Chunk[] {
113110
let previousEndIndex = 0
114111
const overlapChars = tokensToChars(overlapTokens)
115112

apps/sim/lib/knowledge/documents/document-processor.ts

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -154,7 +154,9 @@ async function applyStrategy(
154154
}
155155
case 'regex': {
156156
if (!strategyOptions?.pattern) {
157-
logger.warn('Regex strategy requested but no pattern provided, falling back to text chunker')
157+
logger.warn(
158+
'Regex strategy requested but no pattern provided, falling back to text chunker'
159+
)
158160
const chunker = new TextChunker(baseOptions)
159161
return chunker.chunk(content)
160162
}
@@ -164,7 +166,6 @@ async function applyStrategy(
164166
})
165167
return chunker.chunk(content)
166168
}
167-
case 'text':
168169
default: {
169170
const chunker = new TextChunker(baseOptions)
170171
return chunker.chunk(content)

apps/sim/lib/knowledge/documents/service.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,12 +27,12 @@ import {
2727
} from 'drizzle-orm'
2828
import { recordUsage } from '@/lib/billing/core/usage-log'
2929
import { checkAndBillOverageThreshold } from '@/lib/billing/threshold-billing'
30+
import type { ChunkingStrategy, StrategyOptions } from '@/lib/chunkers/types'
3031
import { createBullMQJobData, isBullMQEnabled } from '@/lib/core/bullmq'
3132
import { env } from '@/lib/core/config/env'
3233
import { getCostMultiplier, isTriggerDevEnabled } from '@/lib/core/config/feature-flags'
3334
import { generateId } from '@/lib/core/utils/uuid'
3435
import { enqueueWorkspaceDispatch } from '@/lib/core/workspace-dispatch'
35-
import type { ChunkingStrategy, StrategyOptions } from '@/lib/chunkers/types'
3636
import { processDocument } from '@/lib/knowledge/documents/document-processor'
3737
import type { DocumentSortField, SortOrder } from '@/lib/knowledge/documents/types'
3838
import { generateEmbeddings } from '@/lib/knowledge/embeddings'

0 commit comments

Comments (0)