Skip to content

Commit 53624f3

Browse files
waleedlatif1 and claude committed
fix(knowledge): use provider tokenizer for chunks and bound rerank indices
- documents/service.ts: replace the ceil(len/4) heuristic with estimateTokenCount, using the embedding model's tokenizerProvider so token counts match billing
- reranker.ts: filter Cohere rerank results to valid indices before mapping, to defend against malformed responses
- utils.test.ts: add embeddingModel to the kb fixture so getEmbeddingModelInfo resolves

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 7a9ba8b commit 53624f3

3 files changed

Lines changed: 12 additions & 5 deletions

File tree

apps/sim/app/api/knowledge/utils.test.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -212,6 +212,7 @@ describe('Knowledge Utils', () => {
212212
id: 'kb1',
213213
userId: 'user1',
214214
workspaceId: null,
215+
embeddingModel: 'text-embedding-3-small',
215216
chunkingConfig: { maxSize: 1024, minSize: 1, overlap: 200 },
216217
})
217218
docRows.push({ id: 'doc1', knowledgeBaseId: 'kb1' })

apps/sim/lib/knowledge/documents/service.ts

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ import { env } from '@/lib/core/config/env'
3434
import { getCostMultiplier, isTriggerDevEnabled } from '@/lib/core/config/feature-flags'
3535
import { processDocument } from '@/lib/knowledge/documents/document-processor'
3636
import type { DocumentSortField, SortOrder } from '@/lib/knowledge/documents/types'
37+
import { getEmbeddingModelInfo } from '@/lib/knowledge/embedding-models'
3738
import { generateEmbeddings } from '@/lib/knowledge/embeddings'
3839
import {
3940
buildUndefinedTagsError,
@@ -43,6 +44,7 @@ import {
4344
validateTagValue,
4445
} from '@/lib/knowledge/tags/utils'
4546
import type { ProcessedDocumentTags } from '@/lib/knowledge/types'
47+
import { estimateTokenCount } from '@/lib/tokenization/estimators'
4648
import { deleteFile } from '@/lib/uploads/core/storage-service'
4749
import { extractStorageKey } from '@/lib/uploads/utils/file-utils'
4850
import type { DocumentProcessingPayload } from '@/background/knowledge-processing'
@@ -533,6 +535,8 @@ export async function processDocumentAsync(
533535

534536
logger.info(`[${documentId}] Creating embedding records with tags`)
535537

538+
const tokenizerProvider = getEmbeddingModelInfo(kbEmbeddingModel).tokenizerProvider
539+
536540
const embeddingRecords = processed.chunks.map((chunk, chunkIndex) => ({
537541
id: generateId(),
538542
knowledgeBaseId,
@@ -541,7 +545,7 @@ export async function processDocumentAsync(
541545
chunkHash: sha256Hex(chunk.text),
542546
content: chunk.text,
543547
contentLength: chunk.text.length,
544-
tokenCount: Math.ceil(chunk.text.length / 4),
548+
tokenCount: estimateTokenCount(chunk.text, tokenizerProvider),
545549
embedding: embeddings[chunkIndex] || null,
546550
embeddingModel: kbEmbeddingModel,
547551
startOffset: chunk.metadata.startIndex,

apps/sim/lib/knowledge/reranker.ts

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -150,10 +150,12 @@ export async function rerank<T extends RerankItem>(
150150
)
151151

152152
return {
153-
results: response.results.map((r) => ({
154-
item: cappedItems[r.index],
155-
relevanceScore: r.relevance_score,
156-
})),
153+
results: response.results
154+
.filter((r) => r.index >= 0 && r.index < cappedItems.length)
155+
.map((r) => ({
156+
item: cappedItems[r.index],
157+
relevanceScore: r.relevance_score,
158+
})),
157159
isBYOK,
158160
}
159161
}

0 commit comments

Comments
 (0)