fix(knowledge): match chunk tokenizer to KB embedding provider

waleedlatif1 · claude · waleedlatif1 · commit 96cf4dd71ed4 · 2026-04-29T19:50:46.000-07:00
Cursor bugbot: createChunk and updateChunk hardcoded the 'openai' tokenizer
when computing the stored tokenCount. For KBs using gemini-embedding-001 the
count was estimated with the wrong heuristic, leading to inaccurate stored
counts (and any billing derived from them). Now derive the tokenizer from
the KB's embedding model provider, matching the search route.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/apps/sim/lib/knowledge/chunks/service.ts b/apps/sim/lib/knowledge/chunks/service.ts
@@ -11,9 +11,19 @@ import type {
   ChunkQueryResult,
   CreateChunkData,
 } from '@/lib/knowledge/chunks/types'
+import { getEmbeddingModelInfo } from '@/lib/knowledge/embedding-models'
 import { generateEmbeddings } from '@/lib/knowledge/embeddings'
 import { estimateTokenCount } from '@/lib/tokenization/estimators'
 
+/**
+ * Resolve the `estimateTokenCount` provider id for an embedding model so stored token
+ * counts follow the embedding provider: 'google' when `getEmbeddingModelInfo(model)`
+ * reports provider 'gemini', and 'openai' for every other provider.
+ */
+function tokenizerProviderForEmbeddingModel(model: string): 'openai' | 'google' {
+  return getEmbeddingModelInfo(model).provider === 'gemini' ? 'google' : 'openai'
+}
+
 const logger = createLogger('ChunksService')
 
 /**
@@ -126,8 +136,11 @@ export async function createChunk(
     workspaceId
   )
 
-  // Calculate accurate token count
-  const tokenCount = estimateTokenCount(chunkData.content, 'openai')
+  // Estimate the token count with the tokenizer that matches the KB's embedding provider.
+  const tokenCount = estimateTokenCount(
+    chunkData.content,
+    tokenizerProviderForEmbeddingModel(kbEmbeddingModel)
+  )
 
   const chunkId = generateId()
   const now = new Date()
@@ -385,8 +398,11 @@ export async function updateChunk(
         }
         const { embeddings } = await generateEmbeddings([content], chunkEmbeddingModel, workspaceId)
 
-        // Calculate accurate token count
-        const tokenCount = estimateTokenCount(content, 'openai')
+        // Estimate the token count with the tokenizer that matches the KB's embedding provider.
+        const tokenCount = estimateTokenCount(
+          content,
+          tokenizerProviderForEmbeddingModel(chunkEmbeddingModel)
+        )
 
         dbUpdateData.content = content
         dbUpdateData.contentLength = newContentLength