fix(knowledge): infer MIME type from file extension in create/upsert tools (#3651)

waleedlatif1 · claude · web-flow · commit ff5d90e0c0f2 · 2026-03-18T10:49:18.000-07:00
* fix(knowledge): infer MIME type from file extension in create/upsert tools

Both create_document and upsert_document forced .txt extension and
text/plain MIME type regardless of the document name. Now the tools
infer the correct MIME type from the file extension (html, md, csv,
json, yaml, xml) and only default to .txt when no extension is given.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* refactor(knowledge): reuse existing getMimeTypeFromExtension from uploads

Replace duplicate EXTENSION_MIME_MAP and getMimeTypeFromExtension with
the existing, more comprehensive version from lib/uploads/utils/file-utils.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(knowledge): fix btoa stack overflow and duplicate encoding in create_document

Same fixes as upsert_document: use loop-based String.fromCharCode
instead of spread, consolidate duplicate TextEncoder calls, and
check byte length instead of character length for 1MB limit.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(knowledge): allowlist text-compatible MIME types in inferDocumentFileInfo

Use an explicit allowlist instead of only checking for octet-stream,
preventing binary MIME types (image/jpeg, audio/mpeg, etc.) from
leaking through when a user names a document with a binary extension.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(knowledge): remove pdf/rtf from allowlist, normalize unrecognized extensions

- Remove application/pdf and application/rtf from TEXT_COMPATIBLE_MIME_TYPES
  since these tools pass plain text content, not binary
- Normalize unrecognized extensions (e.g. report.v2) to .txt instead of
  preserving the original extension with text/plain MIME type

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(knowledge): handle dotfile names to avoid empty base in filename

Dotfiles like .env would produce an empty base, resulting in '.txt'.
Now falls back to the original name so .env becomes .env.txt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/apps/sim/tools/knowledge/create_document.ts b/apps/sim/tools/knowledge/create_document.ts
@@ -1,4 +1,7 @@
-import type { KnowledgeCreateDocumentResponse } from '@/tools/knowledge/types'
+import {
+  inferDocumentFileInfo,
+  type KnowledgeCreateDocumentResponse,
+} from '@/tools/knowledge/types'
 import { enrichKBTagsSchema } from '@/tools/schema-enrichers'
 import { formatDocumentTagsForAPI, parseDocumentTags } from '@/tools/shared/tags'
 import type { ToolConfig } from '@/tools/types'
@@ -63,30 +66,36 @@ export const knowledgeCreateDocumentTool: ToolConfig<any, KnowledgeCreateDocumen
       if (!textContent || textContent.length < 1) {
         throw new Error('Document content cannot be empty')
       }
-      if (textContent.length > 1000000) {
+      const utf8Bytes = new TextEncoder().encode(textContent)
+      const contentBytes = utf8Bytes.length
+
+      if (contentBytes > 1_000_000) {
         throw new Error('Document content exceeds maximum size of 1MB')
       }
 
-      const contentBytes = new TextEncoder().encode(textContent).length
-
-      const utf8Bytes = new TextEncoder().encode(textContent)
-      const base64Content =
-        typeof Buffer !== 'undefined'
-          ? Buffer.from(textContent, 'utf8').toString('base64')
-          : btoa(String.fromCharCode(...utf8Bytes))
+      let base64Content: string
+      if (typeof Buffer !== 'undefined') {
+        base64Content = Buffer.from(textContent, 'utf8').toString('base64')
+      } else {
+        let binary = ''
+        for (let i = 0; i < utf8Bytes.length; i++) {
+          binary += String.fromCharCode(utf8Bytes[i])
+        }
+        base64Content = btoa(binary)
+      }
 
-      const dataUri = `data:text/plain;base64,${base64Content}`
+      const { filename, mimeType } = inferDocumentFileInfo(documentName)
+      const dataUri = `data:${mimeType};base64,${base64Content}`
 
-      // Parse document tags from various formats (object, array, JSON string)
       const parsedTags = parseDocumentTags(params.documentTags)
       const tagData = formatDocumentTagsForAPI(parsedTags)
 
       const documents = [
         {
-          filename: documentName.endsWith('.txt') ? documentName : `${documentName}.txt`,
+          filename,
           fileUrl: dataUri,
           fileSize: contentBytes,
-          mimeType: 'text/plain',
+          mimeType,
           ...tagData,
         },
       ]
diff --git a/apps/sim/tools/knowledge/types.ts b/apps/sim/tools/knowledge/types.ts
@@ -1,3 +1,38 @@
+import {
+  getFileExtension,
+  getMimeTypeFromExtension as getUploadMimeType,
+} from '@/lib/uploads/utils/file-utils'
+
+const TEXT_COMPATIBLE_MIME_TYPES = new Set([
+  'text/plain', // also the fallback type inferDocumentFileInfo assigns to every other name
+  'text/html',
+  'text/markdown',
+  'text/csv',
+  'application/json',
+  'application/xml',
+  'application/x-yaml',
+]) // allowlist checked with .has(); names resolving to any other MIME type are renamed to .txt
+
+/**
+ * Derives the filename and MIME type to store for a text document from its name.
+ * A name whose extension resolves to an allowlisted MIME type is returned unchanged.
+ * Every other name is rewritten to end in `.txt` and reported as `text/plain`.
+ */
+export function inferDocumentFileInfo(documentName: string): {
+  filename: string
+  mimeType: string
+} {
+  const extension = getFileExtension(documentName)
+  const detected = extension ? getUploadMimeType(extension) : undefined
+  if (detected !== undefined && TEXT_COMPATIBLE_MIME_TYPES.has(detected)) {
+    return { filename: documentName, mimeType: detected }
+  }
+  // An empty stem (nothing before the last dot) falls back to the full name before `.txt`.
+  const stem = extension ? documentName.slice(0, documentName.lastIndexOf('.')) : ''
+  const prefix = stem || documentName
+  return { filename: `${prefix}.txt`, mimeType: 'text/plain' }
+}
+
 export interface KnowledgeSearchResult {
   documentId: string
   documentName: string
diff --git a/apps/sim/tools/knowledge/upsert_document.ts b/apps/sim/tools/knowledge/upsert_document.ts
@@ -1,6 +1,7 @@
-import type {
-  KnowledgeUpsertDocumentParams,
-  KnowledgeUpsertDocumentResponse,
+import {
+  inferDocumentFileInfo,
+  type KnowledgeUpsertDocumentParams,
+  type KnowledgeUpsertDocumentResponse,
 } from '@/tools/knowledge/types'
 import { enrichKBTagsSchema } from '@/tools/schema-enrichers'
 import { formatDocumentTagsForAPI, parseDocumentTags } from '@/tools/shared/tags'
@@ -94,18 +95,17 @@ export const knowledgeUpsertDocumentTool: ToolConfig<
         base64Content = btoa(binary)
       }
 
-      const dataUri = `data:text/plain;base64,${base64Content}`
+      const { filename, mimeType } = inferDocumentFileInfo(documentName)
+      const dataUri = `data:${mimeType};base64,${base64Content}`
 
       const parsedTags = parseDocumentTags(params.documentTags)
       const tagData = formatDocumentTagsForAPI(parsedTags)
 
-      const filename = documentName.endsWith('.txt') ? documentName : `${documentName}.txt`
-
       const requestBody: Record<string, unknown> = {
         filename,
         fileUrl: dataUri,
         fileSize: contentBytes,
-        mimeType: 'text/plain',
+        mimeType,
         ...tagData,
         processingOptions: {
           chunkSize: 1024,