Skip to content

Commit ff5d90e

Browse files
waleedlatif1 and claude authored
fix(knowledge): infer MIME type from file extension in create/upsert tools (#3651)
* fix(knowledge): infer MIME type from file extension in create/upsert tools

  Both create_document and upsert_document forced .txt extension and text/plain MIME type regardless of the document name. Now the tools infer the correct MIME type from the file extension (html, md, csv, json, yaml, xml) and only default to .txt when no extension is given.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* refactor(knowledge): reuse existing getMimeTypeFromExtension from uploads

  Replace duplicate EXTENSION_MIME_MAP and getMimeTypeFromExtension with the existing, more comprehensive version from lib/uploads/utils/file-utils.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(knowledge): fix btoa stack overflow and duplicate encoding in create_document

  Same fixes as upsert_document: use loop-based String.fromCharCode instead of spread, consolidate duplicate TextEncoder calls, and check byte length instead of character length for 1MB limit.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(knowledge): allowlist text-compatible MIME types in inferDocumentFileInfo

  Use an explicit allowlist instead of only checking for octet-stream, preventing binary MIME types (image/jpeg, audio/mpeg, etc.) from leaking through when a user names a document with a binary extension.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(knowledge): remove pdf/rtf from allowlist, normalize unrecognized extensions

  - Remove application/pdf and application/rtf from TEXT_COMPATIBLE_MIME_TYPES since these tools pass plain text content, not binary
  - Normalize unrecognized extensions (e.g. report.v2) to .txt instead of preserving the original extension with text/plain MIME type

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(knowledge): handle dotfile names to avoid empty base in filename

  Dotfiles like .env would produce an empty base, resulting in '.txt'. Now falls back to the original name so .env becomes .env.txt.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8b24569 commit ff5d90e

File tree

3 files changed

+64
-20
lines changed

3 files changed

+64
-20
lines changed

apps/sim/tools/knowledge/create_document.ts

Lines changed: 22 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,7 @@
1-
import type { KnowledgeCreateDocumentResponse } from '@/tools/knowledge/types'
1+
import {
2+
inferDocumentFileInfo,
3+
type KnowledgeCreateDocumentResponse,
4+
} from '@/tools/knowledge/types'
25
import { enrichKBTagsSchema } from '@/tools/schema-enrichers'
36
import { formatDocumentTagsForAPI, parseDocumentTags } from '@/tools/shared/tags'
47
import type { ToolConfig } from '@/tools/types'
@@ -63,30 +66,36 @@ export const knowledgeCreateDocumentTool: ToolConfig<any, KnowledgeCreateDocumen
6366
if (!textContent || textContent.length < 1) {
6467
throw new Error('Document content cannot be empty')
6568
}
66-
if (textContent.length > 1000000) {
69+
const utf8Bytes = new TextEncoder().encode(textContent)
70+
const contentBytes = utf8Bytes.length
71+
72+
if (contentBytes > 1_000_000) {
6773
throw new Error('Document content exceeds maximum size of 1MB')
6874
}
6975

70-
const contentBytes = new TextEncoder().encode(textContent).length
71-
72-
const utf8Bytes = new TextEncoder().encode(textContent)
73-
const base64Content =
74-
typeof Buffer !== 'undefined'
75-
? Buffer.from(textContent, 'utf8').toString('base64')
76-
: btoa(String.fromCharCode(...utf8Bytes))
76+
let base64Content: string
77+
if (typeof Buffer !== 'undefined') {
78+
base64Content = Buffer.from(textContent, 'utf8').toString('base64')
79+
} else {
80+
let binary = ''
81+
for (let i = 0; i < utf8Bytes.length; i++) {
82+
binary += String.fromCharCode(utf8Bytes[i])
83+
}
84+
base64Content = btoa(binary)
85+
}
7786

78-
const dataUri = `data:text/plain;base64,${base64Content}`
87+
const { filename, mimeType } = inferDocumentFileInfo(documentName)
88+
const dataUri = `data:${mimeType};base64,${base64Content}`
7989

80-
// Parse document tags from various formats (object, array, JSON string)
8190
const parsedTags = parseDocumentTags(params.documentTags)
8291
const tagData = formatDocumentTagsForAPI(parsedTags)
8392

8493
const documents = [
8594
{
86-
filename: documentName.endsWith('.txt') ? documentName : `${documentName}.txt`,
95+
filename,
8796
fileUrl: dataUri,
8897
fileSize: contentBytes,
89-
mimeType: 'text/plain',
98+
mimeType,
9099
...tagData,
91100
},
92101
]

apps/sim/tools/knowledge/types.ts

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,38 @@
1+
import {
2+
getFileExtension,
3+
getMimeTypeFromExtension as getUploadMimeType,
4+
} from '@/lib/uploads/utils/file-utils'
5+
6+
const TEXT_COMPATIBLE_MIME_TYPES = new Set([
7+
'text/plain',
8+
'text/html',
9+
'text/markdown',
10+
'text/csv',
11+
'application/json',
12+
'application/xml',
13+
'application/x-yaml',
14+
])
15+
16+
/**
17+
* Extracts extension from a filename and returns the normalized filename and MIME type.
18+
* If the extension maps to a recognized text-compatible MIME type, it is preserved.
19+
* Otherwise, the filename is normalized to `.txt` with `text/plain`.
20+
*/
21+
export function inferDocumentFileInfo(documentName: string): {
22+
filename: string
23+
mimeType: string
24+
} {
25+
const ext = getFileExtension(documentName)
26+
if (ext) {
27+
const mimeType = getUploadMimeType(ext)
28+
if (TEXT_COMPATIBLE_MIME_TYPES.has(mimeType)) {
29+
return { filename: documentName, mimeType }
30+
}
31+
}
32+
const base = ext ? documentName.slice(0, documentName.lastIndexOf('.')) : documentName
33+
return { filename: `${base || documentName}.txt`, mimeType: 'text/plain' }
34+
}
35+
136
export interface KnowledgeSearchResult {
237
documentId: string
338
documentName: string

apps/sim/tools/knowledge/upsert_document.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
import type {
2-
KnowledgeUpsertDocumentParams,
3-
KnowledgeUpsertDocumentResponse,
1+
import {
2+
inferDocumentFileInfo,
3+
type KnowledgeUpsertDocumentParams,
4+
type KnowledgeUpsertDocumentResponse,
45
} from '@/tools/knowledge/types'
56
import { enrichKBTagsSchema } from '@/tools/schema-enrichers'
67
import { formatDocumentTagsForAPI, parseDocumentTags } from '@/tools/shared/tags'
@@ -94,18 +95,17 @@ export const knowledgeUpsertDocumentTool: ToolConfig<
9495
base64Content = btoa(binary)
9596
}
9697

97-
const dataUri = `data:text/plain;base64,${base64Content}`
98+
const { filename, mimeType } = inferDocumentFileInfo(documentName)
99+
const dataUri = `data:${mimeType};base64,${base64Content}`
98100

99101
const parsedTags = parseDocumentTags(params.documentTags)
100102
const tagData = formatDocumentTagsForAPI(parsedTags)
101103

102-
const filename = documentName.endsWith('.txt') ? documentName : `${documentName}.txt`
103-
104104
const requestBody: Record<string, unknown> = {
105105
filename,
106106
fileUrl: dataUri,
107107
fileSize: contentBytes,
108-
mimeType: 'text/plain',
108+
mimeType,
109109
...tagData,
110110
processingOptions: {
111111
chunkSize: 1024,

0 commit comments

Comments
 (0)