Skip to content

Commit 5f89c71

Browse files
waleedlatif1claude
andauthored
feat(knowledge): add upsert document operation (#3644)
* feat(knowledge): add upsert document operation to Knowledge block Add a "Create or Update" (upsert) document capability that finds an existing document by ID or filename, deletes it if found, then creates a new document and queues re-processing. Includes new tool, API route, block wiring, and typed interfaces. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix(knowledge): address review comments on upsert document - Reorder create-then-delete to prevent data loss if creation fails - Move Zod validation before workflow authorization for validated input - Fix btoa stack overflow for large content using loop-based encoding Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix(knowledge): guard against empty createDocumentRecords result Add safety check before accessing firstDocument to prevent TypeError and data loss if createDocumentRecords unexpectedly returns empty. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix(knowledge): prevent documentId fallthrough and use byte-count limit - Use if/else so filename lookup only runs when no documentId is provided, preventing stale IDs from silently replacing unrelated documents - Check utf8 byte length instead of character count for 1MB size limit, correctly handling multi-byte characters (CJK, emoji) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix(knowledge): rollback on delete failure, deduplicate sub-block IDs - Add compensating rollback: if deleteDocument throws after create succeeds, clean up the new record to prevent orphaned pending docs - Merge duplicate name/content sub-blocks into single entries with array conditions, matching the documentTags pattern Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * lint * lint Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2bc11a7 commit 5f89c71

File tree

6 files changed

+492
-4
lines changed

6 files changed

+492
-4
lines changed
Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
import { randomUUID } from 'crypto'
2+
import { db } from '@sim/db'
3+
import { document } from '@sim/db/schema'
4+
import { createLogger } from '@sim/logger'
5+
import { and, eq, isNull } from 'drizzle-orm'
6+
import { type NextRequest, NextResponse } from 'next/server'
7+
import { z } from 'zod'
8+
import { AuditAction, AuditResourceType, recordAudit } from '@/lib/audit/log'
9+
import { checkSessionOrInternalAuth } from '@/lib/auth/hybrid'
10+
import {
11+
createDocumentRecords,
12+
deleteDocument,
13+
getProcessingConfig,
14+
processDocumentsWithQueue,
15+
} from '@/lib/knowledge/documents/service'
16+
import { authorizeWorkflowByWorkspacePermission } from '@/lib/workflows/utils'
17+
import { checkKnowledgeBaseWriteAccess } from '@/app/api/knowledge/utils'
18+
19+
const logger = createLogger('DocumentUpsertAPI')
20+
21+
const UpsertDocumentSchema = z.object({
22+
documentId: z.string().optional(),
23+
filename: z.string().min(1, 'Filename is required'),
24+
fileUrl: z.string().min(1, 'File URL is required'),
25+
fileSize: z.number().min(1, 'File size must be greater than 0'),
26+
mimeType: z.string().min(1, 'MIME type is required'),
27+
documentTagsData: z.string().optional(),
28+
processingOptions: z.object({
29+
chunkSize: z.number().min(100).max(4000),
30+
minCharactersPerChunk: z.number().min(1).max(2000),
31+
recipe: z.string(),
32+
lang: z.string(),
33+
chunkOverlap: z.number().min(0).max(500),
34+
}),
35+
workflowId: z.string().optional(),
36+
})
37+
38+
export async function POST(req: NextRequest, { params }: { params: Promise<{ id: string }> }) {
39+
const requestId = randomUUID().slice(0, 8)
40+
const { id: knowledgeBaseId } = await params
41+
42+
try {
43+
const body = await req.json()
44+
45+
logger.info(`[${requestId}] Knowledge base document upsert request`, {
46+
knowledgeBaseId,
47+
hasDocumentId: !!body.documentId,
48+
filename: body.filename,
49+
})
50+
51+
const auth = await checkSessionOrInternalAuth(req, { requireWorkflowId: false })
52+
if (!auth.success || !auth.userId) {
53+
logger.warn(`[${requestId}] Authentication failed: ${auth.error || 'Unauthorized'}`)
54+
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
55+
}
56+
const userId = auth.userId
57+
58+
const validatedData = UpsertDocumentSchema.parse(body)
59+
60+
if (validatedData.workflowId) {
61+
const authorization = await authorizeWorkflowByWorkspacePermission({
62+
workflowId: validatedData.workflowId,
63+
userId,
64+
action: 'write',
65+
})
66+
if (!authorization.allowed) {
67+
return NextResponse.json(
68+
{ error: authorization.message || 'Access denied' },
69+
{ status: authorization.status }
70+
)
71+
}
72+
}
73+
74+
const accessCheck = await checkKnowledgeBaseWriteAccess(knowledgeBaseId, userId)
75+
76+
if (!accessCheck.hasAccess) {
77+
if ('notFound' in accessCheck && accessCheck.notFound) {
78+
logger.warn(`[${requestId}] Knowledge base not found: ${knowledgeBaseId}`)
79+
return NextResponse.json({ error: 'Knowledge base not found' }, { status: 404 })
80+
}
81+
logger.warn(
82+
`[${requestId}] User ${userId} attempted to upsert document in unauthorized knowledge base ${knowledgeBaseId}`
83+
)
84+
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
85+
}
86+
87+
let existingDocumentId: string | null = null
88+
let isUpdate = false
89+
90+
if (validatedData.documentId) {
91+
const existingDoc = await db
92+
.select({ id: document.id })
93+
.from(document)
94+
.where(
95+
and(
96+
eq(document.id, validatedData.documentId),
97+
eq(document.knowledgeBaseId, knowledgeBaseId),
98+
isNull(document.deletedAt)
99+
)
100+
)
101+
.limit(1)
102+
103+
if (existingDoc.length > 0) {
104+
existingDocumentId = existingDoc[0].id
105+
}
106+
} else {
107+
const docsByFilename = await db
108+
.select({ id: document.id })
109+
.from(document)
110+
.where(
111+
and(
112+
eq(document.filename, validatedData.filename),
113+
eq(document.knowledgeBaseId, knowledgeBaseId),
114+
isNull(document.deletedAt)
115+
)
116+
)
117+
.limit(1)
118+
119+
if (docsByFilename.length > 0) {
120+
existingDocumentId = docsByFilename[0].id
121+
}
122+
}
123+
124+
if (existingDocumentId) {
125+
isUpdate = true
126+
logger.info(
127+
`[${requestId}] Found existing document ${existingDocumentId}, creating replacement before deleting old`
128+
)
129+
}
130+
131+
const createdDocuments = await createDocumentRecords(
132+
[
133+
{
134+
filename: validatedData.filename,
135+
fileUrl: validatedData.fileUrl,
136+
fileSize: validatedData.fileSize,
137+
mimeType: validatedData.mimeType,
138+
...(validatedData.documentTagsData && {
139+
documentTagsData: validatedData.documentTagsData,
140+
}),
141+
},
142+
],
143+
knowledgeBaseId,
144+
requestId
145+
)
146+
147+
const firstDocument = createdDocuments[0]
148+
if (!firstDocument) {
149+
logger.error(`[${requestId}] createDocumentRecords returned empty array unexpectedly`)
150+
return NextResponse.json({ error: 'Failed to create document record' }, { status: 500 })
151+
}
152+
153+
if (existingDocumentId) {
154+
try {
155+
await deleteDocument(existingDocumentId, requestId)
156+
} catch (deleteError) {
157+
logger.error(
158+
`[${requestId}] Failed to delete old document ${existingDocumentId}, rolling back new record`,
159+
deleteError
160+
)
161+
await deleteDocument(firstDocument.documentId, requestId).catch(() => {})
162+
return NextResponse.json({ error: 'Failed to replace existing document' }, { status: 500 })
163+
}
164+
}
165+
166+
processDocumentsWithQueue(
167+
createdDocuments,
168+
knowledgeBaseId,
169+
validatedData.processingOptions,
170+
requestId
171+
).catch((error: unknown) => {
172+
logger.error(`[${requestId}] Critical error in document processing pipeline:`, error)
173+
})
174+
175+
try {
176+
const { PlatformEvents } = await import('@/lib/core/telemetry')
177+
PlatformEvents.knowledgeBaseDocumentsUploaded({
178+
knowledgeBaseId,
179+
documentsCount: 1,
180+
uploadType: 'single',
181+
chunkSize: validatedData.processingOptions.chunkSize,
182+
recipe: validatedData.processingOptions.recipe,
183+
})
184+
} catch (_e) {
185+
// Silently fail
186+
}
187+
188+
recordAudit({
189+
workspaceId: accessCheck.knowledgeBase?.workspaceId ?? null,
190+
actorId: userId,
191+
actorName: auth.userName,
192+
actorEmail: auth.userEmail,
193+
action: isUpdate ? AuditAction.DOCUMENT_UPDATED : AuditAction.DOCUMENT_UPLOADED,
194+
resourceType: AuditResourceType.DOCUMENT,
195+
resourceId: knowledgeBaseId,
196+
resourceName: validatedData.filename,
197+
description: isUpdate
198+
? `Upserted (replaced) document "${validatedData.filename}" in knowledge base "${knowledgeBaseId}"`
199+
: `Upserted (created) document "${validatedData.filename}" in knowledge base "${knowledgeBaseId}"`,
200+
metadata: {
201+
fileName: validatedData.filename,
202+
previousDocumentId: existingDocumentId,
203+
isUpdate,
204+
},
205+
request: req,
206+
})
207+
208+
return NextResponse.json({
209+
success: true,
210+
data: {
211+
documentsCreated: [
212+
{
213+
documentId: firstDocument.documentId,
214+
filename: firstDocument.filename,
215+
status: 'pending',
216+
},
217+
],
218+
isUpdate,
219+
previousDocumentId: existingDocumentId,
220+
processingMethod: 'background',
221+
processingConfig: {
222+
maxConcurrentDocuments: getProcessingConfig().maxConcurrentDocuments,
223+
batchSize: getProcessingConfig().batchSize,
224+
},
225+
},
226+
})
227+
} catch (error) {
228+
if (error instanceof z.ZodError) {
229+
logger.warn(`[${requestId}] Invalid upsert request data`, { errors: error.errors })
230+
return NextResponse.json(
231+
{ error: 'Invalid request data', details: error.errors },
232+
{ status: 400 }
233+
)
234+
}
235+
236+
logger.error(`[${requestId}] Error upserting document`, error)
237+
238+
const errorMessage = error instanceof Error ? error.message : 'Failed to upsert document'
239+
const isStorageLimitError =
240+
errorMessage.includes('Storage limit exceeded') || errorMessage.includes('storage limit')
241+
const isMissingKnowledgeBase = errorMessage === 'Knowledge base not found'
242+
243+
return NextResponse.json(
244+
{ error: errorMessage },
245+
{ status: isMissingKnowledgeBase ? 404 : isStorageLimitError ? 413 : 500 }
246+
)
247+
}
248+
}

apps/sim/blocks/blocks/knowledge.ts

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ export const KnowledgeBlock: BlockConfig = {
2929
{ label: 'List Documents', id: 'list_documents' },
3030
{ label: 'Get Document', id: 'get_document' },
3131
{ label: 'Create Document', id: 'create_document' },
32+
{ label: 'Upsert Document', id: 'upsert_document' },
3233
{ label: 'Delete Document', id: 'delete_document' },
3334
{ label: 'List Chunks', id: 'list_chunks' },
3435
{ label: 'Upload Chunk', id: 'upload_chunk' },
@@ -175,14 +176,14 @@ export const KnowledgeBlock: BlockConfig = {
175176
condition: { field: 'operation', value: 'upload_chunk' },
176177
},
177178

178-
// --- Create Document ---
179+
// --- Create Document / Upsert Document ---
179180
{
180181
id: 'name',
181182
title: 'Document Name',
182183
type: 'short-input',
183184
placeholder: 'Enter document name',
184185
required: true,
185-
condition: { field: 'operation', value: 'create_document' },
186+
condition: { field: 'operation', value: ['create_document', 'upsert_document'] },
186187
},
187188
{
188189
id: 'content',
@@ -191,14 +192,21 @@ export const KnowledgeBlock: BlockConfig = {
191192
placeholder: 'Enter the document content',
192193
rows: 6,
193194
required: true,
194-
condition: { field: 'operation', value: 'create_document' },
195+
condition: { field: 'operation', value: ['create_document', 'upsert_document'] },
196+
},
197+
{
198+
id: 'upsertDocumentId',
199+
title: 'Document ID (Optional)',
200+
type: 'short-input',
201+
placeholder: 'Enter existing document ID to update (or leave empty to match by name)',
202+
condition: { field: 'operation', value: 'upsert_document' },
195203
},
196204
{
197205
id: 'documentTags',
198206
title: 'Document Tags',
199207
type: 'document-tag-entry',
200208
dependsOn: ['knowledgeBaseSelector'],
201-
condition: { field: 'operation', value: 'create_document' },
209+
condition: { field: 'operation', value: ['create_document', 'upsert_document'] },
202210
},
203211

204212
// --- Update Chunk / Delete Chunk ---
@@ -264,6 +272,7 @@ export const KnowledgeBlock: BlockConfig = {
264272
'knowledge_search',
265273
'knowledge_upload_chunk',
266274
'knowledge_create_document',
275+
'knowledge_upsert_document',
267276
'knowledge_list_tags',
268277
'knowledge_list_documents',
269278
'knowledge_get_document',
@@ -284,6 +293,8 @@ export const KnowledgeBlock: BlockConfig = {
284293
return 'knowledge_upload_chunk'
285294
case 'create_document':
286295
return 'knowledge_create_document'
296+
case 'upsert_document':
297+
return 'knowledge_upsert_document'
287298
case 'list_tags':
288299
return 'knowledge_list_tags'
289300
case 'list_documents':
@@ -355,6 +366,11 @@ export const KnowledgeBlock: BlockConfig = {
355366
if (params.chunkEnabledFilter) params.enabled = params.chunkEnabledFilter
356367
}
357368

369+
// Map upsert sub-block field to tool param
370+
if (params.operation === 'upsert_document' && params.upsertDocumentId) {
371+
params.documentId = String(params.upsertDocumentId).trim()
372+
}
373+
358374
// Convert enabled dropdown string to boolean for update_chunk
359375
if (params.operation === 'update_chunk' && typeof params.enabled === 'string') {
360376
params.enabled = params.enabled === 'true'
@@ -382,6 +398,7 @@ export const KnowledgeBlock: BlockConfig = {
382398
documentTags: { type: 'string', description: 'Document tags' },
383399
chunkSearch: { type: 'string', description: 'Search filter for chunks' },
384400
chunkEnabledFilter: { type: 'string', description: 'Filter chunks by enabled status' },
401+
upsertDocumentId: { type: 'string', description: 'Document ID for upsert operation' },
385402
connectorId: { type: 'string', description: 'Connector identifier' },
386403
},
387404
outputs: {

apps/sim/tools/knowledge/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import { knowledgeSearchTool } from '@/tools/knowledge/search'
1111
import { knowledgeTriggerSyncTool } from '@/tools/knowledge/trigger_sync'
1212
import { knowledgeUpdateChunkTool } from '@/tools/knowledge/update_chunk'
1313
import { knowledgeUploadChunkTool } from '@/tools/knowledge/upload_chunk'
14+
import { knowledgeUpsertDocumentTool } from '@/tools/knowledge/upsert_document'
1415

1516
export {
1617
knowledgeSearchTool,
@@ -26,4 +27,5 @@ export {
2627
knowledgeListConnectorsTool,
2728
knowledgeGetConnectorTool,
2829
knowledgeTriggerSyncTool,
30+
knowledgeUpsertDocumentTool,
2931
}

apps/sim/tools/knowledge/types.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,3 +286,33 @@ export interface KnowledgeTriggerSyncResponse {
286286
}
287287
error?: string
288288
}
289+
290+
export interface KnowledgeUpsertDocumentParams {
291+
knowledgeBaseId: string
292+
name: string
293+
content: string
294+
documentId?: string
295+
documentTags?: Record<string, unknown>
296+
_context?: { workflowId?: string }
297+
}
298+
299+
export interface KnowledgeUpsertDocumentResult {
300+
documentId: string
301+
documentName: string
302+
type: string
303+
enabled: boolean
304+
isUpdate: boolean
305+
previousDocumentId: string | null
306+
createdAt: string
307+
updatedAt: string
308+
}
309+
310+
export interface KnowledgeUpsertDocumentResponse {
311+
success: boolean
312+
output: {
313+
data: KnowledgeUpsertDocumentResult
314+
message: string
315+
documentId: string
316+
}
317+
error?: string
318+
}

0 commit comments

Comments
 (0)