Skip to content

Commit c7ee74e

Browse files
authored
improvement(knowledge-upload): upload content from workflow; improve knowledge/base/document (#465)
* improvement(knowledge-upload): added ability to upload chunks manually
* improvement(knowledge-upload): ui/ux and file structure
* improvement(knowledge-upload): added knowledge upload tool
1 parent 00f893e commit c7ee74e

File tree

26 files changed

+1501
-492
lines changed

26 files changed

+1501
-492
lines changed

apps/sim/app/api/knowledge/[id]/documents/[documentId]/chunks/[chunkId]/route.ts

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
import { eq } from 'drizzle-orm'
1+
import { eq, sql } from 'drizzle-orm'
22
import { type NextRequest, NextResponse } from 'next/server'
33
import { z } from 'zod'
44
import { getSession } from '@/lib/auth'
55
import { createLogger } from '@/lib/logs/console-logger'
66
import { db } from '@/db'
7-
import { embedding } from '@/db/schema'
7+
import { document, embedding } from '@/db/schema'
88
import { checkChunkAccess } from '../../../../../utils'
99

1010
const logger = createLogger('ChunkByIdAPI')
@@ -188,8 +188,37 @@ export async function DELETE(
188188
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
189189
}
190190

191-
// Delete the chunk
192-
await db.delete(embedding).where(eq(embedding.id, chunkId))
191+
// Use transaction to atomically delete chunk and update document statistics
192+
await db.transaction(async (tx) => {
193+
// Get chunk data before deletion for statistics update
194+
const chunkToDelete = await tx
195+
.select({
196+
tokenCount: embedding.tokenCount,
197+
contentLength: embedding.contentLength,
198+
})
199+
.from(embedding)
200+
.where(eq(embedding.id, chunkId))
201+
.limit(1)
202+
203+
if (chunkToDelete.length === 0) {
204+
throw new Error('Chunk not found')
205+
}
206+
207+
const chunk = chunkToDelete[0]
208+
209+
// Delete the chunk
210+
await tx.delete(embedding).where(eq(embedding.id, chunkId))
211+
212+
// Update document statistics
213+
await tx
214+
.update(document)
215+
.set({
216+
chunkCount: sql`${document.chunkCount} - 1`,
217+
tokenCount: sql`${document.tokenCount} - ${chunk.tokenCount}`,
218+
characterCount: sql`${document.characterCount} - ${chunk.contentLength}`,
219+
})
220+
.where(eq(document.id, documentId))
221+
})
193222

194223
logger.info(
195224
`[${requestId}] Chunk deleted: ${chunkId} from document ${documentId} in knowledge base ${knowledgeBaseId}`

apps/sim/app/api/knowledge/[id]/documents/[documentId]/chunks/route.ts

Lines changed: 141 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1+
import crypto from 'crypto'
12
import { and, asc, eq, ilike, sql } from 'drizzle-orm'
23
import { type NextRequest, NextResponse } from 'next/server'
34
import { z } from 'zod'
45
import { getSession } from '@/lib/auth'
56
import { createLogger } from '@/lib/logs/console-logger'
67
import { db } from '@/db'
7-
import { embedding } from '@/db/schema'
8-
import { checkDocumentAccess } from '../../../../utils'
8+
import { document, embedding } from '@/db/schema'
9+
import { checkDocumentAccess, generateEmbeddings } from '../../../../utils'
910

1011
const logger = createLogger('DocumentChunksAPI')
1112

@@ -17,6 +18,12 @@ const GetChunksQuerySchema = z.object({
1718
offset: z.coerce.number().min(0).optional().default(0),
1819
})
1920

21+
// Field-level validators for a manually created chunk.
// `content` must be a non-empty string of at most 10000 characters.
const manualChunkContent = z.string().min(1, 'Content is required').max(10000, 'Content too long')
// `enabled` may be omitted, in which case it resolves to `true`.
const manualChunkEnabled = z.boolean().optional().default(true)

/**
 * Request-body schema for creating a chunk by hand (as opposed to chunks
 * produced by document processing).
 */
const CreateChunkSchema = z.object({
  content: manualChunkContent,
  enabled: manualChunkEnabled,
})
26+
2027
export async function GET(
2128
req: NextRequest,
2229
{ params }: { params: Promise<{ id: string; documentId: string }> }
@@ -142,3 +149,135 @@ export async function GET(
142149
return NextResponse.json({ error: 'Failed to fetch chunks' }, { status: 500 })
143150
}
144151
}
152+
153+
/**
 * Creates a manual chunk inside an existing document of a knowledge base.
 *
 * Flow:
 *  1. Require an authenticated session (401 otherwise).
 *  2. Check the caller's access to the document via `checkDocumentAccess`
 *     (404 when the check reports `notFound`, 401 for any other denial).
 *  3. Reject documents whose `processingStatus` is `'failed'` (400).
 *  4. Parse and validate the JSON body against `CreateChunkSchema` (400 on
 *     malformed JSON or schema violations).
 *  5. Generate an embedding for the content, then — in a single transaction —
 *     allocate the next `chunkIndex`, insert the chunk and bump the document's
 *     `chunkCount` / `tokenCount` / `characterCount`.
 *
 * @param req - Incoming request; its JSON body must satisfy `CreateChunkSchema`.
 * @param params - Route params: `id` (knowledge base id) and `documentId`.
 * @returns JSON `{ success: true, data: <inserted chunk row> }` on success, or
 *   `{ error, details? }` with status 400 / 401 / 404 / 500 on failure.
 */
export async function POST(
  req: NextRequest,
  { params }: { params: Promise<{ id: string; documentId: string }> }
) {
  const requestId = crypto.randomUUID().slice(0, 8)
  const { id: knowledgeBaseId, documentId } = await params

  try {
    const session = await getSession()
    if (!session?.user?.id) {
      logger.warn(`[${requestId}] Unauthorized chunk creation attempt`)
      return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
    }

    const accessCheck = await checkDocumentAccess(knowledgeBaseId, documentId, session.user.id)

    if (!accessCheck.hasAccess) {
      if (accessCheck.notFound) {
        logger.warn(
          `[${requestId}] ${accessCheck.reason}: KB=${knowledgeBaseId}, Doc=${documentId}`
        )
        return NextResponse.json({ error: accessCheck.reason }, { status: 404 })
      }
      logger.warn(
        `[${requestId}] User ${session.user.id} attempted unauthorized chunk creation: ${accessCheck.reason}`
      )
      return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
    }

    const doc = accessCheck.document
    if (!doc) {
      logger.warn(
        `[${requestId}] Document data not available: KB=${knowledgeBaseId}, Doc=${documentId}`
      )
      return NextResponse.json({ error: 'Document not found' }, { status: 404 })
    }

    // Allow manual chunk creation even if document is not fully processed
    // but it should exist and not be in failed state
    if (doc.processingStatus === 'failed') {
      logger.warn(`[${requestId}] Document ${documentId} is in failed state, cannot add chunks`)
      return NextResponse.json({ error: 'Cannot add chunks to failed document' }, { status: 400 })
    }

    // A syntactically invalid body is a client error, not a server failure:
    // answer 400 here instead of letting the parse error reach the 500 handler.
    let body: unknown
    try {
      body = await req.json()
    } catch {
      logger.warn(`[${requestId}] Request body is not valid JSON`)
      return NextResponse.json({ error: 'Invalid JSON in request body' }, { status: 400 })
    }

    const parsed = CreateChunkSchema.safeParse(body)
    if (!parsed.success) {
      logger.warn(`[${requestId}] Invalid chunk creation data`, {
        errors: parsed.error.errors,
      })
      return NextResponse.json(
        { error: 'Invalid request data', details: parsed.error.errors },
        { status: 400 }
      )
    }
    const validatedData = parsed.data

    // Generate embedding for the content first (outside transaction for performance)
    logger.info(`[${requestId}] Generating embedding for manual chunk`)
    const embeddings = await generateEmbeddings([validatedData.content])
    const chunkEmbedding = embeddings[0]
    if (chunkEmbedding === undefined) {
      // Never persist a chunk without a vector; surfaces as a 500 below.
      throw new Error('Embedding generation returned no result')
    }

    const chunkId = crypto.randomUUID()
    const now = new Date()

    const newChunk = await db.transaction(async (tx) => {
      // Take a row lock on the parent document. Reading the max chunkIndex in
      // a transaction is not enough on its own: two concurrent requests could
      // both read the same max and insert duplicate indexes. The lock makes
      // concurrent creators for the same document run one after another.
      // NOTE(review): relies on the PostgreSQL `FOR UPDATE` clause — confirm
      // `db` is a Postgres-backed drizzle instance.
      const lockedDocument = await tx
        .select({ id: document.id })
        .from(document)
        .where(eq(document.id, documentId))
        .for('update')

      if (lockedDocument.length === 0) {
        // Document disappeared between the access check and now.
        return null
      }

      // Get the next chunk index (safe now that the document row is locked)
      const lastChunk = await tx
        .select({ chunkIndex: embedding.chunkIndex })
        .from(embedding)
        .where(eq(embedding.documentId, documentId))
        .orderBy(sql`${embedding.chunkIndex} DESC`)
        .limit(1)

      const nextChunkIndex = lastChunk.length > 0 ? lastChunk[0].chunkIndex + 1 : 0

      const chunkData = {
        id: chunkId,
        knowledgeBaseId,
        documentId,
        chunkIndex: nextChunkIndex,
        chunkHash: crypto.createHash('sha256').update(validatedData.content).digest('hex'),
        content: validatedData.content,
        contentLength: validatedData.content.length,
        tokenCount: Math.ceil(validatedData.content.length / 4), // Rough approximation
        embedding: chunkEmbedding,
        embeddingModel: 'text-embedding-3-small',
        startOffset: 0, // Manual chunks don't have document offsets
        endOffset: validatedData.content.length,
        overlapTokens: 0,
        metadata: { manual: true }, // Mark as manually created
        searchRank: '1.0',
        accessCount: 0,
        lastAccessedAt: null,
        qualityScore: null,
        enabled: validatedData.enabled,
        createdAt: now,
        updatedAt: now,
      }

      // Insert the new chunk
      await tx.insert(embedding).values(chunkData)

      // Update document statistics
      await tx
        .update(document)
        .set({
          chunkCount: sql`${document.chunkCount} + 1`,
          tokenCount: sql`${document.tokenCount} + ${chunkData.tokenCount}`,
          characterCount: sql`${document.characterCount} + ${chunkData.contentLength}`,
        })
        .where(eq(document.id, documentId))

      return chunkData
    })

    if (newChunk === null) {
      logger.warn(
        `[${requestId}] Document no longer exists: KB=${knowledgeBaseId}, Doc=${documentId}`
      )
      return NextResponse.json({ error: 'Document not found' }, { status: 404 })
    }

    logger.info(`[${requestId}] Manual chunk created: ${chunkId} in document ${documentId}`)

    return NextResponse.json({
      success: true,
      data: newChunk,
    })
  } catch (error) {
    logger.error(`[${requestId}] Error creating chunk`, error)
    return NextResponse.json({ error: 'Failed to create chunk' }, { status: 500 })
  }
}

0 commit comments

Comments
 (0)