Skip to content

Commit bd07f80

Browse files
improvement(kb): added configurability for chunks, query across multiple knowledge bases (#512)
* refactor: consolidate create modal file * fix: identify dead processes * fix: mark failed in DB after processing timeout * improvement: added overlap chunks and fixed modal UI * feat: multiselect logic * fix: biome changes for css ordering warn instead of error * improvement: create chunk ui * fix: removed unused schema columns * fix: removed references to deleted columns * improvement: sped up vector search time * feat: multi-kb search * add bulk endpoint to disable/delete multiple chunks * add bulk endpoint to disable/delete multiple chunks * fix: removed unused schema columns * fix: removed references to deleted columns * made endpoints for knowledge more RESTful, added tests * added batch operations for delete/enable/disable docs, alr have this for chunks * added migrations * added migrations --------- Co-authored-by: Waleed Latif <walif6@gmail.com>
1 parent cc74921 commit bd07f80

File tree

40 files changed

+7983
-1600
lines changed

40 files changed

+7983
-1600
lines changed

apps/sim/app/api/__test-utils__/utils.ts

Lines changed: 52 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import { NextRequest } from 'next/server'
22
import { vi } from 'vitest'
33

4-
// Add type definitions for better type safety
54
export interface MockUser {
65
id: string
76
email: string
@@ -14,7 +13,6 @@ export interface MockAuthResult {
1413
mockUnauthenticated: () => void
1514
}
1615

17-
// Database result types
1816
export interface DatabaseSelectResult {
1917
id: string
2018
[key: string]: any
@@ -234,7 +232,6 @@ export function createMockRequest(
234232
): NextRequest {
235233
const url = 'http://localhost:3000/api/test'
236234

237-
// Use the URL constructor to create a proper URL object
238235
return new NextRequest(new URL(url), {
239236
method,
240237
headers: new Headers(headers),
@@ -248,7 +245,6 @@ export function mockExecutionDependencies() {
248245
return {
249246
...(actual as any),
250247
decryptSecret: vi.fn().mockImplementation((encrypted: string) => {
251-
// Map from encrypted to decrypted
252248
const entries = Object.entries(mockEnvironmentVars)
253249
const found = entries.find(([_, val]) => val === encrypted)
254250
const key = found ? found[0] : null
@@ -570,6 +566,7 @@ export function mockDrizzleOrm() {
570566
asc: vi.fn((field) => ({ field, type: 'asc' })),
571567
desc: vi.fn((field) => ({ field, type: 'desc' })),
572568
isNull: vi.fn((field) => ({ field, type: 'isNull' })),
569+
count: vi.fn((field) => ({ field, type: 'count' })),
573570
sql: vi.fn((strings, ...values) => ({
574571
type: 'sql',
575572
sql: strings,
@@ -578,6 +575,57 @@ export function mockDrizzleOrm() {
578575
}))
579576
}
580577

578+
/**
579+
* Mock knowledge-related database schemas
580+
*/
581+
export function mockKnowledgeSchemas() {
582+
vi.doMock('@/db/schema', () => ({
583+
knowledgeBase: {
584+
id: 'kb_id',
585+
userId: 'user_id',
586+
name: 'kb_name',
587+
description: 'description',
588+
tokenCount: 'token_count',
589+
embeddingModel: 'embedding_model',
590+
embeddingDimension: 'embedding_dimension',
591+
chunkingConfig: 'chunking_config',
592+
workspaceId: 'workspace_id',
593+
createdAt: 'created_at',
594+
updatedAt: 'updated_at',
595+
deletedAt: 'deleted_at',
596+
},
597+
document: {
598+
id: 'doc_id',
599+
knowledgeBaseId: 'kb_id',
600+
filename: 'filename',
601+
fileUrl: 'file_url',
602+
fileSize: 'file_size',
603+
mimeType: 'mime_type',
604+
chunkCount: 'chunk_count',
605+
tokenCount: 'token_count',
606+
characterCount: 'character_count',
607+
processingStatus: 'processing_status',
608+
processingStartedAt: 'processing_started_at',
609+
processingCompletedAt: 'processing_completed_at',
610+
processingError: 'processing_error',
611+
enabled: 'enabled',
612+
uploadedAt: 'uploaded_at',
613+
deletedAt: 'deleted_at',
614+
},
615+
embedding: {
616+
id: 'embedding_id',
617+
documentId: 'doc_id',
618+
knowledgeBaseId: 'kb_id',
619+
chunkIndex: 'chunk_index',
620+
content: 'content',
621+
embedding: 'embedding',
622+
tokenCount: 'token_count',
623+
characterCount: 'character_count',
624+
createdAt: 'created_at',
625+
},
626+
}))
627+
}
628+
581629
/**
582630
* Mock console logger
583631
*/

apps/sim/app/api/knowledge/[id]/documents/[documentId]/chunks/[chunkId]/route.ts

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import crypto from 'node:crypto'
12
import { eq, sql } from 'drizzle-orm'
23
import { type NextRequest, NextResponse } from 'next/server'
34
import { z } from 'zod'
@@ -12,8 +13,6 @@ const logger = createLogger('ChunkByIdAPI')
1213
const UpdateChunkSchema = z.object({
1314
content: z.string().min(1, 'Content is required').optional(),
1415
enabled: z.boolean().optional(),
15-
searchRank: z.number().min(0).optional(),
16-
qualityScore: z.number().min(0).max(1).optional(),
1716
})
1817

1918
export async function GET(
@@ -103,21 +102,27 @@ export async function PUT(
103102
try {
104103
const validatedData = UpdateChunkSchema.parse(body)
105104

106-
const updateData: any = {
107-
updatedAt: new Date(),
108-
}
105+
const updateData: Partial<{
106+
content: string
107+
contentLength: number
108+
tokenCount: number
109+
chunkHash: string
110+
enabled: boolean
111+
updatedAt: Date
112+
}> = {}
109113

110-
if (validatedData.content !== undefined) {
114+
if (validatedData.content) {
111115
updateData.content = validatedData.content
112116
updateData.contentLength = validatedData.content.length
113117
// Update token count estimation (rough approximation: 4 chars per token)
114118
updateData.tokenCount = Math.ceil(validatedData.content.length / 4)
119+
updateData.chunkHash = crypto
120+
.createHash('sha256')
121+
.update(validatedData.content)
122+
.digest('hex')
115123
}
124+
116125
if (validatedData.enabled !== undefined) updateData.enabled = validatedData.enabled
117-
if (validatedData.searchRank !== undefined)
118-
updateData.searchRank = validatedData.searchRank.toString()
119-
if (validatedData.qualityScore !== undefined)
120-
updateData.qualityScore = validatedData.qualityScore.toString()
121126

122127
await db.update(embedding).set(updateData).where(eq(embedding.id, chunkId))
123128

apps/sim/app/api/knowledge/[id]/documents/[documentId]/chunks/route.ts

Lines changed: 150 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import crypto from 'crypto'
2-
import { and, asc, eq, ilike, sql } from 'drizzle-orm'
2+
import { and, asc, eq, ilike, inArray, sql } from 'drizzle-orm'
33
import { type NextRequest, NextResponse } from 'next/server'
44
import { z } from 'zod'
55
import { getSession } from '@/lib/auth'
@@ -11,20 +11,26 @@ import { checkDocumentAccess, generateEmbeddings } from '../../../../utils'
1111

1212
const logger = createLogger('DocumentChunksAPI')
1313

14-
// Schema for query parameters
1514
const GetChunksQuerySchema = z.object({
1615
search: z.string().optional(),
1716
enabled: z.enum(['true', 'false', 'all']).optional().default('all'),
1817
limit: z.coerce.number().min(1).max(100).optional().default(50),
1918
offset: z.coerce.number().min(0).optional().default(0),
2019
})
2120

22-
// Schema for creating manual chunks
2321
const CreateChunkSchema = z.object({
2422
content: z.string().min(1, 'Content is required').max(10000, 'Content too long'),
2523
enabled: z.boolean().optional().default(true),
2624
})
2725

26+
const BatchOperationSchema = z.object({
27+
operation: z.enum(['enable', 'disable', 'delete']),
28+
chunkIds: z
29+
.array(z.string())
30+
.min(1, 'At least one chunk ID is required')
31+
.max(100, 'Cannot operate on more than 100 chunks at once'),
32+
})
33+
2834
export async function GET(
2935
req: NextRequest,
3036
{ params }: { params: Promise<{ id: string; documentId: string }> }
@@ -112,10 +118,7 @@ export async function GET(
112118
enabled: embedding.enabled,
113119
startOffset: embedding.startOffset,
114120
endOffset: embedding.endOffset,
115-
overlapTokens: embedding.overlapTokens,
116121
metadata: embedding.metadata,
117-
searchRank: embedding.searchRank,
118-
qualityScore: embedding.qualityScore,
119122
createdAt: embedding.createdAt,
120123
updatedAt: embedding.updatedAt,
121124
})
@@ -236,12 +239,7 @@ export async function POST(
236239
embeddingModel: 'text-embedding-3-small',
237240
startOffset: 0, // Manual chunks don't have document offsets
238241
endOffset: validatedData.content.length,
239-
overlapTokens: 0,
240242
metadata: { manual: true }, // Mark as manually created
241-
searchRank: '1.0',
242-
accessCount: 0,
243-
lastAccessedAt: null,
244-
qualityScore: null,
245243
enabled: validatedData.enabled,
246244
createdAt: now,
247245
updatedAt: now,
@@ -286,3 +284,144 @@ export async function POST(
286284
return NextResponse.json({ error: 'Failed to create chunk' }, { status: 500 })
287285
}
288286
}
287+
288+
export async function PATCH(
289+
req: NextRequest,
290+
{ params }: { params: Promise<{ id: string; documentId: string }> }
291+
) {
292+
const requestId = crypto.randomUUID().slice(0, 8)
293+
const { id: knowledgeBaseId, documentId } = await params
294+
295+
try {
296+
const session = await getSession()
297+
if (!session?.user?.id) {
298+
logger.warn(`[${requestId}] Unauthorized batch chunk operation attempt`)
299+
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
300+
}
301+
302+
const accessCheck = await checkDocumentAccess(knowledgeBaseId, documentId, session.user.id)
303+
304+
if (!accessCheck.hasAccess) {
305+
if (accessCheck.notFound) {
306+
logger.warn(
307+
`[${requestId}] ${accessCheck.reason}: KB=${knowledgeBaseId}, Doc=${documentId}`
308+
)
309+
return NextResponse.json({ error: accessCheck.reason }, { status: 404 })
310+
}
311+
logger.warn(
312+
`[${requestId}] User ${session.user.id} attempted unauthorized batch chunk operation: ${accessCheck.reason}`
313+
)
314+
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
315+
}
316+
317+
const body = await req.json()
318+
319+
try {
320+
const validatedData = BatchOperationSchema.parse(body)
321+
const { operation, chunkIds } = validatedData
322+
323+
logger.info(
324+
`[${requestId}] Starting batch ${operation} operation on ${chunkIds.length} chunks for document ${documentId}`
325+
)
326+
327+
const results = []
328+
let successCount = 0
329+
const errorCount = 0
330+
331+
if (operation === 'delete') {
332+
// Handle batch delete with transaction for consistency
333+
await db.transaction(async (tx) => {
334+
// Get chunks to delete for statistics update
335+
const chunksToDelete = await tx
336+
.select({
337+
id: embedding.id,
338+
tokenCount: embedding.tokenCount,
339+
contentLength: embedding.contentLength,
340+
})
341+
.from(embedding)
342+
.where(and(eq(embedding.documentId, documentId), inArray(embedding.id, chunkIds)))
343+
344+
if (chunksToDelete.length === 0) {
345+
throw new Error('No valid chunks found to delete')
346+
}
347+
348+
// Delete chunks
349+
await tx
350+
.delete(embedding)
351+
.where(and(eq(embedding.documentId, documentId), inArray(embedding.id, chunkIds)))
352+
353+
// Update document statistics
354+
const totalTokens = chunksToDelete.reduce((sum, chunk) => sum + chunk.tokenCount, 0)
355+
const totalCharacters = chunksToDelete.reduce(
356+
(sum, chunk) => sum + chunk.contentLength,
357+
0
358+
)
359+
360+
await tx
361+
.update(document)
362+
.set({
363+
chunkCount: sql`${document.chunkCount} - ${chunksToDelete.length}`,
364+
tokenCount: sql`${document.tokenCount} - ${totalTokens}`,
365+
characterCount: sql`${document.characterCount} - ${totalCharacters}`,
366+
})
367+
.where(eq(document.id, documentId))
368+
369+
successCount = chunksToDelete.length
370+
results.push({
371+
operation: 'delete',
372+
deletedCount: chunksToDelete.length,
373+
chunkIds: chunksToDelete.map((c) => c.id),
374+
})
375+
})
376+
} else {
377+
// Handle batch enable/disable
378+
const enabled = operation === 'enable'
379+
380+
// Update chunks in a single query
381+
const updateResult = await db
382+
.update(embedding)
383+
.set({
384+
enabled,
385+
updatedAt: new Date(),
386+
})
387+
.where(and(eq(embedding.documentId, documentId), inArray(embedding.id, chunkIds)))
388+
.returning({ id: embedding.id })
389+
390+
successCount = updateResult.length
391+
results.push({
392+
operation,
393+
updatedCount: updateResult.length,
394+
chunkIds: updateResult.map((r) => r.id),
395+
})
396+
}
397+
398+
logger.info(
399+
`[${requestId}] Batch ${operation} operation completed: ${successCount} successful, ${errorCount} errors`
400+
)
401+
402+
return NextResponse.json({
403+
success: true,
404+
data: {
405+
operation,
406+
successCount,
407+
errorCount,
408+
results,
409+
},
410+
})
411+
} catch (validationError) {
412+
if (validationError instanceof z.ZodError) {
413+
logger.warn(`[${requestId}] Invalid batch operation data`, {
414+
errors: validationError.errors,
415+
})
416+
return NextResponse.json(
417+
{ error: 'Invalid request data', details: validationError.errors },
418+
{ status: 400 }
419+
)
420+
}
421+
throw validationError
422+
}
423+
} catch (error) {
424+
logger.error(`[${requestId}] Error in batch chunk operation`, error)
425+
return NextResponse.json({ error: 'Failed to perform batch operation' }, { status: 500 })
426+
}
427+
}

0 commit comments

Comments
 (0)