Skip to content

Commit 850447a

Browse files
committed
Initial commit
1 parent 0f21fbf commit 850447a

File tree

3 files changed

+418
-0
lines changed

3 files changed

+418
-0
lines changed
Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
import fs from 'fs/promises'
2+
import path from 'path'
3+
import { createLogger } from '@/lib/logs/console-logger'
4+
import { TextChunker } from './chunker'
5+
import { generateEmbeddings } from '@/app/api/knowledge/utils'
6+
import type { DocChunk, DocsChunkerOptions, HeaderInfo } from './types'
7+
8+
interface Frontmatter {
9+
title?: string
10+
description?: string
11+
[key: string]: any
12+
}
13+
14+
const logger = createLogger('DocsChunker')
15+
16+
/**
17+
* Docs-specific chunker that processes .mdx files and tracks header context
18+
*/
19+
export class DocsChunker {
20+
private readonly textChunker: TextChunker
21+
private readonly baseUrl: string
22+
23+
constructor(options: DocsChunkerOptions = {}) {
24+
// Use the existing TextChunker for chunking logic
25+
this.textChunker = new TextChunker({
26+
chunkSize: options.chunkSize ?? 1024,
27+
minChunkSize: options.minChunkSize ?? 100,
28+
overlap: options.overlap ?? 200,
29+
})
30+
this.baseUrl = options.baseUrl ?? 'https://docs.simstudio.ai'
31+
}
32+
33+
/**
34+
* Process all .mdx files in the docs directory
35+
*/
36+
async chunkAllDocs(docsPath: string): Promise<DocChunk[]> {
37+
const allChunks: DocChunk[] = []
38+
39+
try {
40+
const mdxFiles = await this.findMdxFiles(docsPath)
41+
logger.info(`Found ${mdxFiles.length} .mdx files to process`)
42+
43+
for (const filePath of mdxFiles) {
44+
try {
45+
const chunks = await this.chunkMdxFile(filePath, docsPath)
46+
allChunks.push(...chunks)
47+
logger.info(`Processed ${filePath}: ${chunks.length} chunks`)
48+
} catch (error) {
49+
logger.error(`Error processing ${filePath}:`, error)
50+
}
51+
}
52+
53+
logger.info(`Total chunks generated: ${allChunks.length}`)
54+
return allChunks
55+
} catch (error) {
56+
logger.error('Error processing docs:', error)
57+
throw error
58+
}
59+
}
60+
61+
/**
62+
* Process a single .mdx file
63+
*/
64+
async chunkMdxFile(filePath: string, basePath: string): Promise<DocChunk[]> {
65+
const content = await fs.readFile(filePath, 'utf-8')
66+
const relativePath = path.relative(basePath, filePath)
67+
68+
// Parse frontmatter and content
69+
const { data: frontmatter, content: markdownContent } = this.parseFrontmatter(content)
70+
71+
// Extract headers from the content
72+
const headers = this.extractHeaders(markdownContent)
73+
74+
// Generate document URL
75+
const documentUrl = this.generateDocumentUrl(relativePath)
76+
77+
// Split content into chunks
78+
const textChunks = await this.splitContent(markdownContent)
79+
80+
// Generate embeddings for all chunks at once (batch processing)
81+
logger.info(`Generating embeddings for ${textChunks.length} chunks in ${relativePath}`)
82+
const embeddings = textChunks.length > 0 ? await generateEmbeddings(textChunks) : []
83+
const embeddingModel = 'text-embedding-3-small'
84+
85+
// Convert to DocChunk objects with header context and embeddings
86+
const chunks: DocChunk[] = []
87+
let currentPosition = 0
88+
89+
for (let i = 0; i < textChunks.length; i++) {
90+
const chunkText = textChunks[i]
91+
const chunkStart = currentPosition
92+
const chunkEnd = currentPosition + chunkText.length
93+
94+
// Find the most relevant header for this chunk
95+
const relevantHeader = this.findRelevantHeader(headers, chunkStart)
96+
97+
const chunk: DocChunk = {
98+
text: chunkText,
99+
tokenCount: Math.ceil(chunkText.length / 4), // Simple token estimation
100+
sourceDocument: relativePath,
101+
headerLink: relevantHeader ? `${documentUrl}#${relevantHeader.anchor}` : documentUrl,
102+
headerText: relevantHeader?.text || frontmatter.title || 'Document Root',
103+
headerLevel: relevantHeader?.level || 1,
104+
embedding: embeddings[i] || [],
105+
embeddingModel,
106+
metadata: {
107+
startIndex: chunkStart,
108+
endIndex: chunkEnd,
109+
hasFrontmatter: i === 0 && content.startsWith('---'),
110+
documentTitle: frontmatter.title,
111+
documentDescription: frontmatter.description,
112+
},
113+
}
114+
115+
chunks.push(chunk)
116+
currentPosition = chunkEnd
117+
}
118+
119+
return chunks
120+
}
121+
122+
/**
123+
* Find all .mdx files recursively
124+
*/
125+
private async findMdxFiles(dirPath: string): Promise<string[]> {
126+
const files: string[] = []
127+
128+
const entries = await fs.readdir(dirPath, { withFileTypes: true })
129+
130+
for (const entry of entries) {
131+
const fullPath = path.join(dirPath, entry.name)
132+
133+
if (entry.isDirectory()) {
134+
const subFiles = await this.findMdxFiles(fullPath)
135+
files.push(...subFiles)
136+
} else if (entry.isFile() && entry.name.endsWith('.mdx')) {
137+
files.push(fullPath)
138+
}
139+
}
140+
141+
return files
142+
}
143+
144+
/**
145+
* Extract headers and their positions from markdown content
146+
*/
147+
private extractHeaders(content: string): HeaderInfo[] {
148+
const headers: HeaderInfo[] = []
149+
const headerRegex = /^(#{1,6})\s+(.+)$/gm
150+
let match
151+
152+
while ((match = headerRegex.exec(content)) !== null) {
153+
const level = match[1].length
154+
const text = match[2].trim()
155+
const anchor = this.generateAnchor(text)
156+
157+
headers.push({
158+
text,
159+
level,
160+
anchor,
161+
position: match.index,
162+
})
163+
}
164+
165+
return headers
166+
}
167+
168+
/**
169+
* Generate URL-safe anchor from header text
170+
*/
171+
private generateAnchor(headerText: string): string {
172+
return headerText
173+
.toLowerCase()
174+
.replace(/[^\w\s-]/g, '') // Remove special characters except hyphens
175+
.replace(/\s+/g, '-') // Replace spaces with hyphens
176+
.replace(/-+/g, '-') // Replace multiple hyphens with single
177+
.replace(/^-|-$/g, '') // Remove leading/trailing hyphens
178+
}
179+
180+
/**
181+
* Generate document URL from relative path
182+
*/
183+
private generateDocumentUrl(relativePath: string): string {
184+
// Convert file path to URL path
185+
// e.g., "tools/knowledge.mdx" -> "/tools/knowledge"
186+
const urlPath = relativePath
187+
.replace(/\.mdx$/, '')
188+
.replace(/\\/g, '/') // Handle Windows paths
189+
190+
return `${this.baseUrl}/${urlPath}`
191+
}
192+
193+
/**
194+
* Find the most relevant header for a given position
195+
*/
196+
private findRelevantHeader(headers: HeaderInfo[], position: number): HeaderInfo | null {
197+
if (headers.length === 0) return null
198+
199+
// Find the last header that comes before this position
200+
let relevantHeader: HeaderInfo | null = null
201+
202+
for (const header of headers) {
203+
if (header.position <= position) {
204+
relevantHeader = header
205+
} else {
206+
break
207+
}
208+
}
209+
210+
return relevantHeader
211+
}
212+
213+
/**
214+
* Split content into chunks using the existing TextChunker
215+
*/
216+
private async splitContent(content: string): Promise<string[]> {
217+
// Clean the content first
218+
const cleanedContent = this.cleanContent(content)
219+
220+
// Use the existing TextChunker
221+
const chunks = await this.textChunker.chunk(cleanedContent)
222+
223+
return chunks.map(chunk => chunk.text)
224+
}
225+
226+
/**
227+
* Clean content by removing MDX-specific elements and excessive whitespace
228+
*/
229+
private cleanContent(content: string): string {
230+
return content
231+
// Remove import statements
232+
.replace(/^import\s+.*$/gm, '')
233+
// Remove JSX components and React-style comments
234+
.replace(/<[^>]+>/g, ' ')
235+
.replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
236+
// Remove excessive whitespace
237+
.replace(/\n{3,}/g, '\n\n')
238+
.replace(/[ \t]{2,}/g, ' ')
239+
.trim()
240+
}
241+
242+
243+
244+
/**
245+
* Parse frontmatter from MDX content
246+
*/
247+
private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
248+
const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
249+
const match = content.match(frontmatterRegex)
250+
251+
if (!match) {
252+
return { data: {}, content }
253+
}
254+
255+
const [, frontmatterText, markdownContent] = match
256+
const data: Frontmatter = {}
257+
258+
// Simple YAML parsing for title and description
259+
const lines = frontmatterText.split('\n')
260+
for (const line of lines) {
261+
const colonIndex = line.indexOf(':')
262+
if (colonIndex > 0) {
263+
const key = line.slice(0, colonIndex).trim()
264+
const value = line.slice(colonIndex + 1).trim().replace(/^['"]|['"]$/g, '')
265+
data[key] = value
266+
}
267+
}
268+
269+
return { data, content: markdownContent }
270+
}
271+
272+
}

apps/sim/lib/documents/types.ts

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/**
 * A single embedded chunk of a docs .mdx file, carrying enough header
 * context to deep-link back into the published documentation site.
 */
export interface DocChunk {
  /** The chunk text content */
  text: string
  /** Token count estimate for the chunk (approximate, not tokenizer-exact) */
  tokenCount: number
  /** Source document path relative to docs/ */
  sourceDocument: string
  /** Link to the most relevant header section */
  headerLink: string
  /** The header text that this chunk belongs to */
  headerText: string
  /** Header level (1-6) */
  headerLevel: number
  /** OpenAI text embedding vector (1536 dimensions for text-embedding-3-small) */
  embedding: number[]
  /** Model used to generate the embedding */
  embeddingModel: string
  /** Metadata about the chunk */
  metadata: {
    /** Start position in the original document */
    startIndex: number
    /** End position in the original document */
    endIndex: number
    /** Whether this chunk contains the document frontmatter */
    hasFrontmatter?: boolean
    /** Document title from frontmatter */
    documentTitle?: string
    /** Document description from frontmatter */
    documentDescription?: string
  }
}
32+
33+
/**
 * Configuration for DocsChunker; every field is optional and falls back to
 * the chunker's built-in defaults.
 */
export interface DocsChunkerOptions {
  /** Target chunk size in tokens */
  chunkSize?: number
  /** Minimum chunk size in tokens */
  minChunkSize?: number
  /** Overlap between chunks in tokens */
  overlap?: number
  /** Base URL for generating links */
  baseUrl?: string
}
43+
44+
/**
 * A markdown header located within a document, used to associate chunks
 * with their nearest preceding section.
 */
export interface HeaderInfo {
  /** Header text */
  text: string
  /** Header level (1-6) */
  level: number
  /** Anchor link */
  anchor: string
  /** Position in document */
  position: number
}

0 commit comments

Comments
 (0)