Skip to content

Commit 76c0c56

Browse files
committed
Initial lint
1 parent 850447a commit 76c0c56

File tree

3 files changed

+80
-70
lines changed

3 files changed

+80
-70
lines changed

apps/sim/lib/documents/docs-chunker.ts

Lines changed: 49 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import fs from 'fs/promises'
22
import path from 'path'
33
import { createLogger } from '@/lib/logs/console-logger'
4-
import { TextChunker } from './chunker'
54
import { generateEmbeddings } from '@/app/api/knowledge/utils'
5+
import { TextChunker } from './chunker'
66
import type { DocChunk, DocsChunkerOptions, HeaderInfo } from './types'
77

88
interface Frontmatter {
@@ -35,7 +35,7 @@ export class DocsChunker {
3535
*/
3636
async chunkAllDocs(docsPath: string): Promise<DocChunk[]> {
3737
const allChunks: DocChunk[] = []
38-
38+
3939
try {
4040
const mdxFiles = await this.findMdxFiles(docsPath)
4141
logger.info(`Found ${mdxFiles.length} .mdx files to process`)
@@ -64,36 +64,36 @@ export class DocsChunker {
6464
async chunkMdxFile(filePath: string, basePath: string): Promise<DocChunk[]> {
6565
const content = await fs.readFile(filePath, 'utf-8')
6666
const relativePath = path.relative(basePath, filePath)
67-
67+
6868
// Parse frontmatter and content
6969
const { data: frontmatter, content: markdownContent } = this.parseFrontmatter(content)
70-
70+
7171
// Extract headers from the content
7272
const headers = this.extractHeaders(markdownContent)
73-
73+
7474
// Generate document URL
7575
const documentUrl = this.generateDocumentUrl(relativePath)
76-
76+
7777
// Split content into chunks
7878
const textChunks = await this.splitContent(markdownContent)
79-
79+
8080
// Generate embeddings for all chunks at once (batch processing)
8181
logger.info(`Generating embeddings for ${textChunks.length} chunks in ${relativePath}`)
8282
const embeddings = textChunks.length > 0 ? await generateEmbeddings(textChunks) : []
8383
const embeddingModel = 'text-embedding-3-small'
84-
84+
8585
// Convert to DocChunk objects with header context and embeddings
8686
const chunks: DocChunk[] = []
8787
let currentPosition = 0
88-
88+
8989
for (let i = 0; i < textChunks.length; i++) {
9090
const chunkText = textChunks[i]
9191
const chunkStart = currentPosition
9292
const chunkEnd = currentPosition + chunkText.length
93-
93+
9494
// Find the most relevant header for this chunk
9595
const relevantHeader = this.findRelevantHeader(headers, chunkStart)
96-
96+
9797
const chunk: DocChunk = {
9898
text: chunkText,
9999
tokenCount: Math.ceil(chunkText.length / 4), // Simple token estimation
@@ -111,11 +111,11 @@ export class DocsChunker {
111111
documentDescription: frontmatter.description,
112112
},
113113
}
114-
114+
115115
chunks.push(chunk)
116116
currentPosition = chunkEnd
117117
}
118-
118+
119119
return chunks
120120
}
121121

@@ -124,20 +124,20 @@ export class DocsChunker {
124124
*/
125125
private async findMdxFiles(dirPath: string): Promise<string[]> {
126126
const files: string[] = []
127-
127+
128128
const entries = await fs.readdir(dirPath, { withFileTypes: true })
129-
129+
130130
for (const entry of entries) {
131131
const fullPath = path.join(dirPath, entry.name)
132-
132+
133133
if (entry.isDirectory()) {
134134
const subFiles = await this.findMdxFiles(fullPath)
135135
files.push(...subFiles)
136136
} else if (entry.isFile() && entry.name.endsWith('.mdx')) {
137137
files.push(fullPath)
138138
}
139139
}
140-
140+
141141
return files
142142
}
143143

@@ -148,20 +148,20 @@ export class DocsChunker {
148148
const headers: HeaderInfo[] = []
149149
const headerRegex = /^(#{1,6})\s+(.+)$/gm
150150
let match
151-
151+
152152
while ((match = headerRegex.exec(content)) !== null) {
153153
const level = match[1].length
154154
const text = match[2].trim()
155155
const anchor = this.generateAnchor(text)
156-
156+
157157
headers.push({
158158
text,
159159
level,
160160
anchor,
161161
position: match.index,
162162
})
163163
}
164-
164+
165165
return headers
166166
}
167167

@@ -183,10 +183,8 @@ export class DocsChunker {
183183
private generateDocumentUrl(relativePath: string): string {
184184
// Convert file path to URL path
185185
// e.g., "tools/knowledge.mdx" -> "/tools/knowledge"
186-
const urlPath = relativePath
187-
.replace(/\.mdx$/, '')
188-
.replace(/\\/g, '/') // Handle Windows paths
189-
186+
const urlPath = relativePath.replace(/\.mdx$/, '').replace(/\\/g, '/') // Handle Windows paths
187+
190188
return `${this.baseUrl}/${urlPath}`
191189
}
192190

@@ -195,18 +193,18 @@ export class DocsChunker {
195193
*/
196194
private findRelevantHeader(headers: HeaderInfo[], position: number): HeaderInfo | null {
197195
if (headers.length === 0) return null
198-
196+
199197
// Find the last header that comes before this position
200198
let relevantHeader: HeaderInfo | null = null
201-
199+
202200
for (const header of headers) {
203201
if (header.position <= position) {
204202
relevantHeader = header
205203
} else {
206204
break
207205
}
208206
}
209-
207+
210208
return relevantHeader
211209
}
212210

@@ -216,57 +214,59 @@ export class DocsChunker {
216214
private async splitContent(content: string): Promise<string[]> {
217215
// Clean the content first
218216
const cleanedContent = this.cleanContent(content)
219-
217+
220218
// Use the existing TextChunker
221219
const chunks = await this.textChunker.chunk(cleanedContent)
222-
223-
return chunks.map(chunk => chunk.text)
220+
221+
return chunks.map((chunk) => chunk.text)
224222
}
225223

226224
/**
227225
* Clean content by removing MDX-specific elements and excessive whitespace
228226
*/
229227
private cleanContent(content: string): string {
230-
return content
231-
// Remove import statements
232-
.replace(/^import\s+.*$/gm, '')
233-
// Remove JSX components and React-style comments
234-
.replace(/<[^>]+>/g, ' ')
235-
.replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
236-
// Remove excessive whitespace
237-
.replace(/\n{3,}/g, '\n\n')
238-
.replace(/[ \t]{2,}/g, ' ')
239-
.trim()
228+
return (
229+
content
230+
// Remove import statements
231+
.replace(/^import\s+.*$/gm, '')
232+
// Remove JSX components and React-style comments
233+
.replace(/<[^>]+>/g, ' ')
234+
.replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
235+
// Remove excessive whitespace
236+
.replace(/\n{3,}/g, '\n\n')
237+
.replace(/[ \t]{2,}/g, ' ')
238+
.trim()
239+
)
240240
}
241241

242-
243-
244242
/**
245243
* Parse frontmatter from MDX content
246244
*/
247245
private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
248246
const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
249247
const match = content.match(frontmatterRegex)
250-
248+
251249
if (!match) {
252250
return { data: {}, content }
253251
}
254-
252+
255253
const [, frontmatterText, markdownContent] = match
256254
const data: Frontmatter = {}
257-
255+
258256
// Simple YAML parsing for title and description
259257
const lines = frontmatterText.split('\n')
260258
for (const line of lines) {
261259
const colonIndex = line.indexOf(':')
262260
if (colonIndex > 0) {
263261
const key = line.slice(0, colonIndex).trim()
264-
const value = line.slice(colonIndex + 1).trim().replace(/^['"]|['"]$/g, '')
262+
const value = line
263+
.slice(colonIndex + 1)
264+
.trim()
265+
.replace(/^['"]|['"]$/g, '')
265266
data[key] = value
266267
}
267268
}
268-
269+
269270
return { data, content: markdownContent }
270271
}
271-
272-
}
272+
}

apps/sim/lib/documents/types.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,4 @@ export interface HeaderInfo {
5050
anchor: string
5151
/** Position in document */
5252
position: number
53-
}
53+
}

apps/sim/scripts/chunk-docs.ts

Lines changed: 30 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -21,23 +21,26 @@ async function main() {
2121

2222
// Path to the docs content directory
2323
const docsPath = path.join(process.cwd(), '../../apps/docs/content/docs')
24-
24+
2525
logger.info(`Processing docs from: ${docsPath}`)
2626

2727
// Process all .mdx files
2828
const chunks = await chunker.chunkAllDocs(docsPath)
2929

3030
logger.info(`\n=== CHUNKING RESULTS ===`)
3131
logger.info(`Total chunks: ${chunks.length}`)
32-
32+
3333
// Group chunks by document
34-
const chunksByDoc = chunks.reduce((acc, chunk) => {
35-
if (!acc[chunk.sourceDocument]) {
36-
acc[chunk.sourceDocument] = []
37-
}
38-
acc[chunk.sourceDocument].push(chunk)
39-
return acc
40-
}, {} as Record<string, typeof chunks>)
34+
const chunksByDoc = chunks.reduce(
35+
(acc, chunk) => {
36+
if (!acc[chunk.sourceDocument]) {
37+
acc[chunk.sourceDocument] = []
38+
}
39+
acc[chunk.sourceDocument].push(chunk)
40+
return acc
41+
},
42+
{} as Record<string, typeof chunks>
43+
)
4144

4245
// Display summary
4346
logger.info(`\n=== DOCUMENT SUMMARY ===`)
@@ -54,14 +57,19 @@ async function main() {
5457
logger.info(` Link: ${chunk.headerLink}`)
5558
logger.info(` Tokens: ${chunk.tokenCount}`)
5659
logger.info(` Embedding: ${chunk.embedding.length} dimensions (${chunk.embeddingModel})`)
57-
logger.info(` Embedding Preview: [${chunk.embedding.slice(0, 5).map(n => n.toFixed(4)).join(', ')}...]`)
60+
logger.info(
61+
` Embedding Preview: [${chunk.embedding
62+
.slice(0, 5)
63+
.map((n) => n.toFixed(4))
64+
.join(', ')}...]`
65+
)
5866
logger.info(` Text Preview: ${chunk.text.slice(0, 100)}...`)
5967
})
6068

6169
// Calculate total token count
6270
const totalTokens = chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0)
63-
const chunksWithEmbeddings = chunks.filter(chunk => chunk.embedding.length > 0).length
64-
71+
const chunksWithEmbeddings = chunks.filter((chunk) => chunk.embedding.length > 0).length
72+
6573
logger.info(`\n=== STATISTICS ===`)
6674
logger.info(`Total tokens: ${totalTokens}`)
6775
logger.info(`Average tokens per chunk: ${Math.round(totalTokens / chunks.length)}`)
@@ -70,24 +78,26 @@ async function main() {
7078
logger.info(`Embedding model: ${chunks[0].embeddingModel}`)
7179
logger.info(`Embedding dimensions: ${chunks[0].embedding.length}`)
7280
}
73-
74-
const headerLevels = chunks.reduce((acc, chunk) => {
75-
acc[chunk.headerLevel] = (acc[chunk.headerLevel] || 0) + 1
76-
return acc
77-
}, {} as Record<number, number>)
78-
81+
82+
const headerLevels = chunks.reduce(
83+
(acc, chunk) => {
84+
acc[chunk.headerLevel] = (acc[chunk.headerLevel] || 0) + 1
85+
return acc
86+
},
87+
{} as Record<number, number>
88+
)
89+
7990
logger.info(`Header level distribution:`)
8091
Object.entries(headerLevels)
8192
.sort(([a], [b]) => Number(a) - Number(b))
8293
.forEach(([level, count]) => {
8394
logger.info(` H${level}: ${count} chunks`)
8495
})
85-
8696
} catch (error) {
8797
logger.error('Error processing docs:', error)
8898
process.exit(1)
8999
}
90100
}
91101

92102
// Run the script
93-
main().catch(console.error)
103+
main().catch(console.error)

0 commit comments

Comments
 (0)