Skip to content

Commit e557499

Browse files
authored
feat(parsers): text and markdown parsers (#473)
* feat: text and markdown parsers * fix: don't readfile on buffer, convert buffer to string instead
1 parent 3902e64 commit e557499

File tree

4 files changed

+172
-1
lines changed

4 files changed

+172
-1
lines changed

apps/sim/lib/file-parsers/index.test.ts

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,31 @@ const mockDocxParseFile = vi.fn().mockResolvedValue({
3737
},
3838
})
3939

40+
// Stand-in for the TXT parser's parseFile: every call resolves to this fixed result.
const mockTxtParseFile = vi.fn().mockResolvedValue({
  content: 'Parsed TXT content',
  metadata: { characterCount: 100, tokenCount: 10 },
})
47+
48+
// Stand-in for the MD parser's parseFile: every call resolves to this fixed result.
const mockMdParseFile = vi.fn().mockResolvedValue({
  content: 'Parsed MD content',
  metadata: { characterCount: 100, tokenCount: 10 },
})
55+
4056
// Create mock module implementation
4157
const createMockModule = () => {
4258
// Create mock parsers
4359
const mockParsers: Record<string, FileParser> = {
4460
pdf: { parseFile: mockPdfParseFile },
4561
csv: { parseFile: mockCsvParseFile },
4662
docx: { parseFile: mockDocxParseFile },
63+
txt: { parseFile: mockTxtParseFile },
64+
md: { parseFile: mockMdParseFile },
4765
}
4866

4967
// Create the mock module implementation
@@ -122,6 +140,18 @@ describe('File Parsers', () => {
122140
})),
123141
}))
124142

143+
vi.doMock('./txt-parser', () => ({
144+
TxtParser: vi.fn().mockImplementation(() => ({
145+
parseFile: mockTxtParseFile,
146+
})),
147+
}))
148+
149+
vi.doMock('./md-parser', () => ({
150+
MdParser: vi.fn().mockImplementation(() => ({
151+
parseFile: mockMdParseFile,
152+
})),
153+
}))
154+
125155
// Silence console output during tests
126156
global.console = {
127157
...console,
@@ -211,6 +241,40 @@ describe('File Parsers', () => {
211241
expect(result).toEqual(expectedResult)
212242
})
213243

244+
it('should parse TXT files successfully', async () => {
  // Make the mocked existsSync report the file as present.
  mockExistsSync.mockReturnValue(true)

  // Queue the value the mocked TXT parser resolves with on its next call.
  const txtResult = {
    content: 'Parsed TXT content',
    metadata: { characterCount: 100, tokenCount: 10 },
  }
  mockTxtParseFile.mockResolvedValueOnce(txtResult)

  const parsers = await import('./index')
  const parsed = await parsers.parseFile('/test/files/document.txt')

  expect(parsed).toEqual(txtResult)
})
261+
262+
it('should parse MD files successfully', async () => {
  const expectedResult = {
    content: 'Parsed MD content',
    metadata: {
      characterCount: 100,
      tokenCount: 10,
    },
  }

  // Queue the value the mocked MD parser resolves with and report the file as present.
  mockMdParseFile.mockResolvedValueOnce(expectedResult)
  mockExistsSync.mockReturnValue(true)

  const { parseFile } = await import('./index')
  const result = await parseFile('/test/files/document.md')

  // Without this assertion the test passes regardless of what parseFile returns.
  expect(result).toEqual(expectedResult)
})
277+
214278
it('should throw error for unsupported file types', async () => {
215279
// Make sure the file "exists" for this test
216280
mockExistsSync.mockReturnValue(true)
@@ -240,13 +304,14 @@ describe('File Parsers', () => {
240304
expect(isSupportedFileType('pdf')).toBe(true)
241305
expect(isSupportedFileType('csv')).toBe(true)
242306
expect(isSupportedFileType('docx')).toBe(true)
307+
expect(isSupportedFileType('txt')).toBe(true)
308+
expect(isSupportedFileType('md')).toBe(true)
243309
})
244310

245311
it('should return false for unsupported file types', async () => {
246312
const { isSupportedFileType } = await import('./index')
247313

248314
expect(isSupportedFileType('png')).toBe(false)
249-
expect(isSupportedFileType('txt')).toBe(false)
250315
expect(isSupportedFileType('unknown')).toBe(false)
251316
})
252317

@@ -255,6 +320,8 @@ describe('File Parsers', () => {
255320

256321
expect(isSupportedFileType('PDF')).toBe(true)
257322
expect(isSupportedFileType('CSV')).toBe(true)
323+
expect(isSupportedFileType('TXT')).toBe(true)
324+
expect(isSupportedFileType('MD')).toBe(true)
258325
})
259326

260327
it('should handle errors gracefully', async () => {

apps/sim/lib/file-parsers/index.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,20 @@ function getParserInstances(): Record<string, FileParser> {
7575
} catch (error) {
7676
logger.error('Failed to load DOCX parser:', error)
7777
}
78+
79+
try {
80+
const { TxtParser } = require('./txt-parser')
81+
parserInstances.txt = new TxtParser()
82+
} catch (error) {
83+
logger.error('Failed to load TXT parser:', error)
84+
}
85+
86+
try {
87+
const { MdParser } = require('./md-parser')
88+
parserInstances.md = new MdParser()
89+
} catch (error) {
90+
logger.error('Failed to load MD parser:', error)
91+
}
7892
} catch (error) {
7993
logger.error('Error loading file parsers:', error)
8094
}
apps/sim/lib/file-parsers/md-parser.ts

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import { readFile } from 'fs/promises'
2+
import { createLogger } from '@/lib/logs/console-logger'
3+
import type { FileParseResult, FileParser } from './types'
4+
5+
const logger = createLogger('MdParser')
6+
7+
/**
 * Parser for Markdown (`.md`) files.
 *
 * The bytes are decoded as UTF-8 and returned verbatim; Markdown syntax is not
 * interpreted. Metadata carries the character count and a token estimate.
 */
export class MdParser implements FileParser {
  /**
   * Reads the file at `filePath` and hands its bytes to {@link MdParser.parseBuffer}.
   *
   * @param filePath - Path of the file to read.
   * @returns The decoded text and its metadata.
   * @throws Error prefixed with "Failed to parse MD file" when `filePath` is empty
   *   or the file cannot be read.
   */
  async parseFile(filePath: string): Promise<FileParseResult> {
    try {
      // Validate input
      if (!filePath) {
        throw new Error('No file path provided')
      }

      // Read the file
      const buffer = await readFile(filePath)

      // Not awaited: a parseBuffer rejection bypasses the catch below and keeps
      // its own "Failed to parse MD buffer" message rather than being re-wrapped.
      return this.parseBuffer(buffer)
    } catch (error) {
      logger.error('MD file error:', error)
      throw new Error(`Failed to parse MD file: ${MdParser.describeError(error)}`)
    }
  }

  /**
   * Decodes `buffer` as UTF-8 text and computes its metadata.
   *
   * @param buffer - Raw file contents.
   * @returns The decoded text, its length in UTF-16 code units (`characterCount`)
   *   and a whole-number token estimate (`tokenCount`).
   * @throws Error prefixed with "Failed to parse MD buffer" when decoding fails.
   */
  async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
    try {
      logger.info('Parsing buffer, size:', buffer.length)

      // Extract content
      const result = buffer.toString('utf-8')

      return {
        content: result,
        metadata: {
          characterCount: result.length,
          // One token per four characters, rounded up so the count is always an
          // integer (plain division produced values such as 1.25).
          tokenCount: Math.ceil(result.length / 4),
        },
      }
    } catch (error) {
      logger.error('MD buffer parsing error:', error)
      throw new Error(`Failed to parse MD buffer: ${MdParser.describeError(error)}`)
    }
  }

  /** Extracts a printable message from a caught value, which may not be an Error. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error)
  }
}
apps/sim/lib/file-parsers/txt-parser.ts

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import { readFile } from 'fs/promises'
2+
import { createLogger } from '@/lib/logs/console-logger'
3+
import type { FileParseResult, FileParser } from './types'
4+
5+
const logger = createLogger('TxtParser')
6+
7+
/**
 * Parser for plain-text (`.txt`) files.
 *
 * The bytes are decoded as UTF-8 and returned verbatim. Metadata carries the
 * character count and a token estimate.
 */
export class TxtParser implements FileParser {
  /**
   * Reads the file at `filePath` and hands its bytes to {@link TxtParser.parseBuffer}.
   *
   * @param filePath - Path of the file to read.
   * @returns The decoded text and its metadata.
   * @throws Error prefixed with "Failed to parse TXT file" when `filePath` is empty
   *   or the file cannot be read.
   */
  async parseFile(filePath: string): Promise<FileParseResult> {
    try {
      // Validate input
      if (!filePath) {
        throw new Error('No file path provided')
      }

      // Read the file
      const buffer = await readFile(filePath)

      // Not awaited: a parseBuffer rejection bypasses the catch below and keeps
      // its own "Failed to parse TXT buffer" message rather than being re-wrapped.
      return this.parseBuffer(buffer)
    } catch (error) {
      logger.error('TXT file error:', error)
      throw new Error(`Failed to parse TXT file: ${TxtParser.describeError(error)}`)
    }
  }

  /**
   * Decodes `buffer` as UTF-8 text and computes its metadata.
   *
   * @param buffer - Raw file contents.
   * @returns The decoded text, its length in UTF-16 code units (`characterCount`)
   *   and a whole-number token estimate (`tokenCount`).
   * @throws Error prefixed with "Failed to parse TXT buffer" when decoding fails.
   */
  async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
    try {
      logger.info('Parsing buffer, size:', buffer.length)

      // Extract content
      const result = buffer.toString('utf-8')

      return {
        content: result,
        metadata: {
          characterCount: result.length,
          // One token per four characters, rounded up so the count is always an
          // integer (plain division produced values such as 1.25).
          tokenCount: Math.ceil(result.length / 4),
        },
      }
    } catch (error) {
      logger.error('TXT buffer parsing error:', error)
      throw new Error(`Failed to parse TXT buffer: ${TxtParser.describeError(error)}`)
    }
  }

  /** Extracts a printable message from a caught value, which may not be an Error. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error)
  }
}

0 commit comments

Comments
 (0)