From b74a6945d312ae71234257d19e34e9a8eb86e012 Mon Sep 17 00:00:00 2001 From: Aditya Tripathi Date: Tue, 10 Jun 2025 19:17:13 +0000 Subject: [PATCH 1/2] feat: text and markdown parsers --- apps/sim/lib/file-parsers/index.test.ts | 71 ++++++++++++++++++++++++- apps/sim/lib/file-parsers/index.ts | 14 +++++ apps/sim/lib/file-parsers/md-parser.ts | 45 ++++++++++++++++ apps/sim/lib/file-parsers/txt-parser.ts | 45 ++++++++++++++++ 4 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 apps/sim/lib/file-parsers/md-parser.ts create mode 100644 apps/sim/lib/file-parsers/txt-parser.ts diff --git a/apps/sim/lib/file-parsers/index.test.ts b/apps/sim/lib/file-parsers/index.test.ts index 535ab10c9b5..98b59e230ba 100644 --- a/apps/sim/lib/file-parsers/index.test.ts +++ b/apps/sim/lib/file-parsers/index.test.ts @@ -37,6 +37,22 @@ const mockDocxParseFile = vi.fn().mockResolvedValue({ }, }) +const mockTxtParseFile = vi.fn().mockResolvedValue({ + content: 'Parsed TXT content', + metadata: { + characterCount: 100, + tokenCount: 10, + }, +}) + +const mockMdParseFile = vi.fn().mockResolvedValue({ + content: 'Parsed MD content', + metadata: { + characterCount: 100, + tokenCount: 10, + }, +}) + // Create mock module implementation const createMockModule = () => { // Create mock parsers @@ -44,6 +60,8 @@ const createMockModule = () => { pdf: { parseFile: mockPdfParseFile }, csv: { parseFile: mockCsvParseFile }, docx: { parseFile: mockDocxParseFile }, + txt: { parseFile: mockTxtParseFile }, + md: { parseFile: mockMdParseFile }, } // Create the mock module implementation @@ -122,6 +140,18 @@ describe('File Parsers', () => { })), })) + vi.doMock('./txt-parser', () => ({ + TxtParser: vi.fn().mockImplementation(() => ({ + parseFile: mockTxtParseFile, + })), + })) + + vi.doMock('./md-parser', () => ({ + MdParser: vi.fn().mockImplementation(() => ({ + parseFile: mockMdParseFile, + })), + })) + // Silence console output during tests global.console = { ...console, @@ -211,6 +241,42 @@ 
describe('File Parsers', () => { expect(result).toEqual(expectedResult) }) + it('should parse TXT files successfully', async () => { + const expectedResult = { + content: 'Parsed TXT content', + metadata: { + characterCount: 100, + tokenCount: 10, + }, + } + + mockTxtParseFile.mockResolvedValueOnce(expectedResult) + mockExistsSync.mockReturnValue(true) + + const { parseFile } = await import('./index') + const result = await parseFile('/test/files/document.txt') + + expect(result).toEqual(expectedResult) + }) + + it('should parse MD files successfully', async () => { + const expectedResult = { + content: 'Parsed MD content', + metadata: { + characterCount: 100, + tokenCount: 10, + }, + } + + mockMdParseFile.mockResolvedValueOnce(expectedResult) + mockExistsSync.mockReturnValue(true) + + const { parseFile } = await import('./index') + const result = await parseFile('/test/files/document.md') + + expect(result).toEqual(expectedResult) + }) + it('should throw error for unsupported file types', async () => { // Make sure the file "exists" for this test mockExistsSync.mockReturnValue(true) @@ -240,13 +306,14 @@ describe('File Parsers', () => { expect(isSupportedFileType('pdf')).toBe(true) expect(isSupportedFileType('csv')).toBe(true) expect(isSupportedFileType('docx')).toBe(true) + expect(isSupportedFileType('txt')).toBe(true) + expect(isSupportedFileType('md')).toBe(true) }) it('should return false for unsupported file types', async () => { const { isSupportedFileType } = await import('./index') expect(isSupportedFileType('png')).toBe(false) - expect(isSupportedFileType('txt')).toBe(false) expect(isSupportedFileType('unknown')).toBe(false) }) @@ -255,6 +322,8 @@ describe('File Parsers', () => { expect(isSupportedFileType('PDF')).toBe(true) expect(isSupportedFileType('CSV')).toBe(true) + expect(isSupportedFileType('TXT')).toBe(true) + expect(isSupportedFileType('MD')).toBe(true) }) it('should handle errors gracefully', async () => { diff --git 
a/apps/sim/lib/file-parsers/index.ts 
b/apps/sim/lib/file-parsers/index.ts index 15ff2e50da2..26cd63f8832 100644 --- a/apps/sim/lib/file-parsers/index.ts +++ b/apps/sim/lib/file-parsers/index.ts @@ -75,6 +75,20 @@ function getParserInstances(): Record { } catch (error) { logger.error('Failed to load DOCX parser:', error) } + + try { + const { TxtParser } = require('./txt-parser') + parserInstances.txt = new TxtParser() + } catch (error) { + logger.error('Failed to load TXT parser:', error) + } + + try { + const { MdParser } = require('./md-parser') + parserInstances.md = new MdParser() + } catch (error) { + logger.error('Failed to load MD parser:', error) + } } catch (error) { logger.error('Error loading file parsers:', error) } diff --git a/apps/sim/lib/file-parsers/md-parser.ts b/apps/sim/lib/file-parsers/md-parser.ts new file mode 100644 index 00000000000..223f3881c58 --- /dev/null +++ b/apps/sim/lib/file-parsers/md-parser.ts @@ -0,0 +1,45 @@ +import { readFile } from 'fs/promises' +import { createLogger } from '@/lib/logs/console-logger' +import type { FileParseResult, FileParser } from './types' + +const logger = createLogger('MdParser') + +export class MdParser implements FileParser { + async parseFile(filePath: string): Promise<FileParseResult> { + try { + // Validate input + if (!filePath) { + throw new Error('No file path provided') + } + + // Read the file + const buffer = await readFile(filePath) + + // Use parseBuffer for consistent implementation + return this.parseBuffer(buffer) + } catch (error) { + logger.error('MD file error:', error) + throw new Error(`Failed to parse MD file: ${(error as Error).message}`) + } + } + + async parseBuffer(buffer: Buffer): Promise<FileParseResult> { + try { + logger.info('Parsing buffer, size:', buffer.length) + + // Extract content + const result = await readFile(buffer, 'utf-8') + + return { + content: result, + metadata: { + characterCount: result.length, + tokenCount: result.length / 4, + }, + } + } catch (error) { + logger.error('MD 
buffer parsing error:', error) + throw new 
Error(`Failed to parse MD buffer: ${(error as Error).message}`) + } + } +} diff --git a/apps/sim/lib/file-parsers/txt-parser.ts b/apps/sim/lib/file-parsers/txt-parser.ts new file mode 100644 index 00000000000..177c6fa9a51 --- /dev/null +++ b/apps/sim/lib/file-parsers/txt-parser.ts @@ -0,0 +1,45 @@ +import { readFile } from 'fs/promises' +import { createLogger } from '@/lib/logs/console-logger' +import type { FileParseResult, FileParser } from './types' + +const logger = createLogger('TxtParser') + +export class TxtParser implements FileParser { + async parseFile(filePath: string): Promise<FileParseResult> { + try { + // Validate input + if (!filePath) { + throw new Error('No file path provided') + } + + // Read the file + const buffer = await readFile(filePath) + + // Use parseBuffer for consistent implementation + return this.parseBuffer(buffer) + } catch (error) { + logger.error('TXT file error:', error) + throw new Error(`Failed to parse TXT file: ${(error as Error).message}`) + } + } + + async parseBuffer(buffer: Buffer): Promise<FileParseResult> { + try { + logger.info('Parsing buffer, size:', buffer.length) + + // Extract content + const result = await readFile(buffer, 'utf-8') + + return { + content: result, + metadata: { + characterCount: result.length, + tokenCount: result.length / 4, + }, + } + } catch (error) { + logger.error('TXT buffer parsing error:', error) + throw new Error(`Failed to parse TXT buffer: ${(error as Error).message}`) + } + } +} From 4717a46c373538c2bf7de550d71b24e59df0df79 Mon Sep 17 00:00:00 2001 From: Aditya Tripathi Date: Tue, 10 Jun 2025 19:34:09 +0000 Subject: [PATCH 2/2] fix: don't readfile on buffer, convert buffer to string instead --- apps/sim/lib/file-parsers/md-parser.ts | 2 +- apps/sim/lib/file-parsers/txt-parser.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/sim/lib/file-parsers/md-parser.ts b/apps/sim/lib/file-parsers/md-parser.ts index 223f3881c58..7080edd6ed0 100644 --- 
a/apps/sim/lib/file-parsers/md-parser.ts +++ 
b/apps/sim/lib/file-parsers/md-parser.ts @@ -28,7 +28,7 @@ export class MdParser implements FileParser { logger.info('Parsing buffer, size:', buffer.length) // Extract content - const result = await readFile(buffer, 'utf-8') + const result = buffer.toString('utf-8') return { content: result, diff --git a/apps/sim/lib/file-parsers/txt-parser.ts b/apps/sim/lib/file-parsers/txt-parser.ts index 177c6fa9a51..dfde1f1faac 100644 --- a/apps/sim/lib/file-parsers/txt-parser.ts +++ b/apps/sim/lib/file-parsers/txt-parser.ts @@ -28,7 +28,7 @@ export class TxtParser implements FileParser { logger.info('Parsing buffer, size:', buffer.length) // Extract content - const result = await readFile(buffer, 'utf-8') + const result = buffer.toString('utf-8') return { content: result,