From ca519a9c00e00428e6a9badb66142c07f76320c3 Mon Sep 17 00:00:00 2001 From: Aditya Tripathi Date: Tue, 10 Jun 2025 20:12:52 +0000 Subject: [PATCH] refactor: use chonkie locally --- apps/sim/lib/documents/document-processor.ts | 20 ++++++++------------ apps/sim/lib/env.ts | 1 - 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/apps/sim/lib/documents/document-processor.ts b/apps/sim/lib/documents/document-processor.ts index fb7a223be4a..ea3c12fe702 100644 --- a/apps/sim/lib/documents/document-processor.ts +++ b/apps/sim/lib/documents/document-processor.ts @@ -1,4 +1,4 @@ -import { RecursiveChunker } from 'chonkie/cloud' +import { RecursiveChunker } from 'chonkie' import type { RecursiveChunk } from 'chonkie/types' import { env } from '@/lib/env' import { isSupportedFileType, parseBuffer, parseFile } from '@/lib/file-parsers' @@ -78,7 +78,11 @@ async function parseDocument( fileUrl: string, filename: string, mimeType: string -): Promise<{ content: string; processingMethod: 'file-parser' | 'mistral-ocr'; s3Url?: string }> { +): Promise<{ + content: string + processingMethod: 'file-parser' | 'mistral-ocr' + s3Url?: string +}> { const processingMethod = determineProcessingMethod(mimeType, filename) logger.info(`Processing document "${filename}" using ${processingMethod}`) @@ -237,15 +241,8 @@ async function chunkContent( content: string, options: DocumentProcessingOptions ): Promise { - const apiKey = env.CHONKIE_API_KEY - if (!apiKey) { - throw new Error('CHONKIE_API_KEY not configured') - } - - const chunker = new RecursiveChunker(apiKey, { + const chunker = await RecursiveChunker.create({ chunkSize: options.chunkSize || 512, - recipe: options.recipe || 'default', - lang: options.lang || 'en', minCharactersPerChunk: options.minCharactersPerChunk || 24, }) @@ -255,7 +252,7 @@ async function chunkContent( chunkSize: options.chunkSize || 512, }) - const chunks = await chunker.chunk({ text: content }) + const chunks = await chunker.chunk(content) logger.info(`Successfully created ${chunks.length} chunks`) return chunks as RecursiveChunk[] @@ -266,7 +263,6 @@ async function chunkContent( ) } } - /** * Calculate token count estimation (rough approximation: 4 chars per token) */ diff --git a/apps/sim/lib/env.ts b/apps/sim/lib/env.ts index 38c7b60c7fe..9718ee106bd 100644 --- a/apps/sim/lib/env.ts +++ b/apps/sim/lib/env.ts @@ -72,7 +72,6 @@ export const env = createEnv({ FREE_PLAN_LOG_RETENTION_DAYS: z.string().optional(), NODE_ENV: z.string().optional(), GITHUB_TOKEN: z.string().optional(), - CHONKIE_API_KEY: z.string().min(1).optional(), ELEVENLABS_API_KEY: z.string().min(1).optional(), // OAuth blocks (all optional)