Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions apps/sim/app/api/knowledge/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,11 @@ export async function checkDocumentAccess(
.limit(1)

if (kb.length === 0) {
return { hasAccess: false, notFound: true, reason: 'Knowledge base not found' }
return {
hasAccess: false,
notFound: true,
reason: 'Knowledge base not found',
}
}

const kbData = kb[0]
Expand All @@ -204,7 +208,11 @@ export async function checkDocumentAccess(
return { hasAccess: false, notFound: true, reason: 'Document not found' }
}

return { hasAccess: true, document: doc[0] as DocumentData, knowledgeBase: kbData }
return {
hasAccess: true,
document: doc[0] as DocumentData,
knowledgeBase: kbData,
}
}

/**
Expand All @@ -226,7 +234,11 @@ export async function checkChunkAccess(
.limit(1)

if (kb.length === 0) {
return { hasAccess: false, notFound: true, reason: 'Knowledge base not found' }
return {
hasAccess: false,
notFound: true,
reason: 'Knowledge base not found',
}
}

const kbData = kb[0]
Expand Down Expand Up @@ -437,8 +449,8 @@ export async function processDocumentAsync(
tokenCount: Math.ceil(chunk.text.length / 4),
embedding: embeddings[chunkIndex] || null,
embeddingModel: 'text-embedding-3-small',
startOffset: chunk.startIndex || 0,
endOffset: chunk.endIndex || chunk.text.length,
startOffset: chunk.metadata.startIndex,
endOffset: chunk.metadata.endIndex,
overlapTokens: 0,
metadata: {},
searchRank: '1.0',
Expand Down
260 changes: 260 additions & 0 deletions apps/sim/lib/documents/chunker.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
/**
 * Size and position details for a single chunk of text.
 *
 * NOTE(review): the units of the two indices (presumably character offsets)
 * are not established by this declaration — confirm against the producer.
 */
export interface ChunkMetadata {
  /** Token count associated with the chunk. */
  tokenCount: number
  /** Index at which the chunk begins. */
  startIndex: number
  /** Index at which the chunk ends. */
  endIndex: number
}

/**
 * A piece of text paired with its {@link ChunkMetadata}.
 */
export interface TextChunk {
  /** Position and size information for `text`. */
  metadata: ChunkMetadata
  /** The chunk's textual content. */
  text: string
}

/**
 * Optional tuning knobs accepted by the `TextChunker` constructor.
 * Every field may be omitted; the constructor substitutes a default.
 */
export interface ChunkerOptions {
  /** Upper bound for a chunk, in estimated tokens. Defaults to 512. */
  chunkSize?: number
  /**
   * Number of trailing words of the previous chunk that are repeated at the
   * start of the next chunk. Defaults to 0 (no overlap).
   */
  overlap?: number
  /** Minimum chunk length in characters; shorter fragments are dropped. Defaults to 50. */
  minChunkSize?: number
}

/**
 * One chunk as returned by `TextChunker.chunk`.
 */
export interface Chunk {
  /** Chunk content, including any overlap words copied from the preceding chunk. */
  text: string
  /** Estimated number of tokens in `text`. */
  tokenCount: number
  /** Start and end offsets assigned to the chunk by the chunker. */
  metadata: { startIndex: number; endIndex: number }
}

/**
 * Lightweight text chunker optimized for RAG applications.
 * Uses hierarchical splitting with smart token estimation.
 *
 * Sizing rules, as implemented below:
 * - `chunkSize` is a budget in estimated tokens (see `estimateTokens`).
 * - `minChunkSize` is a length in characters; fragments produced by splitting
 *   that are shorter than this are discarded.
 * - `overlap` is a number of whitespace-separated words copied from the end of
 *   the previous chunk onto the start of the next one.
 */
export class TextChunker {
  private readonly chunkSize: number
  private readonly minChunkSize: number
  private readonly overlap: number

  // Hierarchical separators ordered from largest to smallest semantic units
  private readonly separators = [
    '\n\n\n', // Document sections (cleanText collapses 3+ newlines, so cleaned input never contains this)
    '\n---\n', // Markdown horizontal rules
    '\n***\n', // Markdown horizontal rules (alternative)
    '\n___\n', // Markdown horizontal rules (alternative)
    '\n# ', // Markdown H1 headings
    '\n## ', // Markdown H2 headings
    '\n### ', // Markdown H3 headings
    '\n#### ', // Markdown H4 headings
    '\n##### ', // Markdown H5 headings
    '\n###### ', // Markdown H6 headings
    '\n\n', // Paragraphs
    '\n', // Lines
    '. ', // Sentences
    '! ', // Exclamations
    '? ', // Questions
    '; ', // Semicolons
    ', ', // Commas
    ' ', // Words
  ]

  /**
   * @param options - Optional overrides; defaults are 512 tokens per chunk,
   *   50 characters minimum chunk length and no overlap.
   * @throws RangeError when `chunkSize` is zero, negative or NaN.
   */
  constructor(options: ChunkerOptions = {}) {
    const chunkSize = options.chunkSize ?? 512

    // A non-positive budget can never be met by non-empty text, and it makes the
    // forced character split advance by a non-positive step (endless loop).
    if (!(chunkSize > 0)) {
      throw new RangeError(`chunkSize must be a positive number, received ${chunkSize}`)
    }

    this.chunkSize = chunkSize
    this.minChunkSize = options.minChunkSize ?? 50
    this.overlap = options.overlap ?? 0
  }

  /**
   * Estimate token count - optimized for common tokenizers.
   * Works word by word on whitespace-separated input; returns 0 for empty or
   * whitespace-only text.
   */
  private estimateTokens(text: string): number {
    // Handle empty or whitespace-only text
    if (!text?.trim()) return 0

    const words = text.trim().split(/\s+/)
    let tokenCount = 0

    for (const word of words) {
      if (word.length === 0) continue

      // Short words (1-4 chars) are usually 1 token
      if (word.length <= 4) {
        tokenCount += 1
      }
      // Medium words (5-8 chars) are usually 1-2 tokens
      else if (word.length <= 8) {
        tokenCount += Math.ceil(word.length / 5)
      }
      // Long words get split more by subword tokenization
      else {
        tokenCount += Math.ceil(word.length / 4)
      }
    }

    return tokenCount
  }

  /**
   * Split text recursively using hierarchical separators.
   *
   * Parts are greedily packed into chunks that stay within `chunkSize`
   * estimated tokens. A part that is itself too large is split again with the
   * next (finer) separator. Fragments shorter than `minChunkSize` characters
   * are dropped, and the separator between two parts is not kept when those
   * parts end up in different chunks.
   *
   * @param text - Text to split.
   * @param separatorIndex - Index into `separators` to start from.
   * @returns Chunk texts in document order.
   */
  private splitRecursively(text: string, separatorIndex = 0): string[] {
    const tokenCount = this.estimateTokens(text)

    // If chunk is small enough, return it
    if (tokenCount <= this.chunkSize) {
      return text.length >= this.minChunkSize ? [text] : []
    }

    // If we've run out of separators, force split by character count
    if (separatorIndex >= this.separators.length) {
      const chunks: string[] = []
      // Scale the character length by the token budget; never step by less than
      // one character so the loop below always makes progress.
      const targetLength = Math.max(1, Math.ceil((text.length * this.chunkSize) / tokenCount))

      for (let i = 0; i < text.length; i += targetLength) {
        const chunk = text.slice(i, i + targetLength).trim()
        if (chunk.length >= this.minChunkSize) {
          chunks.push(chunk)
        }
      }
      return chunks
    }

    const separator = this.separators[separatorIndex]
    const parts = text.split(separator).filter((part) => part.trim())

    // If no split occurred, try next separator
    if (parts.length <= 1) {
      return this.splitRecursively(text, separatorIndex + 1)
    }

    const chunks: string[] = []
    let currentChunk = ''

    for (const part of parts) {
      const testChunk = currentChunk + (currentChunk ? separator : '') + part

      if (this.estimateTokens(testChunk) <= this.chunkSize) {
        currentChunk = testChunk
      } else {
        // Save current chunk if it meets minimum size
        if (currentChunk.trim() && currentChunk.length >= this.minChunkSize) {
          chunks.push(currentChunk.trim())
        }

        // Start new chunk with current part
        // If part itself is too large, split it further
        if (this.estimateTokens(part) > this.chunkSize) {
          chunks.push(...this.splitRecursively(part, separatorIndex + 1))
          currentChunk = ''
        } else {
          currentChunk = part
        }
      }
    }

    // Add final chunk if it exists and meets minimum size
    if (currentChunk.trim() && currentChunk.length >= this.minChunkSize) {
      chunks.push(currentChunk.trim())
    }

    return chunks
  }

  /**
   * Add overlap between chunks if specified.
   *
   * Every chunk after the first is prefixed with the last `overlap` words of
   * the chunk before it (taken from the input array, i.e. before any overlap
   * was added), joined by single spaces and followed by one space.
   *
   * @returns An array with the same length and order as `chunks`.
   */
  private addOverlap(chunks: string[]): string[] {
    if (this.overlap <= 0 || chunks.length <= 1) {
      return chunks
    }

    const overlappedChunks: string[] = []

    for (let i = 0; i < chunks.length; i++) {
      let chunk = chunks[i]

      // Add overlap from previous chunk
      if (i > 0) {
        const prevChunk = chunks[i - 1]
        const words = prevChunk.split(/\s+/)
        const overlapWords = words.slice(-Math.min(this.overlap, words.length))

        if (overlapWords.length > 0) {
          chunk = `${overlapWords.join(' ')} ${chunk}`
        }
      }

      overlappedChunks.push(chunk)
    }

    return overlappedChunks
  }

  /**
   * Clean and normalize text: unify line endings, cap blank-line runs at one
   * blank line, turn tabs into spaces, collapse repeated spaces and trim.
   */
  private cleanText(text: string): string {
    return text
      .replace(/\r\n/g, '\n') // Normalize Windows line endings
      .replace(/\r/g, '\n') // Normalize old Mac line endings
      .replace(/\n{3,}/g, '\n\n') // Limit consecutive newlines
      .replace(/\t/g, ' ') // Convert tabs to spaces
      .replace(/ {2,}/g, ' ') // Collapse multiple spaces
      .trim()
  }

  /**
   * Main chunking method.
   *
   * `metadata.startIndex` / `metadata.endIndex` are running offsets over the
   * emitted chunk content: each chunk's own content begins where the previous
   * chunk's content ended, and `startIndex` is moved back by the length of the
   * overlap prefix. Because the text is cleaned first and separators and
   * discarded fragments are not counted, these offsets approximate positions
   * in the input rather than reproduce them exactly.
   *
   * @param text - Raw document text.
   * @returns Chunks in document order; an empty array for empty or
   *   whitespace-only input.
   */
  async chunk(text: string): Promise<Chunk[]> {
    if (!text?.trim()) {
      return []
    }

    // Clean the text
    const cleanedText = this.cleanText(text)

    // A document that already fits the token budget is kept whole, even when it
    // is shorter than minChunkSize; otherwise a short document would produce no
    // chunks at all.
    const baseChunks =
      this.estimateTokens(cleanedText) <= this.chunkSize
        ? [cleanedText]
        : this.splitRecursively(cleanedText)

    // Add overlap if configured; same length and order as baseChunks
    const chunkTexts = this.addOverlap(baseChunks)

    // Convert to Chunk objects with metadata
    let contentEnd = 0
    return chunkTexts.map((chunkText, index) => {
      // Whatever addOverlap prepended (words plus the joining space) is exactly
      // the difference between the overlapped and the original chunk.
      const contentLength = baseChunks[index].length
      const overlapLength = chunkText.length - contentLength

      const startIndex = Math.max(0, contentEnd - overlapLength)
      const endIndex = contentEnd + contentLength
      contentEnd = endIndex

      const chunk: Chunk = {
        text: chunkText,
        tokenCount: this.estimateTokens(chunkText),
        metadata: {
          startIndex,
          endIndex,
        },
      }

      return chunk
    })
  }
}
28 changes: 9 additions & 19 deletions apps/sim/lib/documents/document-processor.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import { RecursiveChunker } from 'chonkie'
import type { RecursiveChunk } from 'chonkie/types'
import { type Chunk, TextChunker } from '@/lib/documents/chunker'
import { env } from '@/lib/env'
import { isSupportedFileType, parseBuffer, parseFile } from '@/lib/file-parsers'
import { createLogger } from '@/lib/logs/console-logger'
Expand All @@ -26,7 +25,7 @@ class APIError extends Error {

export interface ProcessedDocument {
content: string
chunks: RecursiveChunk[]
chunks: Chunk[]
metadata: {
filename: string
fileSize: number
Expand Down Expand Up @@ -235,40 +234,31 @@ async function parseDocument(
}

/**
* Chunk text content using RecursiveChunker
* Chunk text content using TextChunker
*/
async function chunkContent(
content: string,
options: DocumentProcessingOptions
): Promise<RecursiveChunk[]> {
const chunker = await RecursiveChunker.create({
async function chunkContent(content: string, options: DocumentProcessingOptions): Promise<Chunk[]> {
const chunker = new TextChunker({
chunkSize: options.chunkSize || 512,
minCharactersPerChunk: options.minCharactersPerChunk || 24,
minChunkSize: options.minCharactersPerChunk || 24,
})

try {
logger.info('Chunking content with RecursiveChunker', {
logger.info('Chunking content with TextChunker', {
contentLength: content.length,
chunkSize: options.chunkSize || 512,
})

const chunks = await chunker.chunk(content)

logger.info(`Successfully created ${chunks.length} chunks`)
return chunks as RecursiveChunk[]
return chunks
} catch (error) {
logger.error('Chunking failed:', error)
throw new Error(
`Text chunking failed: ${error instanceof Error ? error.message : 'Unknown error'}`
)
}
}
/**
* Calculate token count estimation (rough approximation: 4 chars per token)
*/
function estimateTokenCount(text: string): number {
return Math.ceil(text.length / 4)
}

/**
* Process a single document: parse content and create chunks
Expand Down Expand Up @@ -300,7 +290,7 @@ export async function processDocument(

// Step 3: Calculate metadata
const characterCount = content.length
const tokenCount = estimateTokenCount(content)
const tokenCount = chunks.reduce((acc, chunk) => acc + chunk.tokenCount, 0)
const chunkCount = chunks.length

const processedDocument: ProcessedDocument = {
Expand Down
1 change: 0 additions & 1 deletion apps/sim/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@
"ai": "^4.3.2",
"better-auth": "^1.2.8-beta.3",
"browser-image-compression": "^2.0.2",
"chonkie": "^0.2.5",
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"cmdk": "^1.0.0",
Expand Down
Loading