fix(chunkers): restore structured data token ratio and overlap joiner

- Restore /3 token estimation for StructuredDataChunker (structured data is denser than prose, ~3 chars/token vs ~4)
- Change addOverlap joiner from \n to space to match original TextChunker behavior
simstudioai · waleedlatif1 · Apr 11, 2026 · Apr 11, 2026 · Apr 11, 2026 · Apr 11, 2026
commit 5e8b0515c3f6fc228356dad4480dce9db71a9214
diff --git a/apps/sim/lib/chunkers/structured-data-chunker.ts b/apps/sim/lib/chunkers/structured-data-chunker.ts
@@ -1,6 +1,10 @@
 import { createLogger } from '@sim/logger'
 import type { Chunk, StructuredDataOptions } from '@/lib/chunkers/types'
-import { estimateTokens } from '@/lib/chunkers/utils'
+/** Estimates tokens for structured data as ceil(length / 3) — denser than prose (~3 chars/token vs ~4); returns 0 for empty or whitespace-only text. */
+function estimateStructuredTokens(text: string): number {
+  if (!text?.trim()) return 0
+  return Math.ceil(text.length / 3)
+}
 
 const logger = createLogger('StructuredDataChunker')
 
@@ -28,7 +32,7 @@ export class StructuredDataChunker {
     const headerLine = options.headers?.join('\t') || lines[0]
     const dataStartIndex = options.headers ? 0 : 1
 
-    const estimatedTokensPerRow = StructuredDataChunker.estimateTokensPerRow(
+    const estimatedTokensPerRow = StructuredDataChunker.estimateStructuredTokensPerRow(
       lines.slice(dataStartIndex, Math.min(10, lines.length))
     )
     const optimalRowsPerChunk = StructuredDataChunker.calculateOptimalRowsPerChunk(
@@ -42,12 +46,12 @@ export class StructuredDataChunker {
 
     let currentChunkRows: string[] = []
     let currentTokenEstimate = 0
-    const headerTokens = estimateTokens(headerLine)
+    const headerTokens = estimateStructuredTokens(headerLine)
     let chunkStartRow = dataStartIndex
 
     for (let i = dataStartIndex; i < lines.length; i++) {
       const row = lines[i]
-      const rowTokens = estimateTokens(row)
+      const rowTokens = estimateStructuredTokens(row)
 
       const projectedTokens =
         currentTokenEstimate +
@@ -111,18 +115,18 @@ export class StructuredDataChunker {
   private static createChunk(content: string, startRow: number, endRow: number): Chunk {
     return {
       text: content,
-      tokenCount: estimateTokens(content),
+      tokenCount: estimateStructuredTokens(content),
       metadata: {
         startIndex: startRow,
         endIndex: endRow,
       },
     }
   }
 
-  private static estimateTokensPerRow(sampleRows: string[]): number {
+  private static estimateStructuredTokensPerRow(sampleRows: string[]): number {
     if (sampleRows.length === 0) return 50
 
-    const totalTokens = sampleRows.reduce((sum, row) => sum + estimateTokens(row), 0)
+    const totalTokens = sampleRows.reduce((sum, row) => sum + estimateStructuredTokens(row), 0)
     return Math.ceil(totalTokens / sampleRows.length)
   }
 

diff --git a/apps/sim/lib/chunkers/utils.test.ts b/apps/sim/lib/chunkers/utils.test.ts
@@ -94,18 +94,16 @@ describe('addOverlap', () => {
     expect(result[1].length).toBeGreaterThan('second chunk here'.length)
   })
 
-  it('joins overlap text with \\n', () => {
+  it('joins overlap text with space', () => {
     const chunks = ['first chunk here', 'second chunk here']
     const result = addOverlap(chunks, 10)
-    expect(result[1]).toContain('\n')
+    expect(result[1]).toContain('here second')
   })
 
   it('snaps overlap to word boundary', () => {
     const chunks = ['hello beautiful world', 'next chunk']
     const result = addOverlap(chunks, 15)
-    const overlapPart = result[1].split('\n')[0]
-    expect(overlapPart).toBe('beautiful world')
-    expect(result[1]).toBe('beautiful world\nnext chunk')
+    expect(result[1]).toBe('beautiful world next chunk')
   })
 })
 

diff --git a/apps/sim/lib/chunkers/utils.ts b/apps/sim/lib/chunkers/utils.ts
@@ -41,7 +41,7 @@ export function addOverlap(chunks: string[], overlapChars: number): string[] {
         : overlapText
 
       if (cleanOverlap.trim()) {
-        chunk = `${cleanOverlap.trim()}\n${chunk}`
+        chunk = `${cleanOverlap.trim()} ${chunk}`
       }
     }