cleanup code

simstudioai · icecrasher321 · Apr 8, 2026 · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026
commit 09f326789d1e5892a69b6baa9549af698c2a23f1
diff --git a/apps/sim/app/api/settings/voice/route.ts b/apps/sim/app/api/settings/voice/route.ts
@@ -1,5 +1,5 @@
 import { NextResponse } from 'next/server'
-import { hasSTTService } from '@/lib/speech/transcriber'
+import { hasSTTService } from '@/lib/speech/config'
 
 /**
  * Returns whether server-side STT is configured.

diff --git a/apps/sim/app/chat/components/voice-interface/voice-interface.tsx b/apps/sim/app/chat/components/voice-interface/voice-interface.tsx
@@ -6,7 +6,13 @@ import { Mic, MicOff, Phone } from 'lucide-react'
 import dynamic from 'next/dynamic'
 import { Button } from '@/components/ui/button'
 import { cn } from '@/lib/core/utils/cn'
-import { MAX_SESSION_MS } from '@/hooks/use-speech-to-text'
+import { arrayBufferToBase64, floatTo16BitPCM } from '@/lib/speech/audio'
+import {
+  CHUNK_SEND_INTERVAL_MS,
+  ELEVENLABS_WS_URL,
+  MAX_SESSION_MS,
+  SAMPLE_RATE,
+} from '@/lib/speech/config'
 
 const ParticlesVisualization = dynamic(
   () =>
@@ -18,29 +24,6 @@ const ParticlesVisualization = dynamic(
 
 const logger = createLogger('VoiceInterface')
 
-const ELEVENLABS_WS_URL = 'wss://api.elevenlabs.io/v1/speech-to-text/realtime'
-const SAMPLE_RATE = 16000
-const CHUNK_SEND_INTERVAL_MS = 250
-
-function floatTo16BitPCM(float32Array: Float32Array): ArrayBuffer {
-  const buffer = new ArrayBuffer(float32Array.length * 2)
-  const view = new DataView(buffer)
-  for (let i = 0; i < float32Array.length; i++) {
-    const s = Math.max(-1, Math.min(1, float32Array[i]))
-    view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true)
-  }
-  return buffer
-}
-
-function arrayBufferToBase64(buffer: ArrayBuffer): string {
-  const bytes = new Uint8Array(buffer)
-  let binary = ''
-  for (let i = 0; i < bytes.length; i++) {
-    binary += String.fromCharCode(bytes[i])
-  }
-  return btoa(binary)
-}
-
 interface VoiceInterfaceProps {
   onCallEnd?: () => void
   onVoiceTranscript?: (transcript: string) => void
@@ -125,13 +108,6 @@ export function VoiceInterface({
     if (!ws || ws.readyState !== WebSocket.OPEN) return
 
     const chunks = pcmBufferRef.current
-    // #region agent log
-    if (chunks.length > 0)
-      console.warn('[DBG-1e2b84] H4 flush', {
-        chunksLen: chunks.length,
-        wsOpen: ws.readyState === WebSocket.OPEN,
-      })
-    // #endregion
     if (chunks.length === 0) return
     pcmBufferRef.current = []
 
@@ -189,12 +165,7 @@ export function VoiceInterface({
       })
 
       if (!tokenResponse.ok) {
-        const errBody = await tokenResponse.json().catch(() => ({}))
-        logger.error('Failed to get STT token', {
-          status: tokenResponse.status,
-          error: errBody.error,
-          chatId,
-        })
+        logger.error('Failed to get STT token', { status: tokenResponse.status })
         return false
       }
 
@@ -215,12 +186,8 @@ export function VoiceInterface({
 
       return new Promise<boolean>((resolve) => {
         ws.onopen = () => resolve(true)
-        ws.onerror = (event) => {
-          logger.error('STT WebSocket connection error', {
-            url: ws.url,
-            readyState: ws.readyState,
-            event: String(event),
-          })
+        ws.onerror = () => {
+          logger.error('STT WebSocket connection error')
           resolve(false)
         }
 
@@ -229,13 +196,6 @@ export function VoiceInterface({
 
           try {
             const msg = JSON.parse(event.data)
-            // #region agent log
-            console.warn('[DBG-1e2b84] H2 ws-msg', {
-              type: msg.message_type,
-              text: msg.text?.substring(0, 50),
-              error: msg.error,
-            })
-            // #endregion
 
             if (msg.message_type === 'partial_transcript') {
               if (msg.text) {
@@ -269,9 +229,6 @@ export function VoiceInterface({
 
         ws.onclose = () => {
           wsRef.current = null
-          // #region agent log
-          console.warn('[DBG-1e2b84] WS closed', { state: currentStateRef.current })
-          // #endregion
           if (currentStateRef.current === 'listening' && !isCallEndedRef.current) {
             stopSendingAudio()
             updateState('idle')
@@ -315,31 +272,9 @@ export function VoiceInterface({
       analyserRef.current = analyser
 
       const processor = ac.createScriptProcessor(4096, 1, 1)
-      // #region agent log
-      let _dbgAudioCount = 0
-      // #endregion
       processor.onaudioprocess = (e) => {
         if (!isMutedRef.current && currentStateRef.current === 'listening') {
           pcmBufferRef.current.push(new Float32Array(e.inputBuffer.getChannelData(0)))
-          // #region agent log
-          _dbgAudioCount++
-          if (_dbgAudioCount % 50 === 1)
-            console.warn('[DBG-1e2b84] H1 audio-captured', {
-              count: _dbgAudioCount,
-              bufLen: pcmBufferRef.current.length,
-              state: currentStateRef.current,
-            })
-          // #endregion
-        } else {
-          // #region agent log
-          _dbgAudioCount++
-          if (_dbgAudioCount % 100 === 1)
-            console.warn('[DBG-1e2b84] H1 audio-SKIPPED', {
-              count: _dbgAudioCount,
-              state: currentStateRef.current,
-              muted: isMutedRef.current,
-            })
-          // #endregion
         }
       }
       source.connect(processor)
@@ -373,15 +308,6 @@ export function VoiceInterface({
   }, [])
 
   const startListening = useCallback(async () => {
-    // #region agent log
-    console.warn('[DBG-1e2b84] H3 startListening', {
-      state: currentStateRef.current,
-      muted: isMutedRef.current,
-      callEnded: isCallEndedRef.current,
-      hasWs: !!wsRef.current,
-      wsState: wsRef.current?.readyState,
-    })
-    // #endregion
     if (currentStateRef.current !== 'idle' || isMutedRef.current || isCallEndedRef.current) return
 
     if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) {
@@ -392,11 +318,6 @@ export function VoiceInterface({
     updateState('listening')
     setCurrentTranscript('')
     startSendingAudio()
-    // #region agent log
-    console.warn('[DBG-1e2b84] H3 startListening-done', {
-      state: currentStateRef.current,
-      sendInterval: !!sendIntervalRef.current,
-    })
 
     sessionTimerRef.current = setTimeout(() => {
       logger.info('Voice session reached max duration, stopping')

diff --git a/apps/sim/hooks/use-speech-to-text.ts b/apps/sim/hooks/use-speech-to-text.ts
@@ -2,13 +2,17 @@
 
 import { useCallback, useEffect, useRef, useState } from 'react'
 import { createLogger } from '@sim/logger'
+import { arrayBufferToBase64, floatTo16BitPCM } from '@/lib/speech/audio'
+import {
+  CHUNK_SEND_INTERVAL_MS,
+  ELEVENLABS_WS_URL,
+  MAX_SESSION_MS,
+  SAMPLE_RATE,
+} from '@/lib/speech/config'
 
 const logger = createLogger('useSpeechToText')
 
-const ELEVENLABS_WS_URL = 'wss://api.elevenlabs.io/v1/speech-to-text/realtime'
-const SAMPLE_RATE = 16000
-const CHUNK_SEND_INTERVAL_MS = 250
-export const MAX_SESSION_MS = 3 * 60 * 1000
+export { MAX_SESSION_MS } from '@/lib/speech/config'
 
 export type PermissionState = 'prompt' | 'granted' | 'denied'
 
@@ -24,25 +28,6 @@ interface UseSpeechToTextReturn {
   toggleListening: () => void
 }
 
-function floatTo16BitPCM(float32Array: Float32Array): ArrayBuffer {
-  const buffer = new ArrayBuffer(float32Array.length * 2)
-  const view = new DataView(buffer)
-  for (let i = 0; i < float32Array.length; i++) {
-    const s = Math.max(-1, Math.min(1, float32Array[i]))
-    view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true)
-  }
-  return buffer
-}
-
-function arrayBufferToBase64(buffer: ArrayBuffer): string {
-  const bytes = new Uint8Array(buffer)
-  let binary = ''
-  for (let i = 0; i < bytes.length; i++) {
-    binary += String.fromCharCode(bytes[i])
-  }
-  return btoa(binary)
-}
-
 export function useSpeechToText({
   onTranscript,
   language,

diff --git a/apps/sim/lib/speech/audio.ts b/apps/sim/lib/speech/audio.ts
@@ -0,0 +1,25 @@
+/**
+ * Convert Float32 PCM samples to little-endian 16-bit signed integer PCM (2 bytes per sample).
+ * Samples are clamped to [-1, 1] first. Required for ElevenLabs realtime STT (pcm_16000 format).
+ */
+export function floatTo16BitPCM(float32Array: Float32Array): ArrayBuffer {
+  const buffer = new ArrayBuffer(float32Array.length * 2) // 2 bytes per output sample
+  const view = new DataView(buffer)
+  for (let i = 0; i < float32Array.length; i++) {
+    const s = Math.max(-1, Math.min(1, float32Array[i])) // clamp to [-1, 1]
+    view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true) // -1 -> -32768, 1 -> 32767; true = little-endian
+  }
+  return buffer
+}
+
+/**
+ * Encode an ArrayBuffer as a base64 string (via `btoa`) for WebSocket transport.
+ */
+export function arrayBufferToBase64(buffer: ArrayBuffer): string {
+  const bytes = new Uint8Array(buffer)
+  let binary = '' // built up as one character per byte (char codes 0-255), the input form btoa expects
+  for (let i = 0; i < bytes.length; i++) {
+    binary += String.fromCharCode(bytes[i])
+  }
+  return btoa(binary)
+}
diff --git a/apps/sim/lib/speech/config.ts b/apps/sim/lib/speech/config.ts
@@ -0,0 +1,15 @@
+import { env } from '@/lib/core/config/env'
+
+export const ELEVENLABS_WS_URL = 'wss://api.elevenlabs.io/v1/speech-to-text/realtime' // realtime STT WebSocket endpoint
+export const SAMPLE_RATE = 16000 // NOTE(review): presumably Hz — confirm against the audio capture setup
+export const CHUNK_SEND_INTERVAL_MS = 250 // interval between audio chunk sends, in ms per the name
+export const MAX_SESSION_MS = 3 * 60 * 1000 // 180000, i.e. 3 minutes if interpreted as milliseconds
+
+/**
+ * Whether a speech-to-text provider is configured.
+ * Currently true only when `ELEVENLABS_API_KEY` is set and non-empty after trimming whitespace.
+ * To add a new provider: add its env check here.
+ */
+export function hasSTTService(): boolean {
+  return !!env.ELEVENLABS_API_KEY?.trim() // undefined, empty, or whitespace-only all yield false
+}
diff --git a/apps/sim/lib/speech/providers/elevenlabs.ts b/apps/sim/lib/speech/providers/elevenlabs.ts
diff --git a/apps/sim/lib/speech/transcriber.ts b/apps/sim/lib/speech/transcriber.ts