Skip to content
Prev Previous commit
Next Next commit
cleanup code
  • Loading branch information
icecrasher321 committed Apr 8, 2026
commit 09f326789d1e5892a69b6baa9549af698c2a23f1
2 changes: 1 addition & 1 deletion apps/sim/app/api/settings/voice/route.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { NextResponse } from 'next/server'
import { hasSTTService } from '@/lib/speech/transcriber'
import { hasSTTService } from '@/lib/speech/config'

/**
* Returns whether server-side STT is configured.
Expand Down
99 changes: 10 additions & 89 deletions apps/sim/app/chat/components/voice-interface/voice-interface.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,13 @@ import { Mic, MicOff, Phone } from 'lucide-react'
import dynamic from 'next/dynamic'
import { Button } from '@/components/ui/button'
import { cn } from '@/lib/core/utils/cn'
import { MAX_SESSION_MS } from '@/hooks/use-speech-to-text'
import { arrayBufferToBase64, floatTo16BitPCM } from '@/lib/speech/audio'
import {
CHUNK_SEND_INTERVAL_MS,
ELEVENLABS_WS_URL,
MAX_SESSION_MS,
SAMPLE_RATE,
} from '@/lib/speech/config'

const ParticlesVisualization = dynamic(
() =>
Expand All @@ -18,29 +24,6 @@ const ParticlesVisualization = dynamic(

const logger = createLogger('VoiceInterface')

// ElevenLabs realtime speech-to-text WebSocket endpoint.
const ELEVENLABS_WS_URL = 'wss://api.elevenlabs.io/v1/speech-to-text/realtime'
// Audio sample rate in Hz (presumably the rate of the PCM sent to the STT socket — confirm at the usage site).
const SAMPLE_RATE = 16000
// Interval in milliseconds between audio chunk sends, per the name; the timer using it is outside this view.
const CHUNK_SEND_INTERVAL_MS = 250

/**
 * Converts Float32 samples into little-endian signed 16-bit PCM.
 *
 * Each sample is clamped to [-1, 1] and then scaled: negative values by
 * 0x8000, non-negative values by 0x7fff.
 *
 * @param float32Array - Source samples.
 * @returns A new ArrayBuffer holding two bytes per input sample.
 */
function floatTo16BitPCM(float32Array: Float32Array): ArrayBuffer {
  const BYTES_PER_SAMPLE = 2
  const pcmBuffer = new ArrayBuffer(float32Array.length * BYTES_PER_SAMPLE)
  const pcmView = new DataView(pcmBuffer)

  float32Array.forEach((sample, index) => {
    const clamped = Math.max(-1, Math.min(1, sample))
    const fullScale = clamped < 0 ? 0x8000 : 0x7fff
    // `true` selects little-endian byte order.
    pcmView.setInt16(index * BYTES_PER_SAMPLE, clamped * fullScale, true)
  })

  return pcmBuffer
}

/**
 * Encodes the bytes of an ArrayBuffer as a base64 string.
 *
 * @param buffer - Raw bytes to encode.
 * @returns The base64 representation produced by `btoa`.
 */
function arrayBufferToBase64(buffer: ArrayBuffer): string {
  const chars: string[] = []
  // Map every byte to its single-character binary-string form before encoding.
  for (const byte of new Uint8Array(buffer)) {
    chars.push(String.fromCharCode(byte))
  }
  return btoa(chars.join(''))
}

interface VoiceInterfaceProps {
onCallEnd?: () => void
onVoiceTranscript?: (transcript: string) => void
Expand Down Expand Up @@ -125,13 +108,6 @@ export function VoiceInterface({
if (!ws || ws.readyState !== WebSocket.OPEN) return

const chunks = pcmBufferRef.current
// #region agent log
if (chunks.length > 0)
console.warn('[DBG-1e2b84] H4 flush', {
chunksLen: chunks.length,
wsOpen: ws.readyState === WebSocket.OPEN,
})
// #endregion
if (chunks.length === 0) return
pcmBufferRef.current = []

Expand Down Expand Up @@ -189,12 +165,7 @@ export function VoiceInterface({
})

if (!tokenResponse.ok) {
const errBody = await tokenResponse.json().catch(() => ({}))
logger.error('Failed to get STT token', {
status: tokenResponse.status,
error: errBody.error,
chatId,
})
logger.error('Failed to get STT token', { status: tokenResponse.status })
return false
}

Expand All @@ -215,12 +186,8 @@ export function VoiceInterface({

return new Promise<boolean>((resolve) => {
ws.onopen = () => resolve(true)
ws.onerror = (event) => {
logger.error('STT WebSocket connection error', {
url: ws.url,
readyState: ws.readyState,
event: String(event),
})
ws.onerror = () => {
logger.error('STT WebSocket connection error')
resolve(false)
}

Expand All @@ -229,13 +196,6 @@ export function VoiceInterface({

try {
const msg = JSON.parse(event.data)
// #region agent log
console.warn('[DBG-1e2b84] H2 ws-msg', {
type: msg.message_type,
text: msg.text?.substring(0, 50),
error: msg.error,
})
// #endregion

if (msg.message_type === 'partial_transcript') {
if (msg.text) {
Expand Down Expand Up @@ -269,9 +229,6 @@ export function VoiceInterface({

ws.onclose = () => {
wsRef.current = null
// #region agent log
console.warn('[DBG-1e2b84] WS closed', { state: currentStateRef.current })
// #endregion
if (currentStateRef.current === 'listening' && !isCallEndedRef.current) {
stopSendingAudio()
updateState('idle')
Expand Down Expand Up @@ -315,31 +272,9 @@ export function VoiceInterface({
analyserRef.current = analyser

const processor = ac.createScriptProcessor(4096, 1, 1)
// #region agent log
let _dbgAudioCount = 0
// #endregion
processor.onaudioprocess = (e) => {
if (!isMutedRef.current && currentStateRef.current === 'listening') {
pcmBufferRef.current.push(new Float32Array(e.inputBuffer.getChannelData(0)))
// #region agent log
_dbgAudioCount++
if (_dbgAudioCount % 50 === 1)
console.warn('[DBG-1e2b84] H1 audio-captured', {
count: _dbgAudioCount,
bufLen: pcmBufferRef.current.length,
state: currentStateRef.current,
})
// #endregion
} else {
// #region agent log
_dbgAudioCount++
if (_dbgAudioCount % 100 === 1)
console.warn('[DBG-1e2b84] H1 audio-SKIPPED', {
count: _dbgAudioCount,
state: currentStateRef.current,
muted: isMutedRef.current,
})
// #endregion
}
}
source.connect(processor)
Expand Down Expand Up @@ -373,15 +308,6 @@ export function VoiceInterface({
}, [])

const startListening = useCallback(async () => {
// #region agent log
console.warn('[DBG-1e2b84] H3 startListening', {
state: currentStateRef.current,
muted: isMutedRef.current,
callEnded: isCallEndedRef.current,
hasWs: !!wsRef.current,
wsState: wsRef.current?.readyState,
})
// #endregion
if (currentStateRef.current !== 'idle' || isMutedRef.current || isCallEndedRef.current) return

if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) {
Expand All @@ -392,11 +318,6 @@ export function VoiceInterface({
updateState('listening')
setCurrentTranscript('')
startSendingAudio()
// #region agent log
console.warn('[DBG-1e2b84] H3 startListening-done', {
state: currentStateRef.current,
sendInterval: !!sendIntervalRef.current,
})

sessionTimerRef.current = setTimeout(() => {
logger.info('Voice session reached max duration, stopping')
Expand Down
31 changes: 8 additions & 23 deletions apps/sim/hooks/use-speech-to-text.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,17 @@

import { useCallback, useEffect, useRef, useState } from 'react'
import { createLogger } from '@sim/logger'
import { arrayBufferToBase64, floatTo16BitPCM } from '@/lib/speech/audio'
import {
CHUNK_SEND_INTERVAL_MS,
ELEVENLABS_WS_URL,
MAX_SESSION_MS,
SAMPLE_RATE,
} from '@/lib/speech/config'

const logger = createLogger('useSpeechToText')

const ELEVENLABS_WS_URL = 'wss://api.elevenlabs.io/v1/speech-to-text/realtime'
const SAMPLE_RATE = 16000
const CHUNK_SEND_INTERVAL_MS = 250
export const MAX_SESSION_MS = 3 * 60 * 1000
export { MAX_SESSION_MS } from '@/lib/speech/config'

export type PermissionState = 'prompt' | 'granted' | 'denied'

Expand All @@ -24,25 +28,6 @@ interface UseSpeechToTextReturn {
toggleListening: () => void
}

/**
 * Packs Float32 samples into a little-endian signed 16-bit PCM buffer.
 *
 * Samples are clamped to [-1, 1]; negatives scale by 0x8000 and
 * non-negatives by 0x7fff.
 *
 * @param float32Array - Source samples.
 * @returns A new ArrayBuffer with two bytes per sample.
 */
function floatTo16BitPCM(float32Array: Float32Array): ArrayBuffer {
  const out = new ArrayBuffer(float32Array.length * 2)
  const writer = new DataView(out)
  let offset = 0
  for (const raw of float32Array) {
    const sample = Math.max(-1, Math.min(1, raw))
    const scaled = sample < 0 ? sample * 0x8000 : sample * 0x7fff
    writer.setInt16(offset, scaled, true) // little-endian
    offset += 2
  }
  return out
}

/**
 * Returns the base64 encoding of an ArrayBuffer's bytes.
 *
 * @param buffer - Raw bytes to encode.
 * @returns The base64 string produced by `btoa`.
 */
function arrayBufferToBase64(buffer: ArrayBuffer): string {
  // Build the one-character-per-byte binary string that `btoa` expects.
  const binary = Array.from(new Uint8Array(buffer), (byte) => String.fromCharCode(byte)).join('')
  return btoa(binary)
}

export function useSpeechToText({
onTranscript,
language,
Expand Down
25 changes: 25 additions & 0 deletions apps/sim/lib/speech/audio.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/**
 * Convert Float32 PCM samples to little-endian 16-bit signed integer PCM.
 * Required for ElevenLabs realtime STT (pcm_16000 format).
 *
 * Samples are clamped to [-1, 1] before scaling; negative values scale by
 * 0x8000 and non-negative values by 0x7fff.
 *
 * @param float32Array - Source samples.
 * @returns A new ArrayBuffer containing two bytes per input sample.
 */
export function floatTo16BitPCM(float32Array: Float32Array): ArrayBuffer {
  const sampleCount = float32Array.length
  const pcmBuffer = new ArrayBuffer(sampleCount * 2)
  const pcmView = new DataView(pcmBuffer)

  let index = 0
  while (index < sampleCount) {
    const clamped = Math.max(-1, Math.min(1, float32Array[index]))
    const fullScale = clamped < 0 ? 0x8000 : 0x7fff
    // Third argument `true` writes the value little-endian.
    pcmView.setInt16(index * 2, clamped * fullScale, true)
    index += 1
  }

  return pcmBuffer
}

/**
 * Encode an ArrayBuffer as a base64 string for WebSocket transport.
 *
 * @param buffer - Raw bytes to encode.
 * @returns The base64 string produced by `btoa`.
 */
export function arrayBufferToBase64(buffer: ArrayBuffer): string {
  // Fold the bytes into the one-character-per-byte binary string `btoa` expects.
  const binary = new Uint8Array(buffer).reduce(
    (acc, byte) => acc + String.fromCharCode(byte),
    ''
  )
  return btoa(binary)
}
15 changes: 15 additions & 0 deletions apps/sim/lib/speech/config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import { env } from '@/lib/core/config/env'

/** ElevenLabs realtime speech-to-text WebSocket endpoint. */
export const ELEVENLABS_WS_URL = 'wss://api.elevenlabs.io/v1/speech-to-text/realtime'
/** Audio sample rate in Hz (presumably the rate of the PCM sent to the STT socket — confirm at the usage sites). */
export const SAMPLE_RATE = 16000
/** Interval in milliseconds between audio chunk sends, per the name; the timers using it live in the consumers. */
export const CHUNK_SEND_INTERVAL_MS = 250
/** Maximum session length: 3 minutes, expressed in milliseconds. */
export const MAX_SESSION_MS = 3 * 60 * 1000

/**
 * Reports whether a speech-to-text provider is configured.
 *
 * Currently this only checks `ELEVENLABS_API_KEY`: the key must be present
 * and non-empty after trimming whitespace.
 * To add a new provider: add its env check here.
 *
 * @returns `true` when the key is set to a non-blank value, otherwise `false`.
 */
export function hasSTTService(): boolean {
  const apiKey = env.ELEVENLABS_API_KEY?.trim()
  return Boolean(apiKey)
}
60 changes: 0 additions & 60 deletions apps/sim/lib/speech/providers/elevenlabs.ts

This file was deleted.

54 changes: 0 additions & 54 deletions apps/sim/lib/speech/transcriber.ts

This file was deleted.

Loading