feat(voice): voice input migration to eleven labs (#4041)

* feat(speech): unified voice interface * add metering for voice input usage * ip key * use shared getclientip helper, fix deployed chat * cleanup code * prep merge * merge staging in * add billing check * add voice input section * remove skip billing * address comments
simstudioai · waleedlatif1 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
commit efb582e96a3a8dd393c721ce45ec6c867220e677
diff --git a/apps/docs/content/docs/en/execution/costs.mdx b/apps/docs/content/docs/en/execution/costs.mdx
@@ -135,6 +135,21 @@ Use your own API keys for AI model providers instead of Sim's hosted keys to pay
 
 When configured, workflows use your key instead of Sim's hosted keys. If removed, workflows automatically fall back to hosted keys with the multiplier.
 
+## Voice Input
+
+Voice input uses ElevenLabs Scribe v2 Realtime for speech-to-text transcription. It is available in the Mothership chat and in deployed chat voice mode.
+
+| Context | Cost per session | Max duration |
+|---------|-----------------|--------------|
+| Mothership (workspace) | ~5 credits ($0.024) | 3 minutes |
+| Deployed chat (voice mode) | ~2 credits ($0.008) | 1 minute |
+
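+Each voice session is billed a flat fee when it starts, based on the maximum session duration rather than the time actually used. Deployed chat sessions are billed to the owner of the chat. In deployed chat voice mode, each conversation turn (speak → agent responds → speak again) is a separate session, so multi-turn conversations are billed per turn.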
+
+<Callout type="info">
+  Voice input requires `ELEVENLABS_API_KEY` to be configured. When the key is not set, voice input controls are hidden.
+</Callout>
+
 ## Plans
 
 Sim has two paid plan tiers — **Pro** and **Max**. Either can be used individually or with a team. Team plans pool credits across all seats in the organization.

diff --git a/apps/sim/app/api/a2a/serve/[agentId]/route.ts b/apps/sim/app/api/a2a/serve/[agentId]/route.ts
@@ -15,6 +15,7 @@ import {
 import { type AuthResult, AuthType, checkHybridAuth } from '@/lib/auth/hybrid'
 import { acquireLock, getRedisClient, releaseLock } from '@/lib/core/config/redis'
 import { validateUrlWithDNS } from '@/lib/core/security/input-validation.server'
+import { getClientIp } from '@/lib/core/utils/request'
 import { SSE_HEADERS } from '@/lib/core/utils/sse'
 import { getBaseUrl } from '@/lib/core/utils/urls'
 import { generateId } from '@/lib/core/utils/uuid'
@@ -52,10 +53,9 @@ function getCallerFingerprint(request: NextRequest, userId?: string | null): str
     return `user:${userId}`
   }
 
-  const forwardedFor = request.headers.get('x-forwarded-for')?.split(',')[0]?.trim()
-  const realIp = request.headers.get('x-real-ip')?.trim()
+  const clientIp = getClientIp(request)
   const userAgent = request.headers.get('user-agent')?.trim() || 'unknown'
-  return `public:${forwardedFor || realIp || 'unknown'}:${userAgent}`
+  return `public:${clientIp}:${userAgent}`
 }
 
 function hasCallerAccessToTask(

diff --git a/apps/sim/app/api/demo-requests/route.ts b/apps/sim/app/api/demo-requests/route.ts
@@ -3,7 +3,7 @@ import { type NextRequest, NextResponse } from 'next/server'
 import { env } from '@/lib/core/config/env'
 import type { TokenBucketConfig } from '@/lib/core/rate-limiter'
 import { RateLimiter } from '@/lib/core/rate-limiter'
-import { generateRequestId } from '@/lib/core/utils/request'
+import { generateRequestId, getClientIp } from '@/lib/core/utils/request'
 import { getEmailDomain } from '@/lib/core/utils/urls'
 import { sendEmail } from '@/lib/messaging/email/mailer'
 import { getFromEmailAddress } from '@/lib/messaging/email/utils'
@@ -25,7 +25,7 @@ export async function POST(req: NextRequest) {
   const requestId = generateRequestId()
 
   try {
-    const ip = req.headers.get('x-forwarded-for')?.split(',')[0]?.trim() ?? 'unknown'
+    const ip = getClientIp(req)
     const storageKey = `public:demo-request:${ip}`
 
     const { allowed, remaining, resetAt } = await rateLimiter.checkRateLimitDirect(

diff --git a/apps/sim/app/api/help/integration-request/route.ts b/apps/sim/app/api/help/integration-request/route.ts
@@ -4,7 +4,7 @@ import { z } from 'zod'
 import { env } from '@/lib/core/config/env'
 import type { TokenBucketConfig } from '@/lib/core/rate-limiter'
 import { RateLimiter } from '@/lib/core/rate-limiter'
-import { generateRequestId } from '@/lib/core/utils/request'
+import { generateRequestId, getClientIp } from '@/lib/core/utils/request'
 import { getEmailDomain } from '@/lib/core/utils/urls'
 import { sendEmail } from '@/lib/messaging/email/mailer'
 import {
@@ -37,7 +37,7 @@ export async function POST(req: NextRequest) {
   const requestId = generateRequestId()
 
   try {
-    const ip = req.headers.get('x-forwarded-for')?.split(',')[0]?.trim() ?? 'unknown'
+    const ip = getClientIp(req)
     const storageKey = `public:integration-request:${ip}`
 
     const { allowed, remaining, resetAt } = await rateLimiter.checkRateLimitDirect(

diff --git a/apps/sim/app/api/settings/voice/route.ts b/apps/sim/app/api/settings/voice/route.ts
@@ -0,0 +1,11 @@
+import { NextResponse } from 'next/server'
+import { hasSTTService } from '@/lib/speech/config'
+
+/**
+ * Reports whether server-side speech-to-text is configured.
+ * Intentionally unauthenticated: the payload is one boolean flag,
+ * nothing sensitive, and visitors of deployed chats must be able to read it.
+ */
+export async function GET() {
+  const sttAvailable = hasSTTService()
+  return NextResponse.json({ sttAvailable })
+}
diff --git a/apps/sim/app/api/speech/token/route.ts b/apps/sim/app/api/speech/token/route.ts
@@ -0,0 +1,171 @@
+import { db } from '@sim/db'
+import { chat } from '@sim/db/schema'
+import { createLogger } from '@sim/logger'
+import { eq } from 'drizzle-orm'
+import { type NextRequest, NextResponse } from 'next/server'
+import { getSession } from '@/lib/auth'
+import { hasExceededCostLimit } from '@/lib/billing/core/subscription'
+import { recordUsage } from '@/lib/billing/core/usage-log'
+import { env } from '@/lib/core/config/env'
+import { getCostMultiplier, isBillingEnabled } from '@/lib/core/config/feature-flags'
+import { RateLimiter } from '@/lib/core/rate-limiter'
+import { validateAuthToken } from '@/lib/core/security/deployment'
+import { getClientIp } from '@/lib/core/utils/request'
+
+const logger = createLogger('SpeechTokenAPI')
+
+export const dynamic = 'force-dynamic' // Next.js route segment config: evaluate this handler on every request
+
+const ELEVENLABS_TOKEN_URL = 'https://api.elevenlabs.io/v1/single-use-token/realtime_scribe' // POSTed to with the server-side API key; the returned token is what the client receives
+
+const VOICE_SESSION_COST_PER_MIN = 0.008 // per-minute rate; multiplied by the session's max minutes to get the flat cost recorded per token
+const WORKSPACE_SESSION_MAX_MINUTES = 3 // applies when the request has no chatId (session-authenticated user)
+const CHAT_SESSION_MAX_MINUTES = 1 // applies when the request carries a chatId (deployed chat)
+
+const STT_TOKEN_RATE_LIMIT = { // handed to rateLimiter.checkRateLimitDirect when billing is enabled
+  maxTokens: 30,
+  refillRate: 3,
+  refillIntervalMs: 72 * 1000, // 72 seconds; presumably 3 tokens are restored per interval — confirm against RateLimiter
+} as const
+
+const rateLimiter = new RateLimiter()
+
+/**
+ * Decides whether the caller may request an STT token on behalf of a deployed chat.
+ *
+ * The chat is looked up by id; a missing or inactive chat is rejected. A chat whose
+ * `authType` is `'public'` is accepted outright. Any other auth type requires a
+ * `chat_auth_<chatId>` cookie whose value passes `validateAuthToken`.
+ *
+ * @param request - Incoming request; only its cookies are read.
+ * @param chatId - Id of the deployed chat the token is requested for.
+ * @returns `{ valid: true, ownerId }` carrying the chat's `userId` on success,
+ *   otherwise `{ valid: false }`. Errors during the lookup are logged and
+ *   reported as invalid rather than thrown.
+ */
+async function validateChatAuth(
+  request: NextRequest,
+  chatId: string
+): Promise<{ valid: boolean; ownerId?: string }> {
+  try {
+    const [record] = await db
+      .select({
+        id: chat.id,
+        userId: chat.userId,
+        isActive: chat.isActive,
+        authType: chat.authType,
+        password: chat.password,
+      })
+      .from(chat)
+      .where(eq(chat.id, chatId))
+      .limit(1)
+
+    // No row, or the chat has been deactivated.
+    if (!record?.isActive) {
+      return { valid: false }
+    }
+
+    // Public chats need no further proof from the caller.
+    if (record.authType === 'public') {
+      return { valid: true, ownerId: record.userId }
+    }
+
+    // Every other auth type relies on the per-chat auth cookie.
+    const cookie = request.cookies.get(`chat_auth_${chatId}`)
+    const cookieAccepted = cookie
+      ? validateAuthToken(cookie.value, chatId, record.password)
+      : false
+
+    return cookieAccepted ? { valid: true, ownerId: record.userId } : { valid: false }
+  } catch (error) {
+    logger.error('Error validating chat auth for STT:', error)
+    return { valid: false }
+  }
+}
+
+/**
+ * Reduces an upstream error payload to a plain string for logging and for the
+ * client response. Only string values are forwarded: a non-string `detail` is
+ * unwrapped through its own `message` field when present, so an object is never
+ * returned to the caller as the error text.
+ */
+function extractUpstreamErrorMessage(errBody: unknown, status: number): string {
+  const fallback = `Token request failed (${status})`
+  if (typeof errBody !== 'object' || errBody === null) return fallback
+
+  const { detail, message } = errBody as { detail?: unknown; message?: unknown }
+  if (typeof detail === 'string' && detail) return detail
+  if (typeof detail === 'object' && detail !== null) {
+    const nested = (detail as { message?: unknown }).message
+    if (typeof nested === 'string' && nested) return nested
+  }
+  if (typeof message === 'string' && message) return message
+  return fallback
+}
+
+/**
+ * Issues a single-use ElevenLabs realtime STT token.
+ *
+ * Flow:
+ *  1. Authorize: with a `chatId` in the JSON body the deployed chat is validated
+ *     and its owner becomes the billed user; without one a logged-in session is
+ *     required and that user is billed.
+ *  2. When billing is enabled, apply the STT rate limit (per chat + client IP, or
+ *     per user) and reject users that exceeded their cost limit.
+ *  3. Request the token from ElevenLabs using the server-side API key.
+ *  4. Record a flat usage entry (per-minute rate × max session minutes × cost
+ *     multiplier). A failure to record usage is logged and does not block the token.
+ *
+ * Responses: 200 `{ token }`, 400 malformed `chatId`, 401 unauthorized,
+ * 402 usage limit exceeded, 429 rate limited (with `Retry-After`),
+ * 502 upstream failure or unusable upstream response, 503 STT not configured,
+ * 500 unexpected error.
+ */
+export async function POST(request: NextRequest) {
+  try {
+    const body: unknown = await request.json().catch(() => ({}))
+    const rawChatId =
+      typeof body === 'object' && body !== null ? (body as { chatId?: unknown }).chatId : undefined
+
+    // chatId is untrusted input that ends up in a DB query, a cookie name and a
+    // rate-limit key, so anything truthy that is not a string is rejected up front.
+    if (rawChatId && typeof rawChatId !== 'string') {
+      return NextResponse.json({ error: 'Invalid chatId' }, { status: 400 })
+    }
+    const chatId = typeof rawChatId === 'string' && rawChatId ? rawChatId : undefined
+
+    let billingUserId: string | undefined
+
+    if (chatId) {
+      const chatAuth = await validateChatAuth(request, chatId)
+      if (!chatAuth.valid) {
+        return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
+      }
+      billingUserId = chatAuth.ownerId
+    } else {
+      const session = await getSession()
+      if (!session?.user?.id) {
+        return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
+      }
+      billingUserId = session.user.id
+    }
+
+    if (isBillingEnabled) {
+      // Deployed chats are keyed by chat + client IP; workspace use is keyed by user.
+      const rateLimitKey = chatId
+        ? `stt-token:chat:${chatId}:${getClientIp(request)}`
+        : `stt-token:user:${billingUserId}`
+
+      const rateCheck = await rateLimiter.checkRateLimitDirect(rateLimitKey, STT_TOKEN_RATE_LIMIT)
+      if (!rateCheck.allowed) {
+        return NextResponse.json(
+          { error: 'Voice input rate limit exceeded. Please try again later.' },
+          {
+            status: 429,
+            headers: {
+              'Retry-After': String(Math.ceil((rateCheck.retryAfterMs ?? 60000) / 1000)),
+            },
+          }
+        )
+      }
+    }
+
+    if (billingUserId && isBillingEnabled) {
+      const exceeded = await hasExceededCostLimit(billingUserId)
+      if (exceeded) {
+        return NextResponse.json(
+          { error: 'Usage limit exceeded. Please upgrade your plan to continue.' },
+          { status: 402 }
+        )
+      }
+    }
+
+    const apiKey = env.ELEVENLABS_API_KEY
+    if (!apiKey?.trim()) {
+      return NextResponse.json(
+        { error: 'Speech-to-text service is not configured' },
+        { status: 503 }
+      )
+    }
+
+    const response = await fetch(ELEVENLABS_TOKEN_URL, {
+      method: 'POST',
+      headers: { 'xi-api-key': apiKey },
+    })
+
+    if (!response.ok) {
+      const errBody: unknown = await response.json().catch(() => ({}))
+      const message = extractUpstreamErrorMessage(errBody, response.status)
+      logger.error('ElevenLabs token request failed', { status: response.status, message })
+      return NextResponse.json({ error: message }, { status: 502 })
+    }
+
+    // Validate the token before billing so a malformed upstream response is
+    // neither charged nor returned to the client as a token-less 200.
+    const data: unknown = await response.json().catch(() => null)
+    const token =
+      typeof data === 'object' && data !== null ? (data as { token?: unknown }).token : undefined
+    if (typeof token !== 'string' || !token) {
+      logger.error('ElevenLabs token response did not include a token')
+      return NextResponse.json(
+        { error: 'Speech-to-text service returned an invalid response' },
+        { status: 502 }
+      )
+    }
+
+    if (billingUserId) {
+      const maxMinutes = chatId ? CHAT_SESSION_MAX_MINUTES : WORKSPACE_SESSION_MAX_MINUTES
+      const sessionCost = VOICE_SESSION_COST_PER_MIN * maxMinutes
+
+      // Best effort: the token is still returned when usage cannot be recorded.
+      await recordUsage({
+        userId: billingUserId,
+        entries: [
+          {
+            category: 'fixed',
+            source: 'voice-input',
+            description: `Voice input session (${maxMinutes} min)`,
+            cost: sessionCost * getCostMultiplier(),
+          },
+        ],
+      }).catch((err) => {
+        logger.warn('Failed to record voice input usage, continuing:', err)
+      })
+    }
+
+    return NextResponse.json({ token })
+  } catch (error) {
+    const message = error instanceof Error ? error.message : 'Failed to generate speech token'
+    logger.error('Speech token error:', error)
+    return NextResponse.json({ error: message }, { status: 500 })
+  }
+}
diff --git a/apps/sim/app/chat/[identifier]/chat.tsx b/apps/sim/app/chat/[identifier]/chat.tsx
@@ -127,6 +127,14 @@ export default function ChatClient({ identifier }: { identifier: string }) {
   const [authRequired, setAuthRequired] = useState<'password' | 'email' | 'sso' | null>(null)
 
   const [isVoiceFirstMode, setIsVoiceFirstMode] = useState(false)
+  const [sttAvailable, setSttAvailable] = useState(false)
+
+  useEffect(() => {
+    fetch('/api/settings/voice')
+      .then((r) => (r.ok ? r.json() : { sttAvailable: false }))
+      .then((data) => setSttAvailable(data.sttAvailable === true))
+      .catch(() => setSttAvailable(false))
+  }, [])
   const { isStreamingResponse, abortControllerRef, stopStreaming, handleStreamedResponse } =
     useChatStreaming()
   const audioContextRef = useRef<AudioContext | null>(null)
@@ -443,8 +451,9 @@ export default function ChatClient({ identifier }: { identifier: string }) {
   }, [isStreamingResponse, stopStreaming, setMessages, stopAudio])
 
   const handleVoiceStart = useCallback(() => {
+    if (!sttAvailable) return
     setIsVoiceFirstMode(true)
-  }, [])
+  }, [sttAvailable])
 
   const handleExitVoiceMode = useCallback(() => {
     setIsVoiceFirstMode(false)
@@ -494,6 +503,7 @@ export default function ChatClient({ identifier }: { identifier: string }) {
         isStreaming={isStreamingResponse}
         isPlayingAudio={isPlayingAudio}
         audioContextRef={audioContextRef}
+        chatId={chatConfig?.id}
         messages={messages.map((msg) => ({
           content: typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content),
           type: msg.type,
@@ -529,6 +539,7 @@ export default function ChatClient({ identifier }: { identifier: string }) {
             isStreaming={isStreamingResponse}
             onStopStreaming={() => stopStreaming(setMessages)}
             onVoiceStart={handleVoiceStart}
+            sttAvailable={sttAvailable}
           />
         </div>
       </div>

diff --git a/apps/sim/app/chat/components/input/input.tsx b/apps/sim/app/chat/components/input/input.tsx
@@ -14,14 +14,6 @@ const logger = createLogger('ChatInput')
 
 const MAX_TEXTAREA_HEIGHT = 200
 
-const IS_STT_AVAILABLE =
-  typeof window !== 'undefined' &&
-  !!(
-    (window as Window & { SpeechRecognition?: unknown; webkitSpeechRecognition?: unknown })
-      .SpeechRecognition ||
-    (window as Window & { webkitSpeechRecognition?: unknown }).webkitSpeechRecognition
-  )
-
 interface AttachedFile {
   id: string
   name: string
@@ -37,7 +29,15 @@ export const ChatInput: React.FC<{
   onStopStreaming?: () => void
   onVoiceStart?: () => void
   voiceOnly?: boolean
-}> = ({ onSubmit, isStreaming = false, onStopStreaming, onVoiceStart, voiceOnly = false }) => {
+  sttAvailable?: boolean
+}> = ({
+  onSubmit,
+  isStreaming = false,
+  onStopStreaming,
+  onVoiceStart,
+  voiceOnly = false,
+  sttAvailable = false,
+}) => {
   const fileInputRef = useRef<HTMLInputElement>(null)
   const textareaRef = useRef<HTMLTextAreaElement>(null)
   const [inputValue, setInputValue] = useState('')
@@ -142,7 +142,7 @@ export const ChatInput: React.FC<{
     return (
       <Tooltip.Provider>
         <div className='flex items-center justify-center'>
-          {IS_STT_AVAILABLE && (
+          {sttAvailable && (
             <Tooltip.Root>
               <Tooltip.Trigger asChild>
                 <div>
@@ -295,7 +295,7 @@ export const ChatInput: React.FC<{
 
               {/* Right: mic + send */}
               <div className='flex items-center gap-1.5'>
-                {IS_STT_AVAILABLE && (
+                {sttAvailable && (
                   <Tooltip.Root>
                     <Tooltip.Trigger asChild>
                       <button