Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
feat(voice): voice input migration to eleven labs (#4041)
* feat(speech): unified voice interface

* add metering for voice input usage

* ip key

* use shared getclientip helper, fix deployed chat

* cleanup code

* prep merge

* merge staging in

* add billing check

* add voice input section

* remove skip billing

* address comments
  • Loading branch information
icecrasher321 authored Apr 8, 2026
commit efb582e96a3a8dd393c721ce45ec6c867220e677
15 changes: 15 additions & 0 deletions apps/docs/content/docs/en/execution/costs.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,21 @@ Use your own API keys for AI model providers instead of Sim's hosted keys to pay

When configured, workflows use your key instead of Sim's hosted keys. If removed, workflows automatically fall back to hosted keys with the multiplier.

## Voice Input

Voice input uses ElevenLabs Scribe v2 Realtime for speech-to-text transcription. It is available in the Mothership chat and in deployed chat voice mode.

| Context | Cost per session | Max duration |
|---------|-----------------|--------------|
| Mothership (workspace) | ~5 credits ($0.024) | 3 minutes |
| Deployed chat (voice mode) | ~2 credits ($0.008) | 1 minute |

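Each voice session is billed a flat fee when it starts, based on the maximum duration for its context rather than on how long you actually speak. In deployed chat voice mode, each conversation turn (speak → agent responds → speak again) is a separate session, so multi-turn conversations are billed once per turn.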

<Callout type="info">
Voice input requires `ELEVENLABS_API_KEY` to be configured. When the key is not set, voice input controls are hidden.
</Callout>

## Plans

Sim has two paid plan tiers — **Pro** and **Max**. Either can be used individually or with a team. Team plans pool credits across all seats in the organization.
Expand Down
6 changes: 3 additions & 3 deletions apps/sim/app/api/a2a/serve/[agentId]/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
import { type AuthResult, AuthType, checkHybridAuth } from '@/lib/auth/hybrid'
import { acquireLock, getRedisClient, releaseLock } from '@/lib/core/config/redis'
import { validateUrlWithDNS } from '@/lib/core/security/input-validation.server'
import { getClientIp } from '@/lib/core/utils/request'
import { SSE_HEADERS } from '@/lib/core/utils/sse'
import { getBaseUrl } from '@/lib/core/utils/urls'
import { generateId } from '@/lib/core/utils/uuid'
Expand Down Expand Up @@ -52,10 +53,9 @@ function getCallerFingerprint(request: NextRequest, userId?: string | null): str
return `user:${userId}`
}

const forwardedFor = request.headers.get('x-forwarded-for')?.split(',')[0]?.trim()
const realIp = request.headers.get('x-real-ip')?.trim()
const clientIp = getClientIp(request)
const userAgent = request.headers.get('user-agent')?.trim() || 'unknown'
return `public:${forwardedFor || realIp || 'unknown'}:${userAgent}`
return `public:${clientIp}:${userAgent}`
}

function hasCallerAccessToTask(
Expand Down
4 changes: 2 additions & 2 deletions apps/sim/app/api/demo-requests/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { type NextRequest, NextResponse } from 'next/server'
import { env } from '@/lib/core/config/env'
import type { TokenBucketConfig } from '@/lib/core/rate-limiter'
import { RateLimiter } from '@/lib/core/rate-limiter'
import { generateRequestId } from '@/lib/core/utils/request'
import { generateRequestId, getClientIp } from '@/lib/core/utils/request'
import { getEmailDomain } from '@/lib/core/utils/urls'
import { sendEmail } from '@/lib/messaging/email/mailer'
import { getFromEmailAddress } from '@/lib/messaging/email/utils'
Expand All @@ -25,7 +25,7 @@ export async function POST(req: NextRequest) {
const requestId = generateRequestId()

try {
const ip = req.headers.get('x-forwarded-for')?.split(',')[0]?.trim() ?? 'unknown'
const ip = getClientIp(req)
const storageKey = `public:demo-request:${ip}`

const { allowed, remaining, resetAt } = await rateLimiter.checkRateLimitDirect(
Expand Down
4 changes: 2 additions & 2 deletions apps/sim/app/api/help/integration-request/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { z } from 'zod'
import { env } from '@/lib/core/config/env'
import type { TokenBucketConfig } from '@/lib/core/rate-limiter'
import { RateLimiter } from '@/lib/core/rate-limiter'
import { generateRequestId } from '@/lib/core/utils/request'
import { generateRequestId, getClientIp } from '@/lib/core/utils/request'
import { getEmailDomain } from '@/lib/core/utils/urls'
import { sendEmail } from '@/lib/messaging/email/mailer'
import {
Expand Down Expand Up @@ -37,7 +37,7 @@ export async function POST(req: NextRequest) {
const requestId = generateRequestId()

try {
const ip = req.headers.get('x-forwarded-for')?.split(',')[0]?.trim() ?? 'unknown'
const ip = getClientIp(req)
const storageKey = `public:integration-request:${ip}`

const { allowed, remaining, resetAt } = await rateLimiter.checkRateLimitDirect(
Expand Down
11 changes: 11 additions & 0 deletions apps/sim/app/api/settings/voice/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import { NextResponse } from 'next/server'
import { hasSTTService } from '@/lib/speech/config'

/**
 * Voice settings probe: tells the client whether a server-side
 * speech-to-text service is configured.
 *
 * No authentication is required. The payload is a single boolean flag, which
 * is not sensitive, and visitors of deployed chats need to read it too.
 */
export async function GET() {
  const sttAvailable = hasSTTService()
  return NextResponse.json({ sttAvailable })
}
171 changes: 171 additions & 0 deletions apps/sim/app/api/speech/token/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
import { db } from '@sim/db'
import { chat } from '@sim/db/schema'
import { createLogger } from '@sim/logger'
import { eq } from 'drizzle-orm'
import { type NextRequest, NextResponse } from 'next/server'
import { getSession } from '@/lib/auth'
import { hasExceededCostLimit } from '@/lib/billing/core/subscription'
import { recordUsage } from '@/lib/billing/core/usage-log'
import { env } from '@/lib/core/config/env'
import { getCostMultiplier, isBillingEnabled } from '@/lib/core/config/feature-flags'
import { RateLimiter } from '@/lib/core/rate-limiter'
import { validateAuthToken } from '@/lib/core/security/deployment'
import { getClientIp } from '@/lib/core/utils/request'

const logger = createLogger('SpeechTokenAPI')

export const dynamic = 'force-dynamic'

const ELEVENLABS_TOKEN_URL = 'https://api.elevenlabs.io/v1/single-use-token/realtime_scribe'

// Per-minute price used to compute the flat voice-session charge (presumably USD — TODO confirm).
const VOICE_SESSION_COST_PER_MIN = 0.008
// Minutes billed per session in each context (workspace vs. deployed chat).
// The names suggest these are also the session length caps; enforcement is not visible in this file.
const WORKSPACE_SESSION_MAX_MINUTES = 3
const CHAT_SESSION_MAX_MINUTES = 1

// Limiter settings passed to `checkRateLimitDirect` for token requests.
// NOTE(review): presumably a token bucket — burst of 30, refilling 3 tokens every 72 s
// (about 150 requests/hour) — confirm against RateLimiter.
const STT_TOKEN_RATE_LIMIT = {
maxTokens: 30,
refillRate: 3,
refillIntervalMs: 72 * 1000,
} as const

const rateLimiter = new RateLimiter()

/**
 * Checks whether the current request may use voice input for a deployed chat.
 *
 * The chat is looked up by id; unknown or inactive chats are rejected. Chats
 * with `authType === 'public'` are always accepted. Every other auth type
 * requires a `chat_auth_<chatId>` cookie that passes `validateAuthToken`.
 * Any error thrown along the way is logged and treated as a rejection.
 *
 * @param request - Incoming request; only its cookies are read.
 * @param chatId - Id of the deployed chat.
 * @returns `valid: true` together with the chat owner's user id on success,
 *   `valid: false` otherwise.
 */
async function validateChatAuth(
  request: NextRequest,
  chatId: string
): Promise<{ valid: boolean; ownerId?: string }> {
  try {
    const [deployment] = await db
      .select({
        id: chat.id,
        userId: chat.userId,
        isActive: chat.isActive,
        authType: chat.authType,
        password: chat.password,
      })
      .from(chat)
      .where(eq(chat.id, chatId))
      .limit(1)

    // Covers both "no such chat" and "chat exists but is disabled".
    if (!deployment?.isActive) {
      return { valid: false }
    }

    const isPublic = deployment.authType === 'public'
    if (!isPublic) {
      // Protected chats: the visitor must already hold a valid auth cookie.
      const authCookie = request.cookies.get(`chat_auth_${chatId}`)
      const hasValidCookie = authCookie
        ? validateAuthToken(authCookie.value, chatId, deployment.password)
        : false
      if (!hasValidCookie) {
        return { valid: false }
      }
    }

    return { valid: true, ownerId: deployment.userId }
  } catch (error) {
    logger.error('Error validating chat auth for STT:', error)
    return { valid: false }
  }
}

/**
 * Issues a single-use ElevenLabs realtime Scribe token for voice input.
 *
 * When the JSON body carries a `chatId`, access is checked with `validateChatAuth`
 * and usage is attributed to the chat owner; otherwise a logged-in session is
 * required and usage is attributed to that user.
 *
 * Responds 401 when unauthorized, 429 when rate limited, 402 when the billed user
 * has exceeded their cost limit, 503 when `ELEVENLABS_API_KEY` is not set, 502 when
 * ElevenLabs rejects the token request, and 500 on any unexpected error.
 */
export async function POST(request: NextRequest) {
try {
// A missing or malformed JSON body is treated as an empty object, which selects the session flow.
const body = await request.json().catch(() => ({}))
// NOTE(review): chatId is cast, not validated — a non-string value would reach the
// chat lookup and the rate-limit key as-is.
const chatId = body?.chatId as string | undefined

let billingUserId: string | undefined

if (chatId) {
// Deployed chat: the chat owner is the billed user.
const chatAuth = await validateChatAuth(request, chatId)
if (!chatAuth.valid) {
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
}
billingUserId = chatAuth.ownerId
} else {
// Workspace: the signed-in user is the billed user.
const session = await getSession()
if (!session?.user?.id) {
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
}
billingUserId = session.user.id
}

// Rate limiting is only applied when billing is enabled.
// Deployed chats are keyed per chat and client IP; workspace requests per user.
if (isBillingEnabled) {
const rateLimitKey = chatId
? `stt-token:chat:${chatId}:${getClientIp(request)}`
: `stt-token:user:${billingUserId}`

const rateCheck = await rateLimiter.checkRateLimitDirect(rateLimitKey, STT_TOKEN_RATE_LIMIT)
if (!rateCheck.allowed) {
// Retry-After is sent in whole seconds; defaults to 60 s when the limiter reports no delay.
return NextResponse.json(
{ error: 'Voice input rate limit exceeded. Please try again later.' },
{
status: 429,
headers: {
'Retry-After': String(Math.ceil((rateCheck.retryAfterMs ?? 60000) / 1000)),
},
}
)
}
}

// Refuse new sessions once the billed user is over their cost limit.
if (billingUserId && isBillingEnabled) {
const exceeded = await hasExceededCostLimit(billingUserId)
if (exceeded) {
return NextResponse.json(
{ error: 'Usage limit exceeded. Please upgrade your plan to continue.' },
{ status: 402 }
)
}
}

// An unset or whitespace-only key means the service is not configured.
const apiKey = env.ELEVENLABS_API_KEY
if (!apiKey?.trim()) {
return NextResponse.json(
{ error: 'Speech-to-text service is not configured' },
{ status: 503 }
)
}

const response = await fetch(ELEVENLABS_TOKEN_URL, {
method: 'POST',
headers: { 'xi-api-key': apiKey },
})

if (!response.ok) {
// NOTE(review): the upstream `detail`/`message` value is returned to the caller verbatim.
const errBody = await response.json().catch(() => ({}))
const message =
errBody.detail || errBody.message || `Token request failed (${response.status})`
logger.error('ElevenLabs token request failed', { status: response.status, message })
return NextResponse.json({ error: message }, { status: 502 })
}

const data = await response.json()

// Flat per-session charge: per-minute rate times the context's maximum duration,
// recorded as soon as the token has been issued.
if (billingUserId) {
const maxMinutes = chatId ? CHAT_SESSION_MAX_MINUTES : WORKSPACE_SESSION_MAX_MINUTES
const sessionCost = VOICE_SESSION_COST_PER_MIN * maxMinutes

// A failed usage write is logged and does not block the token response.
await recordUsage({
userId: billingUserId,
entries: [
{
category: 'fixed',
source: 'voice-input',
description: `Voice input session (${maxMinutes} min)`,
cost: sessionCost * getCostMultiplier(),
},
],
}).catch((err) => {
logger.warn('Failed to record voice input usage, continuing:', err)
})
}

// NOTE(review): data.token is not checked for presence before usage is recorded or the response is sent.
return NextResponse.json({ token: data.token })
Comment on lines +146 to +165
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Usage billed before microphone permission is confirmed

The server records usage (line 150) and issues the ElevenLabs token before the client has checked or obtained microphone access. In use-speech-to-text.ts, getUserMedia is called only after the token response is received. If the user denies microphone permission, NotAllowedError is thrown, cleanup runs, and the session is abandoned — but billing was already recorded and the single-use token was consumed.

Consider requesting getUserMedia on the client before calling /api/speech/token, or deferring usage recording to after the WebSocket connection is successfully established.

} catch (error) {
// Unexpected failures are logged and surfaced as a 500 carrying the error's message.
const message = error instanceof Error ? error.message : 'Failed to generate speech token'
logger.error('Speech token error:', error)
return NextResponse.json({ error: message }, { status: 500 })
}
}
13 changes: 12 additions & 1 deletion apps/sim/app/chat/[identifier]/chat.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,14 @@ export default function ChatClient({ identifier }: { identifier: string }) {
const [authRequired, setAuthRequired] = useState<'password' | 'email' | 'sso' | null>(null)

const [isVoiceFirstMode, setIsVoiceFirstMode] = useState(false)
const [sttAvailable, setSttAvailable] = useState(false)

useEffect(() => {
fetch('/api/settings/voice')
.then((r) => (r.ok ? r.json() : { sttAvailable: false }))
.then((data) => setSttAvailable(data.sttAvailable === true))
.catch(() => setSttAvailable(false))
}, [])
const { isStreamingResponse, abortControllerRef, stopStreaming, handleStreamedResponse } =
useChatStreaming()
const audioContextRef = useRef<AudioContext | null>(null)
Expand Down Expand Up @@ -443,8 +451,9 @@ export default function ChatClient({ identifier }: { identifier: string }) {
}, [isStreamingResponse, stopStreaming, setMessages, stopAudio])

const handleVoiceStart = useCallback(() => {
if (!sttAvailable) return
setIsVoiceFirstMode(true)
}, [])
}, [sttAvailable])

const handleExitVoiceMode = useCallback(() => {
setIsVoiceFirstMode(false)
Expand Down Expand Up @@ -494,6 +503,7 @@ export default function ChatClient({ identifier }: { identifier: string }) {
isStreaming={isStreamingResponse}
isPlayingAudio={isPlayingAudio}
audioContextRef={audioContextRef}
chatId={chatConfig?.id}
messages={messages.map((msg) => ({
content: typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content),
type: msg.type,
Expand Down Expand Up @@ -529,6 +539,7 @@ export default function ChatClient({ identifier }: { identifier: string }) {
isStreaming={isStreamingResponse}
onStopStreaming={() => stopStreaming(setMessages)}
onVoiceStart={handleVoiceStart}
sttAvailable={sttAvailable}
/>
</div>
</div>
Expand Down
22 changes: 11 additions & 11 deletions apps/sim/app/chat/components/input/input.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,6 @@ const logger = createLogger('ChatInput')

const MAX_TEXTAREA_HEIGHT = 200

const IS_STT_AVAILABLE =
typeof window !== 'undefined' &&
!!(
(window as Window & { SpeechRecognition?: unknown; webkitSpeechRecognition?: unknown })
.SpeechRecognition ||
(window as Window & { webkitSpeechRecognition?: unknown }).webkitSpeechRecognition
)

interface AttachedFile {
id: string
name: string
Expand All @@ -37,7 +29,15 @@ export const ChatInput: React.FC<{
onStopStreaming?: () => void
onVoiceStart?: () => void
voiceOnly?: boolean
}> = ({ onSubmit, isStreaming = false, onStopStreaming, onVoiceStart, voiceOnly = false }) => {
sttAvailable?: boolean
}> = ({
onSubmit,
isStreaming = false,
onStopStreaming,
onVoiceStart,
voiceOnly = false,
sttAvailable = false,
}) => {
const fileInputRef = useRef<HTMLInputElement>(null)
const textareaRef = useRef<HTMLTextAreaElement>(null)
const [inputValue, setInputValue] = useState('')
Expand Down Expand Up @@ -142,7 +142,7 @@ export const ChatInput: React.FC<{
return (
<Tooltip.Provider>
<div className='flex items-center justify-center'>
{IS_STT_AVAILABLE && (
{sttAvailable && (
<Tooltip.Root>
<Tooltip.Trigger asChild>
<div>
Expand Down Expand Up @@ -295,7 +295,7 @@ export const ChatInput: React.FC<{

{/* Right: mic + send */}
<div className='flex items-center gap-1.5'>
{IS_STT_AVAILABLE && (
{sttAvailable && (
<Tooltip.Root>
<Tooltip.Trigger asChild>
<button
Expand Down
Loading
Loading