diff --git a/src/modules/speex/components/SpeexEngineConfig.tsx b/src/modules/speex/components/SpeexEngineConfig.tsx index 4dc0398dd..6ff26674f 100644 --- a/src/modules/speex/components/SpeexEngineConfig.tsx +++ b/src/modules/speex/components/SpeexEngineConfig.tsx @@ -144,6 +144,7 @@ function ElevenLabsConfig({ engine, onUpdate, mode }: { engine={engine} voiceId={voice.ttsVoiceId ?? null} onVoiceChange={handleVoiceChange} + autoPreview /> diff --git a/src/modules/speex/components/SpeexVoiceDropdown.tsx b/src/modules/speex/components/SpeexVoiceDropdown.tsx index 53aad1ebc..e7fe2e131 100644 --- a/src/modules/speex/components/SpeexVoiceDropdown.tsx +++ b/src/modules/speex/components/SpeexVoiceDropdown.tsx @@ -3,7 +3,8 @@ import { useQuery } from '@tanstack/react-query'; import { CircularProgress, Option, Select } from '@mui/joy'; import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown'; -import RecordVoiceOverTwoToneIcon from '@mui/icons-material/RecordVoiceOverTwoTone'; + +import { AudioPlayer } from '~/common/util/audio/AudioPlayer'; import type { DSpeexEngineAny, SpeexListVoiceOption } from '../speex.types'; import { speexListVoices_RPC } from '../protocols/rpc/rpc.client'; @@ -24,11 +25,24 @@ export function SpeexVoiceDropdown(props: { // external state - module const { voices, isLoading, error } = useSpeexVoices(engine); + // track user-initiated voice changes for preview (not initial load or voice list changes) + const [userSelectedVoiceId, setUserSelectedVoiceId] = React.useState(null); + + + // [effect] auto-preview: play voice sample only when user explicitly selects a voice + const selectedVoice = userSelectedVoiceId ? voices.find(v => v.id === userSelectedVoiceId) : null; + const previewUrl = (autoPreview && selectedVoice?.previewUrl) || null; + React.useEffect(() => { + if (previewUrl) + void AudioPlayer.playUrl(previewUrl); + }, [previewUrl]); + // handlers const handleVoiceChange = React.useCallback((_event: unknown, value: string | null) => { - if (value) onVoiceChange(value); + setUserSelectedVoiceId(value); + value && onVoiceChange(value); }, [onVoiceChange]); @@ -44,7 +58,7 @@ export function SpeexVoiceDropdown(props: { : voices.length === 0 ? 'No voices available' : 'Select a voice' } - startDecorator={} + // startDecorator={} endDecorator={isLoading && } indicator={} slotProps={{ @@ -53,7 +67,7 @@ export function SpeexVoiceDropdown(props: { }} > {voices.map(voice => ( - diff --git a/src/modules/speex/protocols/rpc/rpc.client.ts b/src/modules/speex/protocols/rpc/rpc.client.ts index 4f6235b4f..e124551a5 100644 --- a/src/modules/speex/protocols/rpc/rpc.client.ts +++ b/src/modules/speex/protocols/rpc/rpc.client.ts @@ -6,12 +6,14 @@ */ import { apiAsync, apiStream } from '~/common/util/trpc.client'; +import { convert_Base64_To_UInt8Array, convert_UInt8Array_To_Base64 } from '~/common/util/blobUtils'; import { findModelsServiceOrNull } from '~/common/stores/llms/store-llms'; import type { DLocalAIServiceSettings } from '~/modules/llms/vendors/localai/localai.vendor'; import type { DOpenAIServiceSettings } from '~/modules/llms/vendors/openai/openai.vendor'; import { AudioLivePlayer } from '~/common/util/audio/AudioLivePlayer'; +import { AudioPlayer } from '~/common/util/audio/AudioPlayer'; import type { DSpeexEngine, SpeexSpeakResult } from '../../speex.types'; import type { SpeexWire_Access, SpeexWire_ListVoices_Output, SpeexWire_Voice } from './rpc.wiretypes'; @@ -28,9 +30,10 @@ export async function speexSynthesize_RPC( text: string, options: { streaming: boolean; + languageCode?: string; + priority?: 'fast' | 'balanced' | 'quality'; playback: boolean; returnAudio: boolean; - languageCode?: string }, callbacks?: { onStart?: () => void; @@ -63,7 +66,7 @@ export async function speexSynthesize_RPC( // call the streaming RPC - whether the backend will stream in chunks or as a whole const particleStream = await apiStream.speex.synthesize.mutate( - { access, text, voice, streaming: options.streaming, languageCode: options.languageCode }, + { access, text, voice, streaming: options.streaming, languageCode: options.languageCode, priority: options.priority }, { signal: abortController.signal }, ); @@ -78,12 +81,16 @@ export async function speexSynthesize_RPC( case 'audio': // Decode base64 to ArrayBuffer - // const audioBuffer = convert_Base64_To_UInt8Array(particle.base64, 'speexSynthesize_RPC audio chunk'); // preload conversion - const audioBuffer = _base64ToArrayBuffer(particle.base64); + const audioBuffer = convert_Base64_To_UInt8Array(particle.base64, 'speex.rpc.client').buffer; - // Playback - if (options.playback) - audioPlayer?.enqueueChunk(audioBuffer); + // Playback: streaming uses AudioLivePlayer for chunked playback, + // non-streaming uses AudioPlayer for single-buffer playback + if (options.playback) { + if (particle.chunk) + audioPlayer?.enqueueChunk(audioBuffer); + else + void AudioPlayer.playBuffer(audioBuffer); // fire-and-forget for whole audio + } // Accumulate for return if (options.returnAudio) @@ -93,6 +100,10 @@ export async function speexSynthesize_RPC( callbacks?.onChunk?.(audioBuffer); break; + case 'log': + console.log(`[Speex] (${particle.level})`, particle.message); + break; + case 'done': audioPlayer?.endPlayback(); break; @@ -117,7 +128,7 @@ export async function speexSynthesize_RPC( combined.set(new Uint8Array(chunk), offset); offset += chunk.byteLength; } - result.audioBase64 = _arrayBufferToBase64(combined.buffer); + result.audioBase64 = convert_UInt8Array_To_Base64(combined, 'speex.rpc.client'); } return result; @@ -212,25 +223,3 @@ function _buildRPCWireAccess({ credentials: c, vendorType }: _DSpeexEngineRPC): } } } - -// Private: Helpers - -// TODO: use `blobUtils.ts` functions instead? - -function _base64ToArrayBuffer(base64: string): ArrayBuffer { - const binaryString = atob(base64); - const bytes = new Uint8Array(binaryString.length); - for (let i = 0; i < binaryString.length; i++) { - bytes[i] = binaryString.charCodeAt(i); - } - return bytes.buffer; -} - -function _arrayBufferToBase64(buffer: ArrayBuffer): string { - const bytes = new Uint8Array(buffer); - let binary = ''; - for (let i = 0; i < bytes.byteLength; i++) { - binary += String.fromCharCode(bytes[i]); - } - return btoa(binary); -} diff --git a/src/modules/speex/protocols/rpc/rpc.router.ts b/src/modules/speex/protocols/rpc/rpc.router.ts index 2f183550f..a4daae6c2 100644 --- a/src/modules/speex/protocols/rpc/rpc.router.ts +++ b/src/modules/speex/protocols/rpc/rpc.router.ts @@ -11,6 +11,7 @@ interface SynthesizeBackendFnParams { voice: SpeexWire_Voice; streaming: boolean; languageCode?: string; + priority?: 'fast' | 'balanced' | 'quality'; signal?: AbortSignal; } @@ -26,17 +27,18 @@ export const speexRouter = createTRPCRouter({ synthesize: edgeProcedure .input(SpeexWire.Synthesize_input_schema) .mutation(async function* ({ input, ctx }): AsyncGenerator { - const { access, text, voice, streaming, languageCode } = input; + const { access, text, voice, streaming, languageCode, priority } = input; + try { yield { t: 'start' }; switch (access.dialect) { case 'elevenlabs': - yield* synthesizeElevenLabs({ access, text, voice, streaming, languageCode, signal: ctx.reqSignal }); + yield* synthesizeElevenLabs({ access, text, voice, streaming, languageCode, priority, signal: ctx.reqSignal }); break; case 'localai': case 'openai': - yield* synthesizeOpenAIProtocol({ access, text, voice, streaming, languageCode, signal: ctx.reqSignal }); + yield* synthesizeOpenAIProtocol({ access, text, voice, streaming, languageCode, priority, signal: ctx.reqSignal }); break; default: diff --git a/src/modules/speex/protocols/rpc/rpc.wiretypes.ts b/src/modules/speex/protocols/rpc/rpc.wiretypes.ts index 58aa00fc2..4fb696b8a 100644 --- a/src/modules/speex/protocols/rpc/rpc.wiretypes.ts +++ b/src/modules/speex/protocols/rpc/rpc.wiretypes.ts @@ -8,6 +8,7 @@ export type SpeexSpeechParticle = | { t: 'start' } | { t: 'audio'; base64: string; chunk: boolean; contentType?: string; characterCost?: number; ttsLatencyMs?: number } | { t: 'done'; durationMs?: number; chars?: number } + | { t: 'log'; level: 'info', message: string } | { t: 'error'; e: string } ; @@ -86,6 +87,7 @@ export namespace SpeexWire { voice: SpeexWire.Voice_schema, streaming: z.boolean().default(true), languageCode: z.string().optional(), // ISO language code (e.g., 'en', 'fr') for model selection fallback + priority: z.enum(['fast', 'balanced', 'quality']).optional(), // Hint for speed vs quality tradeoff }); @@ -96,7 +98,12 @@ export namespace SpeexWire { name: z.string(), description: z.string().optional(), previewUrl: z.string().optional(), - category: z.string().optional(), + category: z.string().optional(), // e.g., 'premade', 'cloned', 'professional' + // Voice labels (flattened for simplicity) + // gender: z.string().optional(), // e.g., 'male', 'female', 'neutral' + // accent: z.string().optional(), // e.g., 'american', 'british', 'australian' + // age: z.string().optional(), // e.g., 'young', 'middle_aged', 'old' + // language: z.string().optional(), // e.g., 'en', 'es', 'multilingual' }); export const ListVoices_input_schema = z.object({ diff --git a/src/modules/speex/protocols/rpc/synthesize-elevenlabs.ts b/src/modules/speex/protocols/rpc/synthesize-elevenlabs.ts index c7b5aac44..eb184e22d 100644 --- a/src/modules/speex/protocols/rpc/synthesize-elevenlabs.ts +++ b/src/modules/speex/protocols/rpc/synthesize-elevenlabs.ts @@ -10,35 +10,37 @@ import { returnAudioWholeOrThrow, streamAudioChunksOrThrow } from './rpc.streami // configuration const SAFETY_TEXT_LENGTH = 1000; const MIN_CHUNK_SIZE = 4096; +const MODEL_FAST = 'eleven_turbo_v2_5'; // Fastest, English-optimized +const MODEL_QUALITY = 'eleven_multilingual_v2'; // Highest quality, multilingual const DEFAULT_VOICE_ID = '21m00Tcm4TlvDq8ikWAM'; // Rachel -const DEFAULT_MODEL_ENGLISH = 'eleven_turbo_v2_5'; -const DEFAULT_MODEL_MULTILINGUAL = 'eleven_multilingual_v2'; -const _selectModelForLanguage = (languageCode: string | undefined): string => - languageCode?.toLowerCase() === 'en' ? DEFAULT_MODEL_ENGLISH : DEFAULT_MODEL_MULTILINGUAL; +const _selectModel = (priority: 'fast' | 'balanced' | 'quality' | undefined, languageCode: string | undefined): string => { + return priority === 'fast' ? MODEL_FAST // lowest latency, best for real-time use cases like calls + : priority === 'quality' ? MODEL_QUALITY // multilingual v2 (highest quality) + : languageCode?.toLowerCase() === 'en' ? MODEL_FAST : MODEL_QUALITY; // 'balanced'/undefined: English → turbo, non-English → multilingual +}; export const synthesizeElevenLabs: SynthesizeBackendFn = async function* (params) { // destructure and validate - const { access, text: inputText, voice, streaming, languageCode, signal } = params; + const { access, text: inputText, voice, streaming, languageCode, priority, signal } = params; if (access.dialect !== 'elevenlabs' || voice.dialect !== 'elevenlabs') throw new Error('Mismatched dialect in ElevenLabs synthesize'); // safety check: trim text that's too long let text = inputText; - if (text.length > SAFETY_TEXT_LENGTH) + if (text.length > SAFETY_TEXT_LENGTH) { text = text.slice(0, SAFETY_TEXT_LENGTH); - + // -> log.info + yield { t: 'log', level: 'info', message: `Text truncated to ${SAFETY_TEXT_LENGTH} characters` }; + } // build request - narrow to elevenlabs dialect for type safety - const voiceId = (voice.dialect === 'elevenlabs' ? voice.ttsVoiceId : undefined) || DEFAULT_VOICE_ID; - - // Model selection: use explicit model if provided, otherwise auto-select based on language - const explicitModel = voice.dialect === 'elevenlabs' ? voice.ttsModel : undefined; - const model = explicitModel || _selectModelForLanguage(languageCode); + const voiceId = voice.ttsVoiceId || env.ELEVENLABS_VOICE_ID || DEFAULT_VOICE_ID; + const model = voice.ttsModel || _selectModel(priority, languageCode); const path = `/v1/text-to-speech/${voiceId}${streaming ? '/stream' : ''}`; const { headers, url } = _elevenlabsAccess(access, path); @@ -46,7 +48,7 @@ export const synthesizeElevenLabs: SynthesizeBackendFn; + export const VoicesList_schema = z.object({ + voices: z.array(z.object({ + voice_id: z.string(), + name: z.string(), + category: z.enum(['premade', 'cloned', 'professional']).or(z.string()), + labels: z.looseObject({ + gender: z.enum(['male', 'female', 'neutral']).or(z.string()).nullish(), + accent: z.string().nullish(), + age: z.string().nullish(), + language: z.string().nullish(), + }), + description: z.string().nullish(), + preview_url: z.string().nullish(), + settings: z.object({ + stability: z.number(), + similarity_boost: z.number(), + }).nullish(), + // high_quality_base_model_ids: z.array(z.string()).nullish(), + is_owner: z.boolean().nullish(), + is_legacy: z.boolean().nullish(), + })), + }); + export type TTS_Request = z.infer; export const TTS_Request_schema = z.object({ text: z.string(), @@ -152,20 +183,4 @@ namespace ElevenLabsWire { }).optional(), }); - // export type VoicesList = z.infer; - export const VoicesList_schema = z.object({ - voices: z.array(z.object({ - voice_id: z.string(), - name: z.string(), - category: z.string(), - labels: z.record(z.string(), z.string()), - description: z.string(), - preview_url: z.string(), - settings: z.object({ - stability: z.number(), - similarity_boost: z.number(), - }), - })), - }); - } diff --git a/src/modules/speex/protocols/rpc/synthesize-openai.ts b/src/modules/speex/protocols/rpc/synthesize-openai.ts index 25033cf64..67371ad06 100644 --- a/src/modules/speex/protocols/rpc/synthesize-openai.ts +++ b/src/modules/speex/protocols/rpc/synthesize-openai.ts @@ -59,7 +59,7 @@ export const synthesizeOpenAIProtocol: SynthesizeBackendFn { const streaming = options?.streaming ?? true; + const languageCode = options?.languageCode ?? _getUIPreferenceLanguageCode(); + const priority = options?.priority; const playback = options?.playback ?? true; const returnAudio = options?.returnAudio ?? !streaming; - const languageCode = options?.languageCode ?? _getUIPreferenceLanguageCode(); // resolve engine from voice selector const engine = _engineFromSelector(voiceSelector); @@ -50,7 +51,7 @@ export async function speakText(inputText: string, voiceSelector: _Speak_VoiceSe case 'elevenlabs': case 'openai': case 'localai': - return speexSynthesize_RPC(effectiveEngine, inputText, { streaming, playback, returnAudio, languageCode }, callbacks); + return speexSynthesize_RPC(effectiveEngine, inputText, { streaming, playback, returnAudio, languageCode, priority }, callbacks); // Web Speech: client-only, no RPC case 'webspeech': diff --git a/src/modules/speex/speex.types.ts b/src/modules/speex/speex.types.ts index 8a19a2f9a..77a9ff3a9 100644 --- a/src/modules/speex/speex.types.ts +++ b/src/modules/speex/speex.types.ts @@ -108,10 +108,12 @@ export type SpeexListVoiceOption = SpeexWire_VoiceOption; export type SpeexSpeakOptions = { label?: string; // For NorthBridge queue display personaUid?: string; // For NorthBridge queue icon / controls (if the audio came from a persona) + // core options streaming?: boolean; // Streaming defaults to True + languageCode?: string; // ISO language code (e.g., 'en', 'fr') - auto-detected from preferredLanguage if not provided + priority?: 'fast' | 'balanced' | 'quality'; // Hint for speed vs quality tradeoff: 'fast' = low latency (turbo models), 'quality' = highest quality playback?: boolean; // Play audio (default: true) returnAudio?: boolean; // Accumulate full audio buffer in result, even if streaming (for save/download) - languageCode?: string; // ISO language code (e.g., 'en', 'fr') - auto-detected from preferredLanguage if not provided } export type SpeexSpeakResult = { diff --git a/src/modules/speex/store-module-speex.ts b/src/modules/speex/store-module-speex.ts index 26a4e29f2..c7588ba61 100644 --- a/src/modules/speex/store-module-speex.ts +++ b/src/modules/speex/store-module-speex.ts @@ -225,7 +225,11 @@ export const useSpeexStore = create()(persist( isAutoDetected: true, isAutoLinked: false, credentials: { type: 'api-key', apiKey: apiKey.trim() }, - voice: { dialect: 'elevenlabs', ttsModel: 'eleven_multilingual_v2', ttsVoiceId: voiceId || undefined }, + voice: { + dialect: 'elevenlabs', + ttsModel: 'eleven_multilingual_v2', + ...((typeof voiceId === 'string' && voiceId.trim()) ? { ttsVoiceId: voiceId.trim() } : {}), + }, }); console.log('[DEV] Speex: Migrated legacy ElevenLabs configuration'); }