Speex: fix elevenlabs

This commit is contained in:
Enrico Ros
2025-11-27 01:23:50 -08:00
parent d9471a8684
commit c84b2df3fa
10 changed files with 107 additions and 72 deletions
@@ -144,6 +144,7 @@ function ElevenLabsConfig({ engine, onUpdate, mode }: {
engine={engine}
voiceId={voice.ttsVoiceId ?? null}
onVoiceChange={handleVoiceChange}
autoPreview
/>
</FormControl>
@@ -3,7 +3,8 @@ import { useQuery } from '@tanstack/react-query';
import { CircularProgress, Option, Select } from '@mui/joy';
import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
import RecordVoiceOverTwoToneIcon from '@mui/icons-material/RecordVoiceOverTwoTone';
import { AudioPlayer } from '~/common/util/audio/AudioPlayer';
import type { DSpeexEngineAny, SpeexListVoiceOption } from '../speex.types';
import { speexListVoices_RPC } from '../protocols/rpc/rpc.client';
@@ -24,11 +25,24 @@ export function SpeexVoiceDropdown(props: {
// external state - module
const { voices, isLoading, error } = useSpeexVoices(engine);
// track user-initiated voice changes for preview (not initial load or voice list changes)
const [userSelectedVoiceId, setUserSelectedVoiceId] = React.useState<string | null>(null);
// [effect] auto-preview: play voice sample only when user explicitly selects a voice
const selectedVoice = userSelectedVoiceId ? voices.find(v => v.id === userSelectedVoiceId) : null;
const previewUrl = (autoPreview && selectedVoice?.previewUrl) || null;
React.useEffect(() => {
if (previewUrl)
void AudioPlayer.playUrl(previewUrl);
}, [previewUrl]);
// handlers
// Select change handler: remembers the user's pick (the value the auto-preview lookup is keyed on)
// and reports a concrete selection to the parent exactly once per change.
const handleVoiceChange = React.useCallback((_event: unknown, value: string | null) => {
  // record every change, including a cleared (null) selection, so the preview state follows the Select
  setUserSelectedVoiceId(value);
  // only non-empty voice ids are forwarded; a null value is never propagated to the parent
  if (value)
    onVoiceChange(value);
}, [onVoiceChange]);
@@ -44,7 +58,7 @@ export function SpeexVoiceDropdown(props: {
: voices.length === 0 ? 'No voices available'
: 'Select a voice'
}
startDecorator={<RecordVoiceOverTwoToneIcon />}
// startDecorator={<RecordVoiceOverTwoToneIcon />}
endDecorator={isLoading && <CircularProgress size='sm' />}
indicator={<KeyboardArrowDownIcon />}
slotProps={{
@@ -53,7 +67,7 @@ export function SpeexVoiceDropdown(props: {
}}
>
{voices.map(voice => (
<Option key={voice.id} value={voice.id}>
<Option key={voice.id} value={voice.id} label={voice.name}>
{voice.name}
{voice.description && <span style={{ opacity: 0.6, marginLeft: 8 }}>({voice.description})</span>}
</Option>
+19 -30
View File
@@ -6,12 +6,14 @@
*/
import { apiAsync, apiStream } from '~/common/util/trpc.client';
import { convert_Base64_To_UInt8Array, convert_UInt8Array_To_Base64 } from '~/common/util/blobUtils';
import { findModelsServiceOrNull } from '~/common/stores/llms/store-llms';
import type { DLocalAIServiceSettings } from '~/modules/llms/vendors/localai/localai.vendor';
import type { DOpenAIServiceSettings } from '~/modules/llms/vendors/openai/openai.vendor';
import { AudioLivePlayer } from '~/common/util/audio/AudioLivePlayer';
import { AudioPlayer } from '~/common/util/audio/AudioPlayer';
import type { DSpeexEngine, SpeexSpeakResult } from '../../speex.types';
import type { SpeexWire_Access, SpeexWire_ListVoices_Output, SpeexWire_Voice } from './rpc.wiretypes';
@@ -28,9 +30,10 @@ export async function speexSynthesize_RPC(
text: string,
options: {
streaming: boolean;
languageCode?: string;
priority?: 'fast' | 'balanced' | 'quality';
playback: boolean;
returnAudio: boolean;
languageCode?: string
},
callbacks?: {
onStart?: () => void;
@@ -63,7 +66,7 @@ export async function speexSynthesize_RPC(
// call the streaming RPC - whether the backend will stream in chunks or as a whole
const particleStream = await apiStream.speex.synthesize.mutate(
{ access, text, voice, streaming: options.streaming, languageCode: options.languageCode },
{ access, text, voice, streaming: options.streaming, languageCode: options.languageCode, priority: options.priority },
{ signal: abortController.signal },
);
@@ -78,12 +81,16 @@ export async function speexSynthesize_RPC(
case 'audio':
// Decode base64 to ArrayBuffer
// const audioBuffer = convert_Base64_To_UInt8Array(particle.base64, 'speexSynthesize_RPC audio chunk'); // preload conversion
const audioBuffer = _base64ToArrayBuffer(particle.base64);
const audioBuffer = convert_Base64_To_UInt8Array(particle.base64, 'speex.rpc.client').buffer;
// Playback
if (options.playback)
audioPlayer?.enqueueChunk(audioBuffer);
// Playback: streaming uses AudioLivePlayer for chunked playback,
// non-streaming uses AudioPlayer for single-buffer playback
if (options.playback) {
if (particle.chunk)
audioPlayer?.enqueueChunk(audioBuffer);
else
void AudioPlayer.playBuffer(audioBuffer); // fire-and-forget for whole audio
}
// Accumulate for return
if (options.returnAudio)
@@ -93,6 +100,10 @@ export async function speexSynthesize_RPC(
callbacks?.onChunk?.(audioBuffer);
break;
case 'log':
console.log(`[Speex] (${particle.level})`, particle.message);
break;
case 'done':
audioPlayer?.endPlayback();
break;
@@ -117,7 +128,7 @@ export async function speexSynthesize_RPC(
combined.set(new Uint8Array(chunk), offset);
offset += chunk.byteLength;
}
result.audioBase64 = _arrayBufferToBase64(combined.buffer);
result.audioBase64 = convert_UInt8Array_To_Base64(combined, 'speex.rpc.client');
}
return result;
@@ -212,25 +223,3 @@ function _buildRPCWireAccess({ credentials: c, vendorType }: _DSpeexEngineRPC):
}
}
}
// Private: Helpers
// TODO: use `blobUtils.ts` functions instead?
/**
 * Decodes a base64 string into a newly allocated ArrayBuffer containing the raw bytes.
 * `atob` produces one character per decoded byte, so each character code is the byte value.
 */
function _base64ToArrayBuffer(base64: string): ArrayBuffer {
  const decoded = atob(base64);
  // map every binary-string character to its byte and hand back the backing buffer
  return Uint8Array.from(decoded, (ch) => ch.charCodeAt(0)).buffer;
}
/**
 * Encodes the bytes of an ArrayBuffer as a base64 string.
 * Bytes are first turned into a binary string (one character per byte), which is what `btoa` expects.
 */
function _arrayBufferToBase64(buffer: ArrayBuffer): string {
  const chars: string[] = [];
  for (const byte of new Uint8Array(buffer))
    chars.push(String.fromCharCode(byte));
  return btoa(chars.join(''));
}
@@ -11,6 +11,7 @@ interface SynthesizeBackendFnParams<TSpeexAccess extends SpeexWire_Access> {
voice: SpeexWire_Voice;
streaming: boolean;
languageCode?: string;
priority?: 'fast' | 'balanced' | 'quality';
signal?: AbortSignal;
}
@@ -26,17 +27,18 @@ export const speexRouter = createTRPCRouter({
synthesize: edgeProcedure
.input(SpeexWire.Synthesize_input_schema)
.mutation(async function* ({ input, ctx }): AsyncGenerator<SpeexSpeechParticle> {
const { access, text, voice, streaming, languageCode } = input;
const { access, text, voice, streaming, languageCode, priority } = input;
try {
yield { t: 'start' };
switch (access.dialect) {
case 'elevenlabs':
yield* synthesizeElevenLabs({ access, text, voice, streaming, languageCode, signal: ctx.reqSignal });
yield* synthesizeElevenLabs({ access, text, voice, streaming, languageCode, priority, signal: ctx.reqSignal });
break;
case 'localai':
case 'openai':
yield* synthesizeOpenAIProtocol({ access, text, voice, streaming, languageCode, signal: ctx.reqSignal });
yield* synthesizeOpenAIProtocol({ access, text, voice, streaming, languageCode, priority, signal: ctx.reqSignal });
break;
default:
@@ -8,6 +8,7 @@ export type SpeexSpeechParticle =
| { t: 'start' }
| { t: 'audio'; base64: string; chunk: boolean; contentType?: string; characterCost?: number; ttsLatencyMs?: number }
| { t: 'done'; durationMs?: number; chars?: number }
| { t: 'log'; level: 'info', message: string }
| { t: 'error'; e: string }
;
@@ -86,6 +87,7 @@ export namespace SpeexWire {
voice: SpeexWire.Voice_schema,
streaming: z.boolean().default(true),
languageCode: z.string().optional(), // ISO language code (e.g., 'en', 'fr') for model selection fallback
priority: z.enum(['fast', 'balanced', 'quality']).optional(), // Hint for speed vs quality tradeoff
});
@@ -96,7 +98,12 @@ export namespace SpeexWire {
name: z.string(),
description: z.string().optional(),
previewUrl: z.string().optional(),
category: z.string().optional(),
category: z.string().optional(), // e.g., 'premade', 'cloned', 'professional'
// Voice labels (flattened for simplicity)
// gender: z.string().optional(), // e.g., 'male', 'female', 'neutral'
// accent: z.string().optional(), // e.g., 'american', 'british', 'australian'
// age: z.string().optional(), // e.g., 'young', 'middle_aged', 'old'
// language: z.string().optional(), // e.g., 'en', 'es', 'multilingual'
});
export const ListVoices_input_schema = z.object({
@@ -10,35 +10,37 @@ import { returnAudioWholeOrThrow, streamAudioChunksOrThrow } from './rpc.streami
// configuration
const SAFETY_TEXT_LENGTH = 1000;
const MIN_CHUNK_SIZE = 4096;
const MODEL_FAST = 'eleven_turbo_v2_5'; // Fastest, English-optimized
const MODEL_QUALITY = 'eleven_multilingual_v2'; // Highest quality, multilingual
const DEFAULT_VOICE_ID = '21m00Tcm4TlvDq8ikWAM'; // Rachel
const DEFAULT_MODEL_ENGLISH = 'eleven_turbo_v2_5';
const DEFAULT_MODEL_MULTILINGUAL = 'eleven_multilingual_v2';
// Language-driven default model: a language code equal to 'en' (compared case-insensitively) selects
// the English default; any other value, including undefined, selects the multilingual default.
const _selectModelForLanguage = (languageCode: string | undefined): string => {
  const isEnglish = languageCode?.toLowerCase() === 'en';
  return isEnglish ? DEFAULT_MODEL_ENGLISH : DEFAULT_MODEL_MULTILINGUAL;
};
// Picks the model id from the caller's priority hint, falling back to the language when the hint
// is 'balanced' or absent.
const _selectModel = (priority: 'fast' | 'balanced' | 'quality' | undefined, languageCode: string | undefined): string => {
  // explicit hints win over the language
  if (priority === 'fast')
    return MODEL_FAST; // lowest latency, best for real-time use cases like calls
  if (priority === 'quality')
    return MODEL_QUALITY; // multilingual v2 (highest quality)

  // 'balanced'/undefined: English → turbo, non-English → multilingual
  const isEnglish = languageCode?.toLowerCase() === 'en';
  return isEnglish ? MODEL_FAST : MODEL_QUALITY;
};
export const synthesizeElevenLabs: SynthesizeBackendFn<SpeexWire_Access_ElevenLabs> = async function* (params) {
// destructure and validate
const { access, text: inputText, voice, streaming, languageCode, signal } = params;
const { access, text: inputText, voice, streaming, languageCode, priority, signal } = params;
if (access.dialect !== 'elevenlabs' || voice.dialect !== 'elevenlabs')
throw new Error('Mismatched dialect in ElevenLabs synthesize');
// safety check: trim text that's too long
let text = inputText;
if (text.length > SAFETY_TEXT_LENGTH)
if (text.length > SAFETY_TEXT_LENGTH) {
text = text.slice(0, SAFETY_TEXT_LENGTH);
// -> log.info
yield { t: 'log', level: 'info', message: `Text truncated to ${SAFETY_TEXT_LENGTH} characters` };
}
// build request - narrow to elevenlabs dialect for type safety
const voiceId = (voice.dialect === 'elevenlabs' ? voice.ttsVoiceId : undefined) || DEFAULT_VOICE_ID;
// Model selection: use explicit model if provided, otherwise auto-select based on language
const explicitModel = voice.dialect === 'elevenlabs' ? voice.ttsModel : undefined;
const model = explicitModel || _selectModelForLanguage(languageCode);
const voiceId = voice.ttsVoiceId || env.ELEVENLABS_VOICE_ID || DEFAULT_VOICE_ID;
const model = voice.ttsModel || _selectModel(priority, languageCode);
const path = `/v1/text-to-speech/${voiceId}${streaming ? '/stream' : ''}`;
const { headers, url } = _elevenlabsAccess(access, path);
@@ -46,7 +48,7 @@ export const synthesizeElevenLabs: SynthesizeBackendFn<SpeexWire_Access_ElevenLa
const body: ElevenLabsWire.TTS_Request = {
text,
model_id: model,
};
} as const;
// Fetch
let response: Response;
@@ -101,6 +103,11 @@ export async function listVoicesElevenLabs(access: SpeexWire_Access_ElevenLabs):
description: voice.description || undefined,
previewUrl: voice.preview_url || undefined,
category: voice.category,
// Flatten labels for UI display
// gender: voice.labels?.gender || undefined,
// accent: voice.labels?.accent || undefined,
// age: voice.labels?.age || undefined,
// language: voice.labels?.language || undefined,
})),
};
}
@@ -142,6 +149,30 @@ function _elevenlabsAccess(access: SpeexWire_Access_ElevenLabs, apiPath: string)
namespace ElevenLabsWire {
// export type VoicesList = z.infer<typeof VoicesList_schema>;
// Zod schema for a list of ElevenLabs voices (presumably the payload parsed by listVoicesElevenLabs - confirm at the call site).
// Only `voice_id`, `name`, `category` and the `labels` object are required; the remaining fields accept null/undefined.
export const VoicesList_schema = z.object({
voices: z.array(z.object({
voice_id: z.string(),
name: z.string(),
// the three known categories are enumerated, but any other string is accepted as well
category: z.enum(['premade', 'cloned', 'professional']).or(z.string()),
// the labels object itself is required; each listed label may be null/undefined
// NOTE(review): looseObject presumably lets unlisted label keys pass through - confirm against the zod version in use
labels: z.looseObject({
gender: z.enum(['male', 'female', 'neutral']).or(z.string()).nullish(),
accent: z.string().nullish(),
age: z.string().nullish(),
language: z.string().nullish(),
}),
description: z.string().nullish(),
preview_url: z.string().nullish(),
// when present, both numeric settings are required
settings: z.object({
stability: z.number(),
similarity_boost: z.number(),
}).nullish(),
// high_quality_base_model_ids: z.array(z.string()).nullish(),
is_owner: z.boolean().nullish(),
is_legacy: z.boolean().nullish(),
})),
});
export type TTS_Request = z.infer<typeof TTS_Request_schema>;
export const TTS_Request_schema = z.object({
text: z.string(),
@@ -152,20 +183,4 @@ namespace ElevenLabsWire {
}).optional(),
});
// export type VoicesList = z.infer<typeof VoicesList_schema>;
export const VoicesList_schema = z.object({
voices: z.array(z.object({
voice_id: z.string(),
name: z.string(),
category: z.string(),
labels: z.record(z.string(), z.string()),
description: z.string(),
preview_url: z.string(),
settings: z.object({
stability: z.number(),
similarity_boost: z.number(),
}),
})),
});
}
@@ -59,7 +59,7 @@ export const synthesizeOpenAIProtocol: SynthesizeBackendFn<SpeexWire_Access_Open
const headers: HeadersInit = {
'Content-Type': 'application/json',
...(!apiKey ? {} : { 'Authorization': `Bearer ${apiKey}` }),
...(!access.orgId ? {} : { 'OpenAI-Organization': access.orgId }),
...(!access.apiOrgId ? {} : { 'OpenAI-Organization': access.apiOrgId }),
};
// request.body
+3 -2
View File
@@ -32,9 +32,10 @@ type _Speak_Callbacks = {
export async function speakText(inputText: string, voiceSelector: _Speak_VoiceSelector, options?: SpeexSpeakOptions, callbacks?: _Speak_Callbacks): Promise<SpeexSpeakResult> {
const streaming = options?.streaming ?? true;
const languageCode = options?.languageCode ?? _getUIPreferenceLanguageCode();
const priority = options?.priority;
const playback = options?.playback ?? true;
const returnAudio = options?.returnAudio ?? !streaming;
const languageCode = options?.languageCode ?? _getUIPreferenceLanguageCode();
// resolve engine from voice selector
const engine = _engineFromSelector(voiceSelector);
@@ -50,7 +51,7 @@ export async function speakText(inputText: string, voiceSelector: _Speak_VoiceSe
case 'elevenlabs':
case 'openai':
case 'localai':
return speexSynthesize_RPC(effectiveEngine, inputText, { streaming, playback, returnAudio, languageCode }, callbacks);
return speexSynthesize_RPC(effectiveEngine, inputText, { streaming, playback, returnAudio, languageCode, priority }, callbacks);
// Web Speech: client-only, no RPC
case 'webspeech':
+3 -1
View File
@@ -108,10 +108,12 @@ export type SpeexListVoiceOption = SpeexWire_VoiceOption;
/**
 * Caller-facing options for a speak request. Every field is optional.
 * `languageCode` is declared once (a second declaration of the same property is a duplicate-identifier error).
 */
export type SpeexSpeakOptions = {
  label?: string; // For NorthBridge queue display
  personaUid?: string; // For NorthBridge queue icon / controls (if the audio came from a persona)

  // core options
  streaming?: boolean; // Streaming defaults to True
  languageCode?: string; // ISO language code (e.g., 'en', 'fr') - auto-detected from preferredLanguage if not provided
  priority?: 'fast' | 'balanced' | 'quality'; // Hint for speed vs quality tradeoff: 'fast' = low latency (turbo models), 'quality' = highest quality
  playback?: boolean; // Play audio (default: true)
  returnAudio?: boolean; // Accumulate full audio buffer in result, even if streaming (for save/download)
}
export type SpeexSpeakResult = {
+5 -1
View File
@@ -225,7 +225,11 @@ export const useSpeexStore = create<SpeexStore>()(persist(
isAutoDetected: true,
isAutoLinked: false,
credentials: { type: 'api-key', apiKey: apiKey.trim() },
voice: { dialect: 'elevenlabs', ttsModel: 'eleven_multilingual_v2', ttsVoiceId: voiceId || undefined },
voice: {
dialect: 'elevenlabs',
ttsModel: 'eleven_multilingual_v2',
...((typeof voiceId === 'string' && voiceId.trim()) ? { ttsVoiceId: voiceId.trim() } : {}),
},
});
console.log('[DEV] Speex: Migrated legacy ElevenLabs configuration');
}