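Speex: fix ElevenLabs (tolerant voice-list parsing, non-streaming playback, priority-based model selection, voice preview)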

2026-05-10 21:50:14 -07:00 · 2025-11-27 01:23:50 -08:00
parent d9471a8684
commit c84b2df3fa
10 changed files with 107 additions and 72 deletions
@@ -144,6 +144,7 @@ function ElevenLabsConfig({ engine, onUpdate, mode }: {
        engine={engine}
        voiceId={voice.ttsVoiceId ?? null}
        onVoiceChange={handleVoiceChange}
+        autoPreview
      />
    </FormControl>

@@ -3,7 +3,8 @@ import { useQuery } from '@tanstack/react-query';

 import { CircularProgress, Option, Select } from '@mui/joy';
 import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
-import RecordVoiceOverTwoToneIcon from '@mui/icons-material/RecordVoiceOverTwoTone';
+
+import { AudioPlayer } from '~/common/util/audio/AudioPlayer';

 import type { DSpeexEngineAny, SpeexListVoiceOption } from '../speex.types';
 import { speexListVoices_RPC } from '../protocols/rpc/rpc.client';
@@ -24,11 +25,24 @@ export function SpeexVoiceDropdown(props: {
  // external state - module
  const { voices, isLoading, error } = useSpeexVoices(engine);

+  // track user-initiated voice changes for preview (not initial load or voice list changes)
+  const [userSelectedVoiceId, setUserSelectedVoiceId] = React.useState<string | null>(null);
+
+
+  // [effect] auto-preview: play voice sample only when user explicitly selects a voice
+  const selectedVoice = userSelectedVoiceId ? voices.find(v => v.id === userSelectedVoiceId) : null;
+  const previewUrl = (autoPreview && selectedVoice?.previewUrl) || null;
+  React.useEffect(() => {
+    if (previewUrl)
+      void AudioPlayer.playUrl(previewUrl);
+  }, [previewUrl]);
+

  // handlers

  const handleVoiceChange = React.useCallback((_event: unknown, value: string | null) => {
-    if (value) onVoiceChange(value);
+    setUserSelectedVoiceId(value);
+    value && onVoiceChange(value);
  }, [onVoiceChange]);


@@ -44,7 +58,7 @@ export function SpeexVoiceDropdown(props: {
            : voices.length === 0 ? 'No voices available'
              : 'Select a voice'
      }
-      startDecorator={<RecordVoiceOverTwoToneIcon />}
+      // startDecorator={<RecordVoiceOverTwoToneIcon />}
      endDecorator={isLoading && <CircularProgress size='sm' />}
      indicator={<KeyboardArrowDownIcon />}
      slotProps={{
@@ -53,7 +67,7 @@ export function SpeexVoiceDropdown(props: {
      }}
    >
      {voices.map(voice => (
-        <Option key={voice.id} value={voice.id}>
+        <Option key={voice.id} value={voice.id} label={voice.name}>
          {voice.name}
          {voice.description && <span style={{ opacity: 0.6, marginLeft: 8 }}>({voice.description})</span>}
        </Option>
@@ -6,12 +6,14 @@
 */

 import { apiAsync, apiStream } from '~/common/util/trpc.client';
+import { convert_Base64_To_UInt8Array, convert_UInt8Array_To_Base64 } from '~/common/util/blobUtils';
 import { findModelsServiceOrNull } from '~/common/stores/llms/store-llms';

 import type { DLocalAIServiceSettings } from '~/modules/llms/vendors/localai/localai.vendor';
 import type { DOpenAIServiceSettings } from '~/modules/llms/vendors/openai/openai.vendor';

 import { AudioLivePlayer } from '~/common/util/audio/AudioLivePlayer';
+import { AudioPlayer } from '~/common/util/audio/AudioPlayer';

 import type { DSpeexEngine, SpeexSpeakResult } from '../../speex.types';
 import type { SpeexWire_Access, SpeexWire_ListVoices_Output, SpeexWire_Voice } from './rpc.wiretypes';
@@ -28,9 +30,10 @@ export async function speexSynthesize_RPC(
  text: string,
  options: {
    streaming: boolean;
+    languageCode?: string;
+    priority?: 'fast' | 'balanced' | 'quality';
    playback: boolean;
    returnAudio: boolean;
-    languageCode?: string
  },
  callbacks?: {
    onStart?: () => void;
@@ -63,7 +66,7 @@ export async function speexSynthesize_RPC(

    // call the streaming RPC - whether the backend will stream in chunks or as a whole
    const particleStream = await apiStream.speex.synthesize.mutate(
-      { access, text, voice, streaming: options.streaming, languageCode: options.languageCode },
+      { access, text, voice, streaming: options.streaming, languageCode: options.languageCode, priority: options.priority },
      { signal: abortController.signal },
    );

@@ -78,12 +81,16 @@ export async function speexSynthesize_RPC(

        case 'audio':
          // Decode base64 to ArrayBuffer
-          // const audioBuffer = convert_Base64_To_UInt8Array(particle.base64, 'speexSynthesize_RPC audio chunk'); // preload conversion
-          const audioBuffer = _base64ToArrayBuffer(particle.base64);
+          const audioBuffer = convert_Base64_To_UInt8Array(particle.base64, 'speex.rpc.client').buffer;

-          // Playback
-          if (options.playback)
-            audioPlayer?.enqueueChunk(audioBuffer);
+          // Playback: streaming uses AudioLivePlayer for chunked playback,
+          // non-streaming uses AudioPlayer for single-buffer playback
+          if (options.playback) {
+            if (particle.chunk)
+              audioPlayer?.enqueueChunk(audioBuffer);
+            else
+              void AudioPlayer.playBuffer(audioBuffer); // fire-and-forget for whole audio
+          }

          // Accumulate for return
          if (options.returnAudio)
@@ -93,6 +100,10 @@ export async function speexSynthesize_RPC(
          callbacks?.onChunk?.(audioBuffer);
          break;

+        case 'log':
+          console.log(`[Speex] (${particle.level})`, particle.message);
+          break;
+
        case 'done':
          audioPlayer?.endPlayback();
          break;
@@ -117,7 +128,7 @@ export async function speexSynthesize_RPC(
        combined.set(new Uint8Array(chunk), offset);
        offset += chunk.byteLength;
      }
-      result.audioBase64 = _arrayBufferToBase64(combined.buffer);
+      result.audioBase64 = convert_UInt8Array_To_Base64(combined, 'speex.rpc.client');
    }

    return result;
@@ -212,25 +223,3 @@ function _buildRPCWireAccess({ credentials: c, vendorType }: _DSpeexEngineRPC):
      }
  }
 }
-
-// Private: Helpers
-
-// TODO: use `blobUtils.ts` functions instead?
-
-function _base64ToArrayBuffer(base64: string): ArrayBuffer {
-  const binaryString = atob(base64);
-  const bytes = new Uint8Array(binaryString.length);
-  for (let i = 0; i < binaryString.length; i++) {
-    bytes[i] = binaryString.charCodeAt(i);
-  }
-  return bytes.buffer;
-}
-
-function _arrayBufferToBase64(buffer: ArrayBuffer): string {
-  const bytes = new Uint8Array(buffer);
-  let binary = '';
-  for (let i = 0; i < bytes.byteLength; i++) {
-    binary += String.fromCharCode(bytes[i]);
-  }
-  return btoa(binary);
-}
@@ -11,6 +11,7 @@ interface SynthesizeBackendFnParams<TSpeexAccess extends SpeexWire_Access> {
  voice: SpeexWire_Voice;
  streaming: boolean;
  languageCode?: string;
+  priority?: 'fast' | 'balanced' | 'quality';
  signal?: AbortSignal;
 }

@@ -26,17 +27,18 @@ export const speexRouter = createTRPCRouter({
  synthesize: edgeProcedure
    .input(SpeexWire.Synthesize_input_schema)
    .mutation(async function* ({ input, ctx }): AsyncGenerator<SpeexSpeechParticle> {
-      const { access, text, voice, streaming, languageCode } = input;
+      const { access, text, voice, streaming, languageCode, priority } = input;
+
      try {
        yield { t: 'start' };
        switch (access.dialect) {
          case 'elevenlabs':
-            yield* synthesizeElevenLabs({ access, text, voice, streaming, languageCode, signal: ctx.reqSignal });
+            yield* synthesizeElevenLabs({ access, text, voice, streaming, languageCode, priority, signal: ctx.reqSignal });
            break;

          case 'localai':
          case 'openai':
-            yield* synthesizeOpenAIProtocol({ access, text, voice, streaming, languageCode, signal: ctx.reqSignal });
+            yield* synthesizeOpenAIProtocol({ access, text, voice, streaming, languageCode, priority, signal: ctx.reqSignal });
            break;

          default:
@@ -8,6 +8,7 @@ export type SpeexSpeechParticle =
  | { t: 'start' }
  | { t: 'audio'; base64: string; chunk: boolean; contentType?: string; characterCost?: number; ttsLatencyMs?: number }
  | { t: 'done'; durationMs?: number; chars?: number }
+  | { t: 'log'; level: 'info', message: string }
  | { t: 'error'; e: string }
  ;

@@ -86,6 +87,7 @@ export namespace SpeexWire {
    voice: SpeexWire.Voice_schema,
    streaming: z.boolean().default(true),
    languageCode: z.string().optional(), // ISO language code (e.g., 'en', 'fr') for model selection fallback
+    priority: z.enum(['fast', 'balanced', 'quality']).optional(), // Hint for speed vs quality tradeoff
  });


@@ -96,7 +98,12 @@ export namespace SpeexWire {
    name: z.string(),
    description: z.string().optional(),
    previewUrl: z.string().optional(),
-    category: z.string().optional(),
+    category: z.string().optional(),    // e.g., 'premade', 'cloned', 'professional'
+    // Voice labels (flattened for simplicity)
+    // gender: z.string().optional(),      // e.g., 'male', 'female', 'neutral'
+    // accent: z.string().optional(),      // e.g., 'american', 'british', 'australian'
+    // age: z.string().optional(),         // e.g., 'young', 'middle_aged', 'old'
+    // language: z.string().optional(),    // e.g., 'en', 'es', 'multilingual'
  });

  export const ListVoices_input_schema = z.object({
@@ -10,35 +10,37 @@ import { returnAudioWholeOrThrow, streamAudioChunksOrThrow } from './rpc.streami
 // configuration for the ElevenLabs synthesis backend
 const SAFETY_TEXT_LENGTH = 1000; // max input characters; longer text is cut to this length before the request is built
 const MIN_CHUNK_SIZE = 4096; // NOTE(review): not referenced in the visible code - presumably the minimum size of a streamed audio chunk; confirm at the use site
+const MODEL_FAST = 'eleven_turbo_v2_5';        // Fastest, English-optimized; chosen for priority 'fast', and for 'en' when priority is 'balanced' or unset
+const MODEL_QUALITY = 'eleven_multilingual_v2'; // Highest quality, multilingual; chosen for priority 'quality', and for non-'en' when priority is 'balanced' or unset
 const DEFAULT_VOICE_ID = '21m00Tcm4TlvDq8ikWAM'; // Rachel; last resort after voice.ttsVoiceId and env.ELEVENLABS_VOICE_ID
-const DEFAULT_MODEL_ENGLISH = 'eleven_turbo_v2_5';
-const DEFAULT_MODEL_MULTILINGUAL = 'eleven_multilingual_v2';


-const _selectModelForLanguage = (languageCode: string | undefined): string =>
-  languageCode?.toLowerCase() === 'en' ? DEFAULT_MODEL_ENGLISH : DEFAULT_MODEL_MULTILINGUAL;
+const _selectModel = (priority: 'fast' | 'balanced' | 'quality' | undefined, languageCode: string | undefined): string => { // picks the model id used when the voice does not set ttsModel explicitly
+  return priority === 'fast' ? MODEL_FAST // 'fast': turbo model regardless of language (lowest latency, best for real-time use cases like calls)
+    : priority === 'quality' ? MODEL_QUALITY // 'quality': multilingual v2 regardless of language (highest quality)
+      : languageCode?.toLowerCase() === 'en' ? MODEL_FAST : MODEL_QUALITY; // 'balanced'/undefined: exactly 'en' (case-insensitive) → turbo; anything else, including a missing code → multilingual. NOTE(review): 'en-US' is not equal to 'en' and takes the multilingual path - confirm callers pass bare language codes
+};


 export const synthesizeElevenLabs: SynthesizeBackendFn<SpeexWire_Access_ElevenLabs> = async function* (params) {

  // destructure and validate
-  const { access, text: inputText, voice, streaming, languageCode, signal } = params;
+  const { access, text: inputText, voice, streaming, languageCode, priority, signal } = params;
  if (access.dialect !== 'elevenlabs' || voice.dialect !== 'elevenlabs')
    throw new Error('Mismatched dialect in ElevenLabs synthesize');


  // safety check: trim text that's too long
  let text = inputText;
-  if (text.length > SAFETY_TEXT_LENGTH)
+  if (text.length > SAFETY_TEXT_LENGTH) {
    text = text.slice(0, SAFETY_TEXT_LENGTH);
-
+    // -> log.info
+    yield { t: 'log', level: 'info', message: `Text truncated to ${SAFETY_TEXT_LENGTH} characters` };
+  }

  // build request - narrow to elevenlabs dialect for type safety
-  const voiceId = (voice.dialect === 'elevenlabs' ? voice.ttsVoiceId : undefined) || DEFAULT_VOICE_ID;
-
-  // Model selection: use explicit model if provided, otherwise auto-select based on language
-  const explicitModel = voice.dialect === 'elevenlabs' ? voice.ttsModel : undefined;
-  const model = explicitModel || _selectModelForLanguage(languageCode);
+  const voiceId = voice.ttsVoiceId || env.ELEVENLABS_VOICE_ID || DEFAULT_VOICE_ID;
+  const model = voice.ttsModel || _selectModel(priority, languageCode);

  const path = `/v1/text-to-speech/${voiceId}${streaming ? '/stream' : ''}`;
  const { headers, url } = _elevenlabsAccess(access, path);
@@ -46,7 +48,7 @@ export const synthesizeElevenLabs: SynthesizeBackendFn<SpeexWire_Access_ElevenLa
  const body: ElevenLabsWire.TTS_Request = {
    text,
    model_id: model,
-  };
+  } as const;

  // Fetch
  let response: Response;
@@ -101,6 +103,11 @@ export async function listVoicesElevenLabs(access: SpeexWire_Access_ElevenLabs):
      description: voice.description || undefined,
      previewUrl: voice.preview_url || undefined,
      category: voice.category,
+      // Flatten labels for UI display
+      // gender: voice.labels?.gender || undefined,
+      // accent: voice.labels?.accent || undefined,
+      // age: voice.labels?.age || undefined,
+      // language: voice.labels?.language || undefined,
    })),
  };
 }
@@ -142,6 +149,30 @@ function _elevenlabsAccess(access: SpeexWire_Access_ElevenLabs, apiPath: string)

 namespace ElevenLabsWire {

+  // export type VoicesList = z.infer<typeof VoicesList_schema>;
+  export const VoicesList_schema = z.object({ // presumably validates the voices-list response mapped by listVoicesElevenLabs - the parse call is not visible here
+    voices: z.array(z.object({
+      voice_id: z.string(),
+      name: z.string(),
+      category: z.enum(['premade', 'cloned', 'professional']).or(z.string()), // known values listed for reference; any other string is accepted as well
+      labels: z.looseObject({ // loose: label keys beyond the four below are allowed. NOTE(review): `labels` itself is still required - confirm the API never omits it or returns null
+        gender: z.enum(['male', 'female', 'neutral']).or(z.string()).nullish(), // known values listed for reference; any other string, null or a missing key is accepted
+        accent: z.string().nullish(),
+        age: z.string().nullish(),
+        language: z.string().nullish(),
+      }),
+      description: z.string().nullish(), // null or missing is tolerated; listVoicesElevenLabs maps falsy values to undefined
+      preview_url: z.string().nullish(), // null or missing is tolerated; listVoicesElevenLabs maps falsy values to undefined
+      settings: z.object({ // the whole object may be null or missing; when present both numbers are required
+        stability: z.number(),
+        similarity_boost: z.number(),
+      }).nullish(),
+      // high_quality_base_model_ids: z.array(z.string()).nullish(),
+      is_owner: z.boolean().nullish(),
+      is_legacy: z.boolean().nullish(),
+    })),
+  });
+
  export type TTS_Request = z.infer<typeof TTS_Request_schema>;
  export const TTS_Request_schema = z.object({
    text: z.string(),
@@ -152,20 +183,4 @@ namespace ElevenLabsWire {
    }).optional(),
  });

-  // export type VoicesList = z.infer<typeof VoicesList_schema>;
-  export const VoicesList_schema = z.object({
-    voices: z.array(z.object({
-      voice_id: z.string(),
-      name: z.string(),
-      category: z.string(),
-      labels: z.record(z.string(), z.string()),
-      description: z.string(),
-      preview_url: z.string(),
-      settings: z.object({
-        stability: z.number(),
-        similarity_boost: z.number(),
-      }),
-    })),
-  });
-
 }
@@ -59,7 +59,7 @@ export const synthesizeOpenAIProtocol: SynthesizeBackendFn<SpeexWire_Access_Open
  const headers: HeadersInit = {
    'Content-Type': 'application/json',
    ...(!apiKey ? {} : { 'Authorization': `Bearer ${apiKey}` }),
-    ...(!access.orgId ? {} : { 'OpenAI-Organization': access.orgId }),
+    ...(!access.apiOrgId ? {} : { 'OpenAI-Organization': access.apiOrgId }),
  };

  // request.body
@@ -32,9 +32,10 @@ type _Speak_Callbacks = {
 export async function speakText(inputText: string, voiceSelector: _Speak_VoiceSelector, options?: SpeexSpeakOptions, callbacks?: _Speak_Callbacks): Promise<SpeexSpeakResult> {

  const streaming = options?.streaming ?? true;
+  const languageCode = options?.languageCode ?? _getUIPreferenceLanguageCode();
+  const priority = options?.priority;
  const playback = options?.playback ?? true;
  const returnAudio = options?.returnAudio ?? !streaming;
-  const languageCode = options?.languageCode ?? _getUIPreferenceLanguageCode();

  // resolve engine from voice selector
  const engine = _engineFromSelector(voiceSelector);
@@ -50,7 +51,7 @@ export async function speakText(inputText: string, voiceSelector: _Speak_VoiceSe
      case 'elevenlabs':
      case 'openai':
      case 'localai':
-        return speexSynthesize_RPC(effectiveEngine, inputText, { streaming, playback, returnAudio, languageCode }, callbacks);
+        return speexSynthesize_RPC(effectiveEngine, inputText, { streaming, playback, returnAudio, languageCode, priority }, callbacks);

      // Web Speech: client-only, no RPC
      case 'webspeech':
@@ -108,10 +108,12 @@ export type SpeexListVoiceOption = SpeexWire_VoiceOption;
 export type SpeexSpeakOptions = {
   label?: string;           // For NorthBridge queue display
   personaUid?: string;      // For NorthBridge queue icon / controls (if the audio came from a persona)
+  // core options - read by speakText, which applies the defaults noted below
   streaming?: boolean;      // Request streamed synthesis (default: true)
+  languageCode?: string;    // ISO language code (e.g., 'en', 'fr') - auto-detected from preferredLanguage if not provided
+  priority?: 'fast' | 'balanced' | 'quality'; // Hint for speed vs quality tradeoff: 'fast' = low latency (turbo models), 'quality' = highest quality; no default is applied - passed through as-is when omitted
   playback?: boolean;       // Play audio (default: true)
   returnAudio?: boolean;    // Accumulate full audio buffer in result, even if streaming (for save/download); defaults to true only when streaming is off
-  languageCode?: string;    // ISO language code (e.g., 'en', 'fr') - auto-detected from preferredLanguage if not provided
 }

 export type SpeexSpeakResult = {
@@ -225,7 +225,11 @@ export const useSpeexStore = create<SpeexStore>()(persist(
                isAutoDetected: true,
                isAutoLinked: false,
                credentials: { type: 'api-key', apiKey: apiKey.trim() },
-                voice: { dialect: 'elevenlabs', ttsModel: 'eleven_multilingual_v2', ttsVoiceId: voiceId || undefined },
+                voice: {
+                  dialect: 'elevenlabs',
+                  ttsModel: 'eleven_multilingual_v2',
+                  ...((typeof voiceId === 'string' && voiceId.trim()) ? { ttsVoiceId: voiceId.trim() } : {}),
+                },
              });
              console.log('[DEV] Speex: Migrated legacy ElevenLabs configuration');
            }