Speex: LocalAI vendor

This commit is contained in:
Enrico Ros
2025-11-26 02:17:00 -08:00
parent a003600839
commit e62ffa02e9
8 changed files with 193 additions and 135 deletions
@@ -230,23 +230,24 @@ function LocalAIConfig({ engine, onUpdate, mode }: {
const voice = engine.voice as DVoiceLocalAI;
return <>
<FormControl>
<FormLabelStart title='Voice ID' description='LocalAI voice identifier' />
<Input
value={voice.voiceId ?? ''}
onChange={(e) => onUpdate({ voice: { ...voice, voiceId: e.target.value } })}
placeholder='e.g., en-us-amy-low'
/>
<FormHelperText>Depends on your LocalAI TTS configuration</FormHelperText>
</FormControl>
<FormControl>
<FormLabelStart title='Model' description='TTS model name' />
<Input
value={voice.ttsModel ?? ''}
onChange={(e) => onUpdate({ voice: { ...voice, ttsModel: e.target.value } })}
placeholder='e.g., kokoro'
/>
<FormHelperText>Model to use for speech synthesis</FormHelperText>
</FormControl>
<FormControl>
<FormLabelStart title='Backend' description='TTS backend (optional)' />
<Input
value={voice.ttsBackend ?? ''}
onChange={(e) => onUpdate({ voice: { ...voice, ttsBackend: e.target.value || undefined } })}
placeholder='e.g., coqui, bark, piper'
/>
<FormHelperText>Leave empty for default backend</FormHelperText>
</FormControl>
</>;
}
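
Note on the Backend input above: the onChange handler maps an empty string to undefined (e.target.value || undefined), so clearing the field removes it from the voice object instead of persisting ''. A minimal sketch of that pattern, assuming the DVoiceLocalAI shape defined later in this commit:

// hypothetical snippet; DVoiceLocalAI comes from speex.types below
const voice: DVoiceLocalAI = { vendorType: 'localai', ttsModel: 'kokoro' };
const cleared: DVoiceLocalAI = { ...voice, ttsBackend: '' || undefined };
// cleared.ttsBackend === undefined, not ''
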
+13 -7
@@ -1,11 +1,21 @@
import { createTRPCRouter, edgeProcedure } from '~/server/trpc/trpc.server';
import { SpeexSpeechParticle, SpeexWire, SpeexWire_Access, SpeexWire_ListVoices_Output, SpeexWire_Voice } from './speex.wiretypes';
import { listVoicesElevenLabs, synthesizeElevenLabs } from './synthesize-elevenlabs';
import { synthesizeOpenAIProtocol } from './synthesize-openai';
interface SynthesizeBackendFnParams<TSpeexAccess extends SpeexWire_Access> {
access: TSpeexAccess;
text: string;
voice: SpeexWire_Voice;
streaming: boolean;
signal?: AbortSignal;
}
export type SynthesizeBackendFn<TSpeexAccess extends SpeexWire_Access> = (params: SynthesizeBackendFnParams<TSpeexAccess>) => AsyncGenerator<SpeexSpeechParticle>;
export const speexRouter = createTRPCRouter({
/**
@@ -16,11 +26,8 @@ export const speexRouter = createTRPCRouter({
.input(SpeexWire.Synthesize_input_schema)
.mutation(async function* ({ input, ctx }): AsyncGenerator<SpeexSpeechParticle> {
const { access, text, voice, streaming } = input;
try {
yield { t: 'start' };
// Route based on access.dialect discriminant
switch (access.dialect) {
case 'elevenlabs':
yield* synthesizeElevenLabs({ access, text, voice, streaming, signal: ctx.reqSignal });
@@ -32,9 +39,8 @@ export const speexRouter = createTRPCRouter({
break;
default:
yield { t: 'error', e: 'Unknown dialect' };
const _exhaustiveCheck: never = access;
}
} catch (error) {
yield { t: 'error', e: error instanceof Error ? error.message : 'Synthesis failed' };
}
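
For reference, a backend that satisfies the new SynthesizeBackendFn contract is just an async generator over SpeexSpeechParticle values. A minimal sketch (hypothetical; the 'start' and 'error' particle tags are the ones visible in this router, and audio-chunk particles are omitted):

import type { SynthesizeBackendFn } from './speex.router';
import type { SpeexWire_Access } from './speex.wiretypes';

type OpenAIAccess = Extract<SpeexWire_Access, { dialect: 'openai' }>;

const synthesizeNoop: SynthesizeBackendFn<OpenAIAccess> = async function* ({ text, signal }) {
  yield { t: 'start' };
  if (signal?.aborted) return;
  yield { t: 'error', e: `noop backend: not synthesizing ${text.length} chars` };
};
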
+3 -2
@@ -59,8 +59,9 @@ export namespace SpeexWire {
export const LocalAI_schema = z.object({
dialect: z.literal('localai'),
voiceId: z.string().optional(),
backend: z.string().optional(), // ttsBackend (e.g., 'coqui', 'bark', 'piper', 'vall-e-x')
model: z.string().optional(), // ttsModel (e.g., 'kokoro', 'tts_models/en/ljspeech/glow-tts')
language: z.string().optional(), // for multilingual models like xtts_v2
});
export const OpenAI_schema = z.object({
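
The new backend/model/language fields keep the LocalAI wire schema permissive: everything beyond the dialect literal is an optional string. A quick sanity check, assuming the SpeexWire namespace export shown above:

import { SpeexWire } from './speex.wiretypes';

const ok = SpeexWire.LocalAI_schema.safeParse({
  dialect: 'localai',
  model: 'kokoro',  // ttsModel; backend omitted -> LocalAI picks its default
  language: 'en',   // only meaningful for multilingual models such as xtts_v2
});
// ok.success === true; any other dialect literal fails the z.literal check
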
@@ -1,7 +1,8 @@
import { env } from '~/server/env.server';
import { fetchJsonOrTRPCThrow, fetchResponseOrTRPCThrow } from '~/server/trpc/trpc.router.fetchers';
import type { SpeexWire_Access_ElevenLabs, SpeexWire_ListVoices_Output } from './speex.wiretypes';
import type { SynthesizeBackendFn } from './speex.router';
// configuration
@@ -10,16 +11,7 @@ const MIN_CHUNK_SIZE = 4096;
const DEFAULT_VOICE_ID = '21m00Tcm4TlvDq8ikWAM'; // Rachel
export const synthesizeElevenLabs: SynthesizeBackendFn<SpeexWire_Access_ElevenLabs> = async function* (params) {
const { access, text: inputText, voice, streaming, signal } = params;
// Safety check: trim text that's too long
@@ -27,8 +19,8 @@ export async function* synthesizeElevenLabs(params: SynthesizeElevenLabsParams):
if (text.length > SAFETY_TEXT_LENGTH)
text = text.slice(0, SAFETY_TEXT_LENGTH);
// Build request - narrow to elevenlabs dialect for type safety
const voiceId = (voice.dialect === 'elevenlabs' ? voice.voiceId : undefined) || DEFAULT_VOICE_ID;
const model = voice.model || 'eleven_turbo_v2_5';
const path = `/v1/text-to-speech/${voiceId}${streaming ? '/stream' : ''}`;
const { headers, url } = _elevenlabsAccess(access, path);
@@ -106,7 +98,7 @@ export async function* synthesizeElevenLabs(params: SynthesizeElevenLabsParams):
} catch (error: any) {
yield { t: 'error', e: `ElevenLabs stream error: ${error.message || 'Unknown error'}` };
}
};
export async function listVoicesElevenLabs(access: SpeexWire_Access_ElevenLabs): Promise<SpeexWire_ListVoices_Output> {
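
Callers drive these backends with for-await. A sketch of consuming the ElevenLabs generator directly (illustrative only; the exact access/voice field shapes live in speex.wiretypes):

import { synthesizeElevenLabs } from './synthesize-elevenlabs';
import type { SpeexWire_Access_ElevenLabs, SpeexWire_Voice } from './speex.wiretypes';

async function speakOnce(access: SpeexWire_Access_ElevenLabs, voice: SpeexWire_Voice) {
  for await (const particle of synthesizeElevenLabs({ access, voice, text: 'Hello from Speex', streaming: true })) {
    if (particle.t === 'error') throw new Error(particle.e);
    // 'start' and audio-chunk particles would be handled here
  }
}
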
+130 -86
@@ -5,78 +5,92 @@
* Endpoint: POST /v1/audio/speech
*/
import { fetchJsonOrTRPCThrow, fetchResponseOrTRPCThrow } from '~/server/trpc/trpc.router.fetchers';
import type { SynthesizeBackendFn } from './speex.router';
import type { SpeexWire_Access_OpenAI, SpeexWire_ListVoices_Output } from './speex.wiretypes';
// configuration
const SAFETY_TEXT_LENGTH = 4096; // OpenAI max
const MIN_CHUNK_SIZE = 4096; // bytes
const FALLBACK_OPENAI_MODEL = 'tts-1';
const FALLBACK_OPENAI_VOICE_ID = 'alloy';
/** OpenAI TTS API: POST /v1/audio/speech */
interface OpenAIWire_TTSRequest {
input: string;
model: string; // required: 'tts-1', 'tts-1-hd', 'gpt-4o-mini-tts'
voice: string; // required: 'alloy', 'echo', 'fable', etc.
response_format?: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm';
speed?: number; // 0.25-4.0
instructions?: string; // voice instructions
}
/** LocalAI TTS API: POST /v1/audio/speech (OpenAI-similar) */
interface LocalAIWire_TTSRequest {
input: string;
model?: string; // optional: e.g., 'kokoro'
backend?: string; // optional: 'coqui', 'bark', 'piper', 'transformers-musicgen', 'vall-e-x'
language?: string; // optional: for multilingual models
response_format?: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm'; // defaults to 'wav'; 'mp3' also seems to work well, at least with kokoro
}
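
Side by side, the two body shapes the dialect switch below will produce (values are illustrative, drawn from the field comments above):

const openaiBody: OpenAIWire_TTSRequest = {
  input: 'Hello there',
  model: 'tts-1',          // or 'tts-1-hd', 'gpt-4o-mini-tts'
  voice: 'alloy',          // required by OpenAI
  response_format: 'mp3',
  speed: 1.0,              // valid range 0.25-4.0
};

const localaiBody: LocalAIWire_TTSRequest = {
  input: 'Hello there',
  model: 'kokoro',         // optional; omit to use the instance's configured model
  response_format: 'wav',  // the LocalAI default
};
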
/**
* Synthesize speech using OpenAI-compatible/similar TTS API.
*/
export const synthesizeOpenAIProtocol: SynthesizeBackendFn<SpeexWire_Access_OpenAI> = async function* (params) {
const { access, text: inputText, voice, streaming, signal } = params;
// safety check: trim text that's too long
let text = inputText;
if (text.length > SAFETY_TEXT_LENGTH)
text = text.slice(0, SAFETY_TEXT_LENGTH);
// request.headers
const { host, apiKey } = _resolveAccess(access);
const headers: HeadersInit = {
'Content-Type': 'application/json',
...(!apiKey ? {} : { 'Authorization': `Bearer ${apiKey}` }),
...(!access.orgId ? {} : { 'OpenAI-Organization': access.orgId }),
};
// request.body
let body: OpenAIWire_TTSRequest | LocalAIWire_TTSRequest;
switch (access.dialect) {
case 'localai':
if (voice.dialect !== 'localai') throw new Error('Voice dialect mismatch for LocalAI access');
body = {
input: text,
...(voice.backend ? { backend: voice.backend } : {}),
...(voice.model ? { model: voice.model } : {}),
...(voice.language ? { language: voice.language } : {}),
response_format: streaming ? 'wav' : 'mp3',
} satisfies LocalAIWire_TTSRequest;
break;
case 'openai':
if (voice.dialect !== 'openai') throw new Error('Voice dialect mismatch for OpenAI access');
body = {
input: text,
model: voice.model || FALLBACK_OPENAI_MODEL,
voice: ('voiceId' in voice ? voice.voiceId : undefined) || FALLBACK_OPENAI_VOICE_ID,
...(voice.speed !== undefined ? { speed: voice.speed } : {}),
...(voice.instruction ? { instructions: voice.instruction } : {}),
response_format: streaming ? 'wav' : 'mp3',
} satisfies OpenAIWire_TTSRequest;
break;
}
// connect
let response: Response;
try {
response = await fetchResponseOrTRPCThrow({
url: `${host}/v1/audio/speech`,
method: 'POST',
headers,
body,
@@ -89,7 +103,7 @@ export async function* synthesizeOpenAIProtocol(params: SynthesizeOpenAIParams):
return;
}
// non-streaming: return entire audio at once
if (!streaming) {
try {
const audioArrayBuffer = await response.arrayBuffer();
@@ -102,12 +116,10 @@ export async function* synthesizeOpenAIProtocol(params: SynthesizeOpenAIParams):
return;
}
// streaming: read chunks
const reader = response.body?.getReader();
if (!reader)
return yield { t: 'error', e: 'No stream reader available' };
try {
const accumulatedChunks: Uint8Array[] = [];
@@ -141,51 +153,83 @@ export async function* synthesizeOpenAIProtocol(params: SynthesizeOpenAIParams):
} catch (error: any) {
yield { t: 'error', e: `Stream error: ${error.message || 'Unknown error'}` };
}
};
//
// List Voices - LocalAI
//
const KNOWN_TTS_MODELS: Record<string, { name: string; description: string }> = {
'kokoro': { name: 'Kokoro', description: 'High-quality neural TTS' },
'bark': { name: 'Bark', description: 'Text-to-audio by Suno AI' },
'piper': { name: 'Piper', description: 'Fast local TTS' },
'coqui': { name: 'Coqui', description: 'Coqui TTS engine' },
'vall-e-x': { name: 'VALL-E X', description: 'Zero-shot voice cloning' },
'tts-1': { name: 'TTS-1', description: 'OpenAI-compatible TTS' },
'tts-1-hd': { name: 'TTS-1 HD', description: 'High-definition TTS' },
};
/** LocalAI GET /v1/models response */
interface LocalAIWire_ModelsResponse {
object: 'list';
data: Array<{ id: string; object: 'model' }>;
}
/**
* List available TTS models from a LocalAI instance
*/
export async function listVoicesLocalAI(access: SpeexWire_Access_OpenAI): Promise<SpeexWire_ListVoices_Output> {
if (access.dialect !== 'localai')
throw new Error('listVoicesLocalAI requires localai dialect');
const { host, apiKey } = _resolveAccess(access);
const headers: HeadersInit = {
'Content-Type': 'application/json',
...(!apiKey ? {} : { 'Authorization': `Bearer ${apiKey}` }),
};
let modelsResponse: LocalAIWire_ModelsResponse;
try {
modelsResponse = await fetchJsonOrTRPCThrow<LocalAIWire_ModelsResponse>({
url: `${host}/v1/models`,
headers,
name: 'LocalAI',
});
} catch (error: any) {
console.warn('[listVoicesLocalAI] Failed to fetch models:', error.message);
return { voices: [] };
}
// Filter to known TTS models only
const ttsModels = modelsResponse.data.filter(model => model.id in KNOWN_TTS_MODELS);
return {
voices: ttsModels.map(model => ({
id: model.id,
name: KNOWN_TTS_MODELS[model.id].name,
description: KNOWN_TTS_MODELS[model.id].description,
})),
};
}
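
Illustrative data flow for listVoicesLocalAI: given this hypothetical GET /v1/models payload from a LocalAI instance...

const sample: LocalAIWire_ModelsResponse = {
  object: 'list',
  data: [
    { id: 'kokoro', object: 'model' },
    { id: 'whisper-1', object: 'model' }, // not in KNOWN_TTS_MODELS, filtered out
  ],
};
// ...the function resolves to:
// { voices: [{ id: 'kokoro', name: 'Kokoro', description: 'High-quality neural TTS' }] }
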
// Helpers
function _resolveAccess(access: Readonly<SpeexWire_Access_OpenAI>): { host: string; apiKey: string } {
// determine host
const isOpenAI = access.dialect === 'openai';
let host = isOpenAI
? (access.apiHost || 'https://api.openai.com').trim()
: (access.apiHost || '').trim();
if (!host) throw new Error('LocalAI requires a host URL');
if (!host.startsWith('http')) {
// noinspection HttpUrlsUsage
host = isOpenAI ? `https://${host}` : `http://${host}`; // LocalAI is often local, default to http
}
if (host.endsWith('/'))
host = host.slice(0, -1);
return { host, apiKey: access.apiKey || '' };
}
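
Reading the normalization rules off _resolveAccess (module-private, so these calls are illustrative; the access literals assume only the dialect/apiHost/apiKey fields used here):

// LocalAI: bare hosts default to http (often a LAN machine); a missing host throws
_resolveAccess({ dialect: 'localai', apiHost: 'localhost:8080' } as SpeexWire_Access_OpenAI);
// -> { host: 'http://localhost:8080', apiKey: '' }

// OpenAI: a missing host falls back to the official endpoint; https is assumed, trailing '/' stripped
_resolveAccess({ dialect: 'openai', apiHost: 'api.openai.com/' } as SpeexWire_Access_OpenAI);
// -> { host: 'https://api.openai.com', apiKey: '' }
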
+10 -11
@@ -10,9 +10,9 @@ import { findModelsServiceOrNull } from '~/common/stores/llms/store-llms';
import { AudioLivePlayer } from '~/common/util/audio/AudioLivePlayer';
import type { SpeexSpeakResult } from './speex.client';
import type { DCredentialsApiKey, DCredentialsLLMSService, DSpeexCredentials, DSpeexEngineAny, SpeexRPCDialect } from './speex.types';
import type { SpeexWire_Access, SpeexWire_ListVoices_Output, SpeexWire_Voice } from './server/speex.wiretypes';
// Configuration
@@ -74,26 +74,24 @@ export async function speexSynthesizeRPC(
const audioBuffer = _base64ToArrayBuffer(particle.base64);
// Playback
if (options.playback && audioPlayer)
audioPlayer.enqueueChunk(audioBuffer);
// Accumulate for return
if (options.returnAudio)
audioChunks.push(audioBuffer);
// Callback
callbacks?.onChunk?.(audioBuffer);
break;
case 'done':
if (audioPlayer)
audioPlayer.endPlayback();
break;
case 'error':
// noinspection ExceptionCaughtLocallyJS
throw new Error(particle.e);
}
}
@@ -206,11 +204,11 @@ function _resolveFromLLMService(dialect: SpeexRPCDialect, credentials: DCredenti
if (!service) return null;
// Extract credentials based on LLM vendor type
const setup = service.setup as Record<string, any> || {};
switch (dialect) {
case 'elevenlabs':
// ElevenLabs doesn't link to LLM services
return null;
case 'openai':
@@ -261,8 +259,9 @@ function _buildWireVoice(engine: DSpeexEngineAny): SpeexWire_Voice {
case 'localai':
return {
dialect: 'localai',
voiceId: voice.voiceId,
backend: voice.ttsBackend,
model: voice.ttsModel,
language: voice.language,
};
case 'webspeech':
+5 -2
@@ -48,10 +48,13 @@ export interface DVoiceElevenLabs {
// speakerBoost?: boolean;
}
// type LocalAITTSBackend = | 'coqui' | 'bark' | 'piper' | 'transformers-musicgen' | 'vall-e-x'
export interface DVoiceLocalAI {
vendorType: 'localai';
voiceId?: string;
// we let the user insert free-form strings (or nothing) for the fields below
ttsModel?: string; // Model name (e.g., 'kokoro', 'tts_models/en/ljspeech/glow-tts', 'v2/en_speaker_4' for bark)
ttsBackend?: string; // Backend (e.g., 'coqui', 'bark', 'piper', 'transformers-musicgen', 'vall-e-x')
language?: string; // Language code for multilingual models (e.g., 'en', 'fr' for xtts_v2)
}
export interface DVoiceOpenAI {
+14 -2
@@ -1,6 +1,17 @@
import type { ISpeexVendor } from './ISpeexVendor';
/**
* LocalAI TTS Vendor
*
* LocalAI supports multiple TTS backends: coqui, bark, piper, transformers-musicgen, vall-e-x.
* When no backend is specified, LocalAI uses its default configuration.
*
* Default recommendation: use the 'kokoro' model without specifying a backend for the best
* out-of-the-box experience with high-quality neural TTS.
*
* @see https://localai.io/features/text-to-audio/
*/
export const SpeexVendorLocalAI: ISpeexVendor<'localai'> = {
vendorType: 'localai',
name: 'LocalAI',
@@ -26,7 +37,8 @@ export const SpeexVendorLocalAI: ISpeexVendor<'localai'> = {
getDefaultVoice: () => ({
vendorType: 'localai',
voiceId: undefined,
ttsBackend: undefined,
ttsModel: 'kokoro', // recommended default - high quality neural TTS
language: undefined,
}),
};
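
Usage sketch: the vendor default now ships with kokoro preselected, and callers can still override per backend (the bark speaker preset below comes from the ttsModel comment in speex.types):

const voice = SpeexVendorLocalAI.getDefaultVoice();
// -> { vendorType: 'localai', voiceId: undefined, ttsBackend: undefined, ttsModel: 'kokoro', language: undefined }

const barkVoice = { ...voice, ttsBackend: 'bark', ttsModel: 'v2/en_speaker_4' };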