Speex: LocalAI vendor

This commit is contained in:
Enrico Ros
2025-11-26 02:17:00 -08:00
parent a003600839
commit e62ffa02e9
8 changed files with 193 additions and 135 deletions
@@ -230,23 +230,24 @@ function LocalAIConfig({ engine, onUpdate, mode }: {
const voice = engine.voice as DVoiceLocalAI;
return <>
<FormControl>
<FormLabelStart title='Voice ID' description='LocalAI voice identifier' />
<Input
value={voice.voiceId ?? ''}
onChange={(e) => onUpdate({ voice: { ...voice, voiceId: e.target.value } })}
placeholder='e.g., en-us-amy-low'
/>
<FormHelperText>Depends on your LocalAI TTS configuration</FormHelperText>
</FormControl>
<FormControl>
<FormLabelStart title='Model' description='TTS model name' />
<Input
value={voice.ttsModel ?? ''}
onChange={(e) => onUpdate({ voice: { ...voice, ttsModel: e.target.value } })}
placeholder='e.g., kokoro'
/>
<FormHelperText>Model to use for speech synthesis</FormHelperText>
</FormControl>
<FormControl>
<FormLabelStart title='Backend' description='TTS backend (optional)' />
<Input
value={voice.ttsBackend ?? ''}
onChange={(e) => onUpdate({ voice: { ...voice, ttsBackend: e.target.value || undefined } })}
placeholder='e.g., coqui, bark, piper'
/>
<FormHelperText>Leave empty for default backend</FormHelperText>
</FormControl>
</>;
}
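
Note on the Backend input above: the onChange handler maps an empty string to undefined (e.target.value || undefined), so clearing the field removes it from the voice object instead of persisting ''. A minimal sketch of that pattern, assuming the DVoiceLocalAI shape defined later in this commit:

// hypothetical snippet; DVoiceLocalAI comes from speex.types below
const voice: DVoiceLocalAI = { vendorType: 'localai', ttsModel: 'kokoro' };
const cleared: DVoiceLocalAI = { ...voice, ttsBackend: '' || undefined };
// cleared.ttsBackend === undefined, not ''
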
+13 -7
@@ -1,11 +1,21 @@
import { createTRPCRouter, edgeProcedure } from '~/server/trpc/trpc.server';
import { SpeexSpeechParticle, SpeexWire, SpeexWire_Access, SpeexWire_ListVoices_Output, SpeexWire_Voice } from './speex.wiretypes';
import { listVoicesElevenLabs, synthesizeElevenLabs } from './synthesize-elevenlabs';
import { synthesizeOpenAIProtocol } from './synthesize-openai';
interface SynthesizeBackendFnParams<TSpeexAccess extends SpeexWire_Access> {
access: TSpeexAccess;
text: string;
voice: SpeexWire_Voice;
streaming: boolean;
signal?: AbortSignal;
}
export type SynthesizeBackendFn<TSpeexAccess extends SpeexWire_Access> = (params: SynthesizeBackendFnParams<TSpeexAccess>) => AsyncGenerator<SpeexSpeechParticle>;
export const speexRouter = createTRPCRouter({
/**
@@ -16,11 +26,8 @@ export const speexRouter = createTRPCRouter({
.input(SpeexWire.Synthesize_input_schema)
.mutation(async function* ({ input, ctx }): AsyncGenerator<SpeexSpeechParticle> {
const { access, text, voice, streaming } = input;
try {
yield { t: 'start' };
// Route based on access.dialect discriminant
switch (access.dialect) {
case 'elevenlabs':
yield* synthesizeElevenLabs({ access, text, voice, streaming, signal: ctx.reqSignal });
@@ -32,9 +39,8 @@ export const speexRouter = createTRPCRouter({
break;
default:
yield { t: 'error', e: 'Unknown dialect' };
const _exhaustiveCheck: never = access;
}
} catch (error) {
yield { t: 'error', e: error instanceof Error ? error.message : 'Synthesis failed' };
}
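
For reference, a backend that satisfies the new SynthesizeBackendFn contract is just an async generator over SpeexSpeechParticle values. A minimal sketch (hypothetical; the 'start' and 'error' particle tags are the ones visible in this router, and audio-chunk particles are omitted):

import type { SynthesizeBackendFn } from './speex.router';
import type { SpeexWire_Access } from './speex.wiretypes';

type OpenAIAccess = Extract<SpeexWire_Access, { dialect: 'openai' }>;

const synthesizeNoop: SynthesizeBackendFn<OpenAIAccess> = async function* ({ text, signal }) {
  yield { t: 'start' };
  if (signal?.aborted) return;
  yield { t: 'error', e: `noop backend: not synthesizing ${text.length} chars` };
};
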
+3 -2
@@ -59,8 +59,9 @@ export namespace SpeexWire {
export const LocalAI_schema = z.object({
dialect: z.literal('localai'),
voiceId: z.string().optional(),
backend: z.string().optional(), // ttsBackend (e.g., 'coqui', 'bark', 'piper', 'vall-e-x')
model: z.string().optional(), // ttsModel (e.g., 'kokoro', 'tts_models/en/ljspeech/glow-tts')
language: z.string().optional(), // for multilingual models like xtts_v2
});
export const OpenAI_schema = z.object({
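
The new backend/model/language fields keep the LocalAI wire schema permissive: everything beyond the dialect literal is an optional string. A quick sanity check, assuming the SpeexWire namespace export shown above:

import { SpeexWire } from './speex.wiretypes';

const ok = SpeexWire.LocalAI_schema.safeParse({
  dialect: 'localai',
  model: 'kokoro',  // ttsModel; backend omitted -> LocalAI picks its default
  language: 'en',   // only meaningful for multilingual models such as xtts_v2
});
// ok.success === true; any other dialect literal fails the z.literal check
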
@@ -1,7 +1,8 @@
import { env } from '~/server/env.server';
import { fetchJsonOrTRPCThrow, fetchResponseOrTRPCThrow } from '~/server/trpc/trpc.router.fetchers';
import type { SpeexWire_Access_ElevenLabs, SpeexWire_ListVoices_Output } from './speex.wiretypes';
import type { SynthesizeBackendFn } from './speex.router';
// configuration
@@ -10,16 +11,7 @@ const MIN_CHUNK_SIZE = 4096;
const DEFAULT_VOICE_ID = '21m00Tcm4TlvDq8ikWAM'; // Rachel
export const synthesizeElevenLabs: SynthesizeBackendFn<SpeexWire_Access_ElevenLabs> = async function* (params) {
const { access, text: inputText, voice, streaming, signal } = params;
// Safety check: trim text that's too long
@@ -27,8 +19,8 @@ export async function* synthesizeElevenLabs(params: SynthesizeElevenLabsParams):
if (text.length > SAFETY_TEXT_LENGTH)
text = text.slice(0, SAFETY_TEXT_LENGTH);
// Build request - narrow to elevenlabs dialect for type safety
const voiceId = (voice.dialect === 'elevenlabs' ? voice.voiceId : undefined) || DEFAULT_VOICE_ID;
const model = voice.model || 'eleven_turbo_v2_5';
const path = `/v1/text-to-speech/${voiceId}${streaming ? '/stream' : ''}`;
const { headers, url } = _elevenlabsAccess(access, path);
@@ -106,7 +98,7 @@ export async function* synthesizeElevenLabs(params: SynthesizeElevenLabsParams):
} catch (error: any) {
yield { t: 'error', e: `ElevenLabs stream error: ${error.message || 'Unknown error'}` };
}
};
export async function listVoicesElevenLabs(access: SpeexWire_Access_ElevenLabs): Promise<SpeexWire_ListVoices_Output> {
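
Callers drive these backends with for-await. A sketch of consuming the ElevenLabs generator directly (illustrative only; the exact access/voice field shapes live in speex.wiretypes):

import { synthesizeElevenLabs } from './synthesize-elevenlabs';
import type { SpeexWire_Access_ElevenLabs, SpeexWire_Voice } from './speex.wiretypes';

async function speakOnce(access: SpeexWire_Access_ElevenLabs, voice: SpeexWire_Voice) {
  for await (const particle of synthesizeElevenLabs({ access, voice, text: 'Hello from Speex', streaming: true })) {
    if (particle.t === 'error') throw new Error(particle.e);
    // 'start' and audio-chunk particles would be handled here
  }
}
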
+130 -86
@@ -5,78 +5,92 @@
* Endpoint: POST /v1/audio/speech
*/
import { fetchJsonOrTRPCThrow, fetchResponseOrTRPCThrow } from '~/server/trpc/trpc.router.fetchers';
import type { SynthesizeBackendFn } from './speex.router';
import type { SpeexWire_Access_OpenAI, SpeexWire_ListVoices_Output } from './speex.wiretypes';
// configuration
const SAFETY_TEXT_LENGTH = 4096; // OpenAI max
const MIN_CHUNK_SIZE = 4096; // bytes
const FALLBACK_OPENAI_MODEL = 'tts-1';
const FALLBACK_OPENAI_VOICE_ID = 'alloy';
/** OpenAI TTS API: POST /v1/audio/speech */
interface OpenAIWire_TTSRequest {
input: string;
model: string; // required: 'tts-1', 'tts-1-hd', 'gpt-4o-mini-tts'
voice: string; // required: 'alloy', 'echo', 'fable', etc.
response_format?: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm';
speed?: number; // 0.25-4.0
instructions?: string; // voice instructions
}
/** LocalAI TTS API: POST /v1/audio/speech (OpenAI-similar) */
interface LocalAIWire_TTSRequest {
input: string;
model?: string; // optional: e.g., 'kokoro'
backend?: string; // optional: 'coqui', 'bark', 'piper', 'transformers-musicgen', 'vall-e-x'
language?: string; // optional: for multilingual models
response_format?: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm'; // defaults to 'wav'; 'mp3' also seems to work well, at least with kokoro
}
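
Side by side, the two body shapes the dialect switch below will produce (values are illustrative, drawn from the field comments above):

const openaiBody: OpenAIWire_TTSRequest = {
  input: 'Hello there',
  model: 'tts-1',          // or 'tts-1-hd', 'gpt-4o-mini-tts'
  voice: 'alloy',          // required by OpenAI
  response_format: 'mp3',
  speed: 1.0,              // valid range 0.25-4.0
};

const localaiBody: LocalAIWire_TTSRequest = {
  input: 'Hello there',
  model: 'kokoro',         // optional; omit to use the instance's configured model
  response_format: 'wav',  // the LocalAI default
};
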
/**
* Synthesize speech using OpenAI-compatible/similar TTS API.
*/
export const synthesizeOpenAIProtocol: SynthesizeBackendFn<SpeexWire_Access_OpenAI> = async function* (params) {
const { access, text: inputText, voice, streaming, signal } = params;
// safety check: trim text that's too long
let text = inputText;
if (text.length > SAFETY_TEXT_LENGTH)
text = text.slice(0, SAFETY_TEXT_LENGTH);
// request.headers
const { host, apiKey } = _resolveAccess(access);
const headers: HeadersInit = {
'Content-Type': 'application/json',
...(!apiKey ? {} : { 'Authorization': `Bearer ${apiKey}` }),
...(!access.orgId ? {} : { 'OpenAI-Organization': access.orgId }),
};
// request.body
let body: OpenAIWire_TTSRequest | LocalAIWire_TTSRequest;
switch (access.dialect) {
case 'localai':
if (voice.dialect !== 'localai') throw new Error('Voice dialect mismatch for LocalAI access');
body = {
input: text,
...(voice.backend ? { backend: voice.backend } : {}),
...(voice.model ? { model: voice.model } : {}),
...(voice.language ? { language: voice.language } : {}),
response_format: streaming ? 'wav' : 'mp3',
} satisfies LocalAIWire_TTSRequest;
break;
case 'openai':
if (voice.dialect !== 'openai') throw new Error('Voice dialect mismatch for OpenAI access');
body = {
input: text,
model: voice.model || FALLBACK_OPENAI_MODEL,
voice: ('voiceId' in voice ? voice.voiceId : undefined) || FALLBACK_OPENAI_VOICE_ID,
...(voice.speed !== undefined ? { speed: voice.speed } : {}),
...(voice.instruction ? { instructions: voice.instruction } : {}),
response_format: streaming ? 'wav' : 'mp3',
} satisfies OpenAIWire_TTSRequest;
break;
}
// connect
let response: Response;
try {
response = await fetchResponseOrTRPCThrow({
url: `${host}/v1/audio/speech`,
method: 'POST',
headers,
body,
@@ -89,7 +103,7 @@ export async function* synthesizeOpenAIProtocol(params: SynthesizeOpenAIParams):
return;
}
// non-streaming: return entire audio at once
if (!streaming) {
try {
const audioArrayBuffer = await response.arrayBuffer();
@@ -102,12 +116,10 @@ export async function* synthesizeOpenAIProtocol(params: SynthesizeOpenAIParams):
return;
}
// streaming: read chunks
const reader = response.body?.getReader();
if (!reader)
return yield { t: 'error', e: 'No stream reader available' };
try {
const accumulatedChunks: Uint8Array[] = [];
@@ -141,51 +153,83 @@ export async function* synthesizeOpenAIProtocol(params: SynthesizeOpenAIParams):
} catch (error: any) {
yield { t: 'error', e: `Stream error: ${error.message || 'Unknown error'}` };
}
};
//
// List Voices - LocalAI
//
const KNOWN_TTS_MODELS: Record<string, { name: string; description: string }> = {
'kokoro': { name: 'Kokoro', description: 'High-quality neural TTS' },
'bark': { name: 'Bark', description: 'Text-to-audio by Suno AI' },
'piper': { name: 'Piper', description: 'Fast local TTS' },
'coqui': { name: 'Coqui', description: 'Coqui TTS engine' },
'vall-e-x': { name: 'VALL-E X', description: 'Zero-shot voice cloning' },
'tts-1': { name: 'TTS-1', description: 'OpenAI-compatible TTS' },
'tts-1-hd': { name: 'TTS-1 HD', description: 'High-definition TTS' },
};
/** LocalAI GET /v1/models response */
interface LocalAIWire_ModelsResponse {
object: 'list';
data: Array<{ id: string; object: 'model' }>;
}
/**
* List available TTS models from a LocalAI instance
*/
export async function listVoicesLocalAI(access: SpeexWire_Access_OpenAI): Promise<SpeexWire_ListVoices_Output> {
if (access.dialect !== 'localai')
throw new Error('listVoicesLocalAI requires localai dialect');
const { host, apiKey } = _resolveAccess(access);
const headers: HeadersInit = {
'Content-Type': 'application/json',
...(!apiKey ? {} : { 'Authorization': `Bearer ${apiKey}` }),
};
let modelsResponse: LocalAIWire_ModelsResponse;
try {
modelsResponse = await fetchJsonOrTRPCThrow<LocalAIWire_ModelsResponse>({
url: `${host}/v1/models`,
headers,
name: 'LocalAI',
});
} catch (error: any) {
console.warn('[listVoicesLocalAI] Failed to fetch models:', error.message);
return { voices: [] };
}
// Filter to known TTS models only
const ttsModels = modelsResponse.data.filter(model => model.id in KNOWN_TTS_MODELS);
return {
voices: ttsModels.map(model => ({
id: model.id,
name: KNOWN_TTS_MODELS[model.id].name,
description: KNOWN_TTS_MODELS[model.id].description,
})),
};
}
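
Illustrative data flow for listVoicesLocalAI: given this hypothetical GET /v1/models payload from a LocalAI instance...

const sample: LocalAIWire_ModelsResponse = {
  object: 'list',
  data: [
    { id: 'kokoro', object: 'model' },
    { id: 'whisper-1', object: 'model' }, // not in KNOWN_TTS_MODELS, filtered out
  ],
};
// ...the function resolves to:
// { voices: [{ id: 'kokoro', name: 'Kokoro', description: 'High-quality neural TTS' }] }
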
// Helpers
function _resolveAccess(access: Readonly<SpeexWire_Access_OpenAI>): { host: string; apiKey: string } {
// determine host
const isOpenAI = access.dialect === 'openai';
let host = isOpenAI
? (access.apiHost || 'https://api.openai.com').trim()
: (access.apiHost || '').trim();
if (!host) throw new Error('LocalAI requires a host URL');
if (!host.startsWith('http')) {
// noinspection HttpUrlsUsage
host = isOpenAI ? `https://${host}` : `http://${host}`; // LocalAI is often local, default to http
}
if (host.endsWith('/'))
host = host.slice(0, -1);
return { host, apiKey: access.apiKey || '' };
}
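
Reading the normalization rules off _resolveAccess (module-private, so these calls are illustrative; the access literals assume only the dialect/apiHost/apiKey fields used here):

// LocalAI: bare hosts default to http (often a LAN machine); a missing host throws
_resolveAccess({ dialect: 'localai', apiHost: 'localhost:8080' } as SpeexWire_Access_OpenAI);
// -> { host: 'http://localhost:8080', apiKey: '' }

// OpenAI: a missing host falls back to the official endpoint; https is assumed, trailing '/' stripped
_resolveAccess({ dialect: 'openai', apiHost: 'api.openai.com/' } as SpeexWire_Access_OpenAI);
// -> { host: 'https://api.openai.com', apiKey: '' }
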
+10 -11
@@ -10,9 +10,9 @@ import { findModelsServiceOrNull } from '~/common/stores/llms/store-llms';
import { AudioLivePlayer } from '~/common/util/audio/AudioLivePlayer';
import type { SpeexSpeakResult } from './speex.client';
import type { DCredentialsApiKey, DCredentialsLLMSService, DSpeexCredentials, DSpeexEngineAny, SpeexRPCDialect } from './speex.types';
import type { SpeexWire_Access, SpeexWire_ListVoices_Output, SpeexWire_Voice } from './server/speex.wiretypes';
// Configuration
@@ -74,26 +74,24 @@ export async function speexSynthesizeRPC(
const audioBuffer = _base64ToArrayBuffer(particle.base64);
// Playback
if (options.playback && audioPlayer)
audioPlayer.enqueueChunk(audioBuffer);
// Accumulate for return
if (options.returnAudio)
audioChunks.push(audioBuffer);
// Callback
callbacks?.onChunk?.(audioBuffer);
break;
case 'done':
if (audioPlayer)
audioPlayer.endPlayback();
break;
case 'error':
// noinspection ExceptionCaughtLocallyJS
throw new Error(particle.e);
}
}
@@ -206,11 +204,11 @@ function _resolveFromLLMService(dialect: SpeexRPCDialect, credentials: DCredenti
if (!service) return null;
// Extract credentials based on LLM vendor type
const setup = service.setup as Record<string, any> || {};
switch (dialect) {
case 'elevenlabs':
// ElevenLabs doesn't link to LLM services
return null;
case 'openai':
@@ -261,8 +259,9 @@ function _buildWireVoice(engine: DSpeexEngineAny): SpeexWire_Voice {
case 'localai':
return {
dialect: 'localai',
voiceId: voice.voiceId,
backend: voice.ttsBackend,
model: voice.ttsModel,
language: voice.language,
};
case 'webspeech':
+5 -2
@@ -48,10 +48,13 @@ export interface DVoiceElevenLabs {
// speakerBoost?: boolean;
}
// type LocalAITTSBackend = | 'coqui' | 'bark' | 'piper' | 'transformers-musicgen' | 'vall-e-x'
export interface DVoiceLocalAI {
vendorType: 'localai';
voiceId?: string;
// we let the user insert free-form strings (or nothing) for the fields below
ttsModel?: string; // Model name (e.g., 'kokoro', 'tts_models/en/ljspeech/glow-tts', 'v2/en_speaker_4' for bark)
ttsBackend?: string; // Backend (e.g., 'coqui', 'bark', 'piper', 'transformers-musicgen', 'vall-e-x')
language?: string; // Language code for multilingual models (e.g., 'en', 'fr' for xtts_v2)
}
export interface DVoiceOpenAI {
+14 -2
@@ -1,6 +1,17 @@
import type { ISpeexVendor } from './ISpeexVendor';
/**
* LocalAI TTS Vendor
*
* LocalAI supports multiple TTS backends: coqui, bark, piper, transformers-musicgen, vall-e-x.
* When no backend is specified, LocalAI uses its default configuration.
*
* Default recommendation: use the 'kokoro' model without specifying a backend for the best
* out-of-the-box experience with high-quality neural TTS.
*
* @see https://localai.io/features/text-to-audio/
*/
export const SpeexVendorLocalAI: ISpeexVendor<'localai'> = {
vendorType: 'localai',
name: 'LocalAI',
@@ -26,7 +37,8 @@ export const SpeexVendorLocalAI: ISpeexVendor<'localai'> = {
getDefaultVoice: () => ({
vendorType: 'localai',
voiceId: undefined,
ttsBackend: undefined,
ttsModel: 'kokoro', // recommended default - high quality neural TTS
language: undefined,
}),
};
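
Usage sketch: the vendor default now ships with kokoro preselected, and callers can still override per backend (the bark speaker preset below comes from the ttsModel comment in speex.types):

const voice = SpeexVendorLocalAI.getDefaultVoice();
// -> { vendorType: 'localai', voiceId: undefined, ttsBackend: undefined, ttsModel: 'kokoro', language: undefined }

const barkVoice = { ...voice, ttsBackend: 'bark', ttsModel: 'v2/en_speaker_4' };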