mirror of
https://github.com/enricoros/big-AGI.git
synced 2026-05-10 21:50:14 -07:00
Speex: +Inworld
This commit is contained in:
@@ -37,7 +37,7 @@ async function _getSpeexCsfModule() {
|
||||
// --- /CSF
|
||||
|
||||
|
||||
type _DSpeexEngineRPC = DSpeexEngine<'elevenlabs'> | DSpeexEngine<'localai'> | DSpeexEngine<'openai'>;
|
||||
type _DSpeexEngineRPC = DSpeexEngine<'elevenlabs'> | DSpeexEngine<'inworld'> | DSpeexEngine<'localai'> | DSpeexEngine<'openai'>;
|
||||
|
||||
|
||||
/**
|
||||
@@ -177,8 +177,9 @@ function _buildRPCWireAccess({ credentials: c, vendorType }: _DSpeexEngineRPC):
|
||||
case 'api-key':
|
||||
switch (vendorType) {
|
||||
case 'elevenlabs':
|
||||
case 'inworld':
|
||||
return {
|
||||
dialect: 'elevenlabs',
|
||||
dialect: vendorType,
|
||||
apiKey: c.apiKey,
|
||||
...(c.apiHost && { apiHost: c.apiHost }),
|
||||
};
|
||||
@@ -203,7 +204,8 @@ function _buildRPCWireAccess({ credentials: c, vendorType }: _DSpeexEngineRPC):
|
||||
if (!service) return null;
|
||||
switch (vendorType) {
|
||||
case 'elevenlabs':
|
||||
// no linking for ElevenLabs - we shall NOT be here
|
||||
case 'inworld':
|
||||
// no linking for ElevenLabs or Inworld - we shall NOT be here
|
||||
return null;
|
||||
|
||||
case 'openai':
|
||||
@@ -240,7 +242,22 @@ function _shouldUseCSF({ credentials: c, vendorType }: _DSpeexEngineRPC): boolea
|
||||
switch (c.type) {
|
||||
case 'api-key':
|
||||
// Auto-enable CSF for local URLs (LocalAI typically runs locally)
|
||||
return vendorType === 'localai' && isLocalUrl(c.apiHost);
|
||||
switch (vendorType) {
|
||||
case 'inworld':
|
||||
return false; // Inworld has blocked CORS policy - never CSF
|
||||
|
||||
case 'localai':
|
||||
return isLocalUrl(c.apiHost);
|
||||
|
||||
default:
|
||||
const _exhaustiveCheck: never = vendorType;
|
||||
// fallthrough
|
||||
case 'elevenlabs':
|
||||
case 'openai':
|
||||
break;
|
||||
}
|
||||
// NOTE: we should have a switch or something
|
||||
return false;
|
||||
|
||||
case 'llms-service':
|
||||
const service = findModelsServiceOrNull(c.serviceId);
|
||||
|
||||
@@ -15,6 +15,7 @@ export type SpeexSpeechParticle =
|
||||
|
||||
export type SpeexWire_Access = z.infer<typeof SpeexWire.Access_schema>;
|
||||
export type SpeexWire_Access_ElevenLabs = z.infer<typeof SpeexWire.AccessElevenLabs_schema>;
|
||||
export type SpeexWire_Access_Inworld = z.infer<typeof SpeexWire.AccessInworld_schema>;
|
||||
export type SpeexWire_Access_OpenAI = z.infer<typeof SpeexWire.AccessOpenAI_schema>;
|
||||
|
||||
export type SpeexWire_Voice = z.infer<typeof SpeexWire.Voice_schema>;
|
||||
@@ -39,6 +40,12 @@ export namespace SpeexWire {
|
||||
apiHost: z.string().optional(),
|
||||
});
|
||||
|
||||
export const AccessInworld_schema = z.object({
|
||||
dialect: z.literal('inworld'),
|
||||
apiKey: z.string(), // base64-encoded API key from Inworld Portal
|
||||
apiHost: z.string().optional(), // defaults to api.inworld.ai
|
||||
});
|
||||
|
||||
export const AccessOpenAI_schema = z.object({
|
||||
dialect: z.enum(['localai', 'openai']),
|
||||
apiKey: z.string().optional(), // openai: required, localai: optional
|
||||
@@ -47,7 +54,7 @@ export namespace SpeexWire {
|
||||
});
|
||||
|
||||
export const Access_schema = z.discriminatedUnion('dialect',
|
||||
[AccessElevenLabs_schema, AccessOpenAI_schema],
|
||||
[AccessElevenLabs_schema, AccessInworld_schema, AccessOpenAI_schema],
|
||||
);
|
||||
|
||||
|
||||
@@ -59,6 +66,14 @@ export namespace SpeexWire {
|
||||
ttsVoiceId: z.string().optional(),
|
||||
});
|
||||
|
||||
export const VoiceInworld_schema = z.object({
|
||||
dialect: z.literal('inworld'),
|
||||
ttsModel: z.enum(['inworld-tts-1.5-max', 'inworld-tts-1.5-mini']).optional(),
|
||||
ttsVoiceId: z.string().optional(),
|
||||
ttsTemperature: z.number().min(0).max(2).optional(),
|
||||
ttsSpeakingRate: z.number().min(0.5).max(1.5).optional(),
|
||||
});
|
||||
|
||||
export const VoiceLocalAI_schema = z.object({
|
||||
dialect: z.literal('localai'),
|
||||
ttsBackend: z.string().optional(), // e.g., 'coqui', 'bark', 'piper', 'vall-e-x'
|
||||
@@ -75,7 +90,7 @@ export namespace SpeexWire {
|
||||
});
|
||||
|
||||
export const Voice_schema = z.discriminatedUnion('dialect',
|
||||
[VoiceElevenLabs_schema, VoiceLocalAI_schema, VoiceOpenAI_schema],
|
||||
[VoiceElevenLabs_schema, VoiceInworld_schema, VoiceLocalAI_schema, VoiceOpenAI_schema],
|
||||
);
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,290 @@
|
||||
/**
|
||||
* Inworld AI TTS Synthesizer
|
||||
*
|
||||
* Implements Inworld's Text-to-Speech API:
|
||||
* - Non-streaming: POST /tts/v1/voice
|
||||
* - Streaming: POST /tts/v1/voice:stream (newline-delimited JSON)
|
||||
* - Authentication: Basic auth with base64-encoded API key
|
||||
*
|
||||
* API Reference: https://docs.inworld.ai/api-reference/ttsAPI/texttospeech/synthesize-speech
|
||||
*/
|
||||
|
||||
import * as z from 'zod/v4';
|
||||
|
||||
import { fetchJsonOrTRPCThrow, fetchResponseOrTRPCThrow } from '~/server/trpc/trpc.router.fetchers';
|
||||
|
||||
import type { SpeexSpeechParticle, SpeexWire_Access_Inworld, SpeexWire_ListVoices_Output } from './rpc.wiretypes';
|
||||
import type { SynthesizeBackendFn } from './synthesize.core';
|
||||
import { SPEEX_DEBUG, SPEEX_DEFAULTS } from '../../speex.config';
|
||||
|
||||
|
||||
/**
 * Wire schemas for the Inworld speech synthesis endpoints (non-streaming and streaming).
 * API Reference: https://docs.inworld.ai/api-reference/ttsAPI/texttospeech/synthesize-speech
 *
 * The request side is deliberately permissive: known enum values are listed, but any string
 * is accepted and numeric ranges are not enforced here - the server validates constraints.
 */
export namespace InworldWire_TTS_Synthesize {

  // -- Request --

  const _AudioConfig_schema = z.object({
    audioEncoding: z.union([
      z.enum(['LINEAR16', 'MP3', 'OGG_OPUS', 'ALAW', 'MULAW', 'FLAC']),
      z.string(),
    ]).optional(),
    sampleRateHertz: z.number().optional(), // 8000-48000 Hz
    speakingRate: z.number().optional(), // 0.5x to 1.5x speed
    bitRate: z.number().optional(), // for compressed formats
  });

  export const Request_schema = z.object({
    text: z.string(), // max 2000 chars per request
    voiceId: z.string(),
    modelId: z.string(), // e.g., 'inworld-tts-1.5-max', 'inworld-tts-1.5-mini'
    temperature: z.number().optional(), // 0-2, default 1.1
    applyTextNormalization: z.union([
      z.enum(['ON', 'OFF', 'UNSPECIFIED']),
      z.string(),
    ]).optional(),
    audioConfig: _AudioConfig_schema.optional(),
    timestampType: z.union([
      z.enum(['WORD', 'CHARACTER', 'UNSPECIFIED']),
      z.string(),
    ]).optional(),
  });
  export type Request = z.infer<typeof Request_schema>;


  // -- Response (non-streaming) --

  const _Usage_schema = z.object({
    processedCharactersCount: z.number().optional(),
    modelId: z.string().optional(),
  });

  export const Response_schema = z.object({
    audioContent: z.string(), // base64-encoded audio (max 16MB)
    usage: _Usage_schema.optional(),
    timestampInfo: z.object({
      wordAlignment: z.unknown().optional(),
      characterAlignment: z.unknown().optional(),
    }).optional(),
    phoneticDetails: z.array(z.unknown()).optional(), // TTS 1.5 models only
  });
  export type Response = z.infer<typeof Response_schema>;


  // -- Streaming chunk: one JSON object per line, payload wrapped in "result" --

  const _StreamChunkResult_schema = z.object({
    audioContent: z.string().optional(), // base64-encoded audio chunk
    usage: _Usage_schema.optional(),
    timestampInfo: z.object({
      wordAlignment: z.unknown().optional(),
    }).optional(),
  });

  export const StreamChunk_schema = z.object({
    result: _StreamChunkResult_schema,
  });
  export type StreamChunk = z.infer<typeof StreamChunk_schema>;

}
|
||||
|
||||
/**
 * Wire schemas for the Inworld Voices listing endpoint.
 * API Reference: https://docs.inworld.ai/api-reference/voices/list-voices-in-a-workspace
 * Note: the workspace ID can be omitted from the path - it is derived from the API key.
 */
export namespace InworldWire_TTS_ListVoices {

  /** A single voice resource; only `voiceId` is required for parsing to succeed. */
  const _Voice_schema = z.object({
    name: z.string().optional(), // Resource name: workspaces/{workspace}/voices/{voice}
    langCode: z.string().optional(), // not restricted to known codes
    displayName: z.string().optional(), // Human-readable name (required in API, optional for parsing)
    description: z.string().optional(),
    tags: z.array(z.string()).optional(), // e.g., ['male', 'energetic', 'expressive']
    voiceId: z.string(), // Globally unique: {workspace}__{voice}
  });

  export const Response_schema = z.object({
    voices: z.array(_Voice_schema),
  });
  export type Response = z.infer<typeof Response_schema>;

}
|
||||
|
||||
|
||||
/**
 * Picks the Inworld model id used when the voice configuration does not pin one.
 *
 * - 'fast'    -> SPEEX_DEFAULTS.INWORLD_MODEL_FAST (lowest latency)
 * - 'quality' -> SPEEX_DEFAULTS.INWORLD_MODEL (highest quality)
 * - 'balanced' / undefined -> the fast model when `languageCode` equals 'en'
 *   (case-insensitive, exact match), the quality model otherwise
 */
function _selectModel(priority: 'fast' | 'balanced' | 'quality' | undefined, languageCode: string | undefined): string {
  switch (priority) {
    case 'fast':
      return SPEEX_DEFAULTS.INWORLD_MODEL_FAST;

    case 'quality':
      return SPEEX_DEFAULTS.INWORLD_MODEL;

    default: {
      const isEnglish = languageCode?.toLowerCase() === 'en';
      return isEnglish ? SPEEX_DEFAULTS.INWORLD_MODEL_FAST : SPEEX_DEFAULTS.INWORLD_MODEL;
    }
  }
}
|
||||
|
||||
|
||||
/**
 * Inworld synthesis backend: turns `text` into a stream of speech particles.
 *
 * Flow:
 *  1. Rejects mismatched access/voice dialects (throws).
 *  2. Truncates the text to SPEEX_DEFAULTS.INWORLD_TTS_MAX_LEN and emits an info 'log' particle
 *     when truncation happened.
 *  3. POSTs to '/tts/v1/voice:stream' (streaming) or '/tts/v1/voice' (non-streaming), always
 *     requesting MP3 at 48 kHz.
 *  4. Relays the audio particles produced by the streaming / whole-response readers.
 *
 * Fetch and decoding failures are reported as `{ t: 'error' }` particles rather than thrown.
 * A missing API key still throws, because `_inworldAccess` is called outside the try blocks.
 */
export const synthesizeInworld: SynthesizeBackendFn<SpeexWire_Access_Inworld> = async function* (params) {
  const { access, text: inputText, voice, streaming, languageCode, priority, signal } = params;
  if (access.dialect !== 'inworld' || voice.dialect !== 'inworld')
    throw new Error('Mismatched dialect in Inworld synthesize');

  // safety check: trim text that's too long (Inworld max is 2000 chars)
  // NOTE: the caller's chunker should ideally be aware of this limit
  const maxLen = SPEEX_DEFAULTS.INWORLD_TTS_MAX_LEN;
  let text = inputText;
  if (text.length > maxLen) {
    text = text.slice(0, maxLen);

    // never end on an unpaired high surrogate: cutting inside a surrogate pair would send
    // an ill-formed character to the API
    const lastUnit = text.charCodeAt(text.length - 1);
    if (lastUnit >= 0xD800 && lastUnit <= 0xDBFF)
      text = text.slice(0, -1);

    yield { t: 'log', level: 'info', message: `Text truncated to ${maxLen} characters (Inworld limit)` };
  }

  // request
  const { headers, url } = _inworldAccess(access, streaming ? '/tts/v1/voice:stream' : '/tts/v1/voice');
  const body: InworldWire_TTS_Synthesize.Request = {
    text,
    voiceId: voice.ttsVoiceId || SPEEX_DEFAULTS.INWORLD_VOICE,
    modelId: voice.ttsModel || _selectModel(priority, languageCode),
    ...(voice.ttsTemperature !== undefined && { temperature: voice.ttsTemperature }),
    audioConfig: {
      audioEncoding: 'MP3', // MP3 for browser MediaSource compatibility
      sampleRateHertz: 48000, // also default
      ...(voice.ttsSpeakingRate !== undefined && { speakingRate: voice.ttsSpeakingRate }),
    },
    // applyTextNormalization: ... // defaults to automatically detecting whether to apply text normalization
  };

  // fetch
  let response: Response;
  try {
    // the Authorization header is redacted so the API key never reaches the logs
    if (SPEEX_DEBUG) console.log(`[Speex][Inworld] POST (stream=${streaming})`, { url, headers: { ...headers, Authorization: '[REDACTED]' }, body });
    response = await fetchResponseOrTRPCThrow({ url, method: 'POST', headers, body, signal, name: 'Inworld' });
  } catch (error: unknown) {
    yield { t: 'error', e: `Inworld fetch failed: ${_inworldErrorMessage(error)}` };
    return;
  }

  // stream back S/NS response
  try {
    yield* streaming
      ? _streamInworldChunks(response, text.length)
      : _returnInworldWhole(response, text.length);
  } catch (error: unknown) {
    yield { t: 'error', e: `Inworld audio error: ${_inworldErrorMessage(error)}` };
  }
};

/** Extracts a printable message from any thrown value; never throws itself. */
function _inworldErrorMessage(error: unknown): string {
  if (typeof error === 'string' && error)
    return error;
  if (typeof error === 'object' && error !== null && 'message' in error) {
    const message: unknown = error.message;
    if (typeof message === 'string' && message)
      return message;
  }
  return 'Unknown error';
}
|
||||
|
||||
|
||||
/**
 * Processes a streaming Inworld response (newline-delimited JSON) into speech particles.
 *
 * Each non-empty line is JSON-parsed and validated against
 * `InworldWire_TTS_Synthesize.StreamChunk_schema`; every chunk carrying `result.audioContent`
 * is re-emitted as an 'audio' particle (base64, 'audio/mpeg'). Lines that are not valid JSON or
 * do not match the schema are skipped (logged only when SPEEX_DEBUG is set). After the stream
 * ends a single 'done' particle reports the character count and the approximate number of
 * decoded audio bytes.
 *
 * @param response fetch Response whose body is the NDJSON stream
 * @param textLength number of characters that were sent for synthesis (echoed in 'done')
 * @throws Error when the response has no body; read errors from the stream propagate
 */
async function* _streamInworldChunks(response: Response, textLength: number): AsyncGenerator<SpeexSpeechParticle> {
  if (!response.body) throw new Error('Inworld streaming response has no body');

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let buffer = '';
  let totalBytes = 0;
  let streamEnded = false;

  try {
    while (!streamEnded) {
      const { done, value } = await reader.read();
      if (value) buffer += decoder.decode(value, { stream: true });
      if (done) {
        streamEnded = true;
        // flush bytes still held by the decoder, then add a newline so the last line is processed
        buffer += decoder.decode() + '\n';
      }

      // JSON: process complete lines, keep the trailing incomplete line in the buffer
      const lines = buffer.split('\n');
      buffer = lines.pop() || '';

      for (const line of lines) {
        const trimmed = line.trim();
        if (!trimmed) continue;

        // Only JSON.parse is guarded: the yield below must stay outside of any swallowing
        // catch, otherwise errors thrown into the generator by the consumer would be lost
        let json: unknown;
        try {
          json = JSON.parse(trimmed);
        } catch {
          if (SPEEX_DEBUG) console.warn('[Speex][Inworld] Unparseable streaming line:', trimmed.slice(0, 100));
          continue;
        }

        const parseResult = InworldWire_TTS_Synthesize.StreamChunk_schema.safeParse(json);
        if (!parseResult.success) {
          if (SPEEX_DEBUG) console.warn('[Speex][Inworld] Invalid streaming chunk:', parseResult.error.message, trimmed.slice(0, 100));
          continue;
        }

        const { audioContent } = parseResult.data.result;
        if (!audioContent) continue;

        totalBytes += Math.ceil(audioContent.length * 3 / 4); // Approximate base64 decoded size
        yield {
          t: 'audio',
          chunk: true,
          base64: audioContent,
          contentType: 'audio/mpeg',
        };
      }
    }

    yield { t: 'done', chars: textLength, audioBytes: totalBytes };

  } finally {
    // Early exit (consumer stopped iterating, or an error): cancel the body so the underlying
    // stream is not left open; cancel may reject on an already-errored stream, which is ignored
    if (!streamEnded)
      await reader.cancel().catch(() => undefined);
    reader.releaseLock();
  }
}
|
||||
|
||||
/**
 * Handles the non-streaming case: the whole response is one JSON document holding the
 * base64-encoded audio. Emits exactly one 'audio' particle followed by one 'done' particle.
 * Throws when the body is not JSON or does not match `Response_schema`.
 */
async function* _returnInworldWhole(response: Response, textLength: number): AsyncGenerator<SpeexSpeechParticle> {
  const payload = await response.json();
  const { audioContent, usage } = InworldWire_TTS_Synthesize.Response_schema.parse(payload);

  yield {
    t: 'audio',
    chunk: false,
    base64: audioContent,
    contentType: 'audio/mpeg',
    characterCost: usage?.processedCharactersCount,
  };

  // decoded size is estimated from the base64 length (3 bytes per 4 characters)
  const estimatedBytes = Math.ceil(audioContent.length * 3 / 4);
  yield { t: 'done', chars: textLength, audioBytes: estimatedBytes };
}
|
||||
|
||||
|
||||
/**
 * Fetches the voices available to the given Inworld credentials.
 * API: GET /voices/v1/voices (workspace derived from API key)
 *
 * Each voice is mapped to `{ id, name, description, category }`, where `name` falls back to the
 * voice id and `category` is the comma-joined tag list. If the configured default voice
 * (SPEEX_DEFAULTS.INWORLD_VOICE) is absent from the listing, it is inserted at the front.
 */
export async function listVoicesInworld(access: SpeexWire_Access_Inworld): Promise<SpeexWire_ListVoices_Output> {
  const { headers, url } = _inworldAccess(access, '/voices/v1/voices');

  const rawListing = await fetchJsonOrTRPCThrow({ url, headers, name: 'Inworld' });
  const { voices: wireVoices } = InworldWire_TTS_ListVoices.Response_schema.parse(rawListing);

  const voices = wireVoices.map(({ voiceId, displayName, description, tags }) => ({
    id: voiceId,
    name: displayName || voiceId,
    description: description || undefined,
    category: tags?.join(', ') || undefined,
  }));

  // ensure default voice is in the list
  const defaultVoiceId = SPEEX_DEFAULTS.INWORLD_VOICE;
  const hasDefaultVoice = voices.some(({ id }) => id === defaultVoiceId);
  if (!hasDefaultVoice) {
    console.error(`[Speex][Inworld] Default voice "${defaultVoiceId}" not found in voice list, adding it manually.`);
    voices.unshift({
      id: defaultVoiceId,
      name: defaultVoiceId,
      description: 'Default voice',
      category: undefined,
    });
  }

  return { voices };
}
|
||||
|
||||
|
||||
// Helpers
|
||||
|
||||
/**
 * Builds the request URL and headers for an Inworld API call.
 *
 * @param access Inworld access descriptor; `apiKey` is required, `apiHost` is optional
 * @param apiPath endpoint path, e.g. '/tts/v1/voice'
 * @returns the headers (Basic authorization + JSON content type) and the absolute URL
 * @throws Error when the API key is missing or blank
 *
 * Host handling:
 *  - a missing or blank `apiHost` falls back to 'api.inworld.ai'
 *  - 'https://' is prepended unless the host already starts with an explicit http(s):// scheme
 *    (case-insensitive), so hosts such as 'httpbin.example.com' are handled correctly
 *  - host and path are always joined with exactly one '/'
 */
function _inworldAccess(access: SpeexWire_Access_Inworld, apiPath: string): { headers: HeadersInit; url: string } {
  const apiKey = (access.apiKey || '').trim();
  if (!apiKey)
    throw new Error('Missing Inworld API key');

  // trim before defaulting, so that a whitespace-only host still selects the default
  let host = (access.apiHost || '').trim() || 'api.inworld.ai';
  if (!/^https?:\/\//i.test(host))
    host = `https://${host}`;

  // join with a single slash regardless of how host ends / path begins
  const base = host.replace(/\/+$/, '');
  const path = apiPath.startsWith('/') ? apiPath : `/${apiPath}`;

  return {
    headers: {
      'Authorization': `Basic ${apiKey}`, // Inworld API key is already base64-encoded
      'Content-Type': 'application/json',
    },
    url: base + path,
  };
}
|
||||
@@ -10,6 +10,7 @@
|
||||
|
||||
import type { SpeexSpeechParticle, SpeexWire_Access, SpeexWire_ListVoices_Output, SpeexWire_Synthesize_Input, SpeexWire_Voice } from './rpc.wiretypes';
|
||||
import { listVoicesElevenLabs, synthesizeElevenLabs } from './synthesize-elevenlabs';
|
||||
import { listVoicesInworld, synthesizeInworld } from './synthesize-inworld';
|
||||
import { listVoicesLocalAIOrThrow, listVoicesOpenAI, synthesizeOpenAIProtocol } from './synthesize-openai';
|
||||
|
||||
|
||||
@@ -46,6 +47,10 @@ export async function* speexRpcCoreSynthesize(input: SpeexWire_Synthesize_Input,
|
||||
yield* synthesizeElevenLabs({ access, text, voice, streaming, languageCode, priority, signal });
|
||||
break;
|
||||
|
||||
case 'inworld':
|
||||
yield* synthesizeInworld({ access, text, voice, streaming, languageCode, priority, signal });
|
||||
break;
|
||||
|
||||
case 'localai':
|
||||
case 'openai':
|
||||
yield* synthesizeOpenAIProtocol({ access, text, voice, streaming, languageCode, priority, signal });
|
||||
@@ -68,6 +73,9 @@ export async function speexRpcCoreListVoices(access: SpeexWire_Access): Promise<
|
||||
case 'elevenlabs':
|
||||
return await listVoicesElevenLabs(access);
|
||||
|
||||
case 'inworld':
|
||||
return await listVoicesInworld(access);
|
||||
|
||||
case 'openai':
|
||||
return { voices: listVoicesOpenAI() };
|
||||
|
||||
|
||||
@@ -191,6 +191,7 @@ export function speakRawText_withHandle(
|
||||
switch (effectiveEngine.vendorType) {
|
||||
// RPC providers: route through speex.router RPC
|
||||
case 'elevenlabs':
|
||||
case 'inworld':
|
||||
case 'openai':
|
||||
case 'localai': {
|
||||
|
||||
|
||||
@@ -22,4 +22,10 @@ export const SPEEX_DEFAULTS = {
|
||||
// LocalAI - kokoro is a high-quality neural TTS
|
||||
LOCALAI_MODEL: 'kokoro',
|
||||
|
||||
// Inworld - high-quality, low-latency TTS with voice cloning
|
||||
INWORLD_MODEL: 'inworld-tts-1.5-max', // best quality (~200ms latency, $10/1M chars)
|
||||
INWORLD_MODEL_FAST: 'inworld-tts-1.5-mini', // fastest (<100ms latency, $5/1M chars)
|
||||
INWORLD_VOICE: 'Alex', // default voice
|
||||
INWORLD_TTS_MAX_LEN: 2000, // max chars per TTS request - as of 2026-01-27 it's 2000
|
||||
|
||||
} as const;
|
||||
@@ -7,7 +7,7 @@ import type { SpeexWire_VoiceOption } from './protocols/rpc/rpc.wiretypes';
|
||||
|
||||
// Speex Vendor Types (supported TTS providers)
|
||||
|
||||
export type DSpeexVendorType = 'elevenlabs' | 'localai' | 'openai' | 'webspeech';
|
||||
export type DSpeexVendorType = 'elevenlabs' | 'inworld' | 'localai' | 'openai' | 'webspeech';
|
||||
|
||||
|
||||
// Speex Engines - instances of TTS Vendors Types - persisted in store-module-speex
|
||||
@@ -33,6 +33,7 @@ export type SpeexEngineId = string; // agiUuidV4('speex.engine.instance')
|
||||
// helper for mapping credentials and voice types to the engine type
|
||||
interface _TypeMap extends Record<DSpeexVendorType, { voice: unknown; credentials: unknown }> {
|
||||
'elevenlabs': { voice: DVoiceElevenLabs; credentials: DCredentialsApiKey };
|
||||
'inworld': { voice: DVoiceInworld; credentials: DCredentialsApiKey };
|
||||
'localai': { voice: DVoiceLocalAI; credentials: DCredentialsLLMSService | DCredentialsApiKey };
|
||||
'openai': { voice: DVoiceOpenAI; credentials: DCredentialsLLMSService | DCredentialsApiKey };
|
||||
'webspeech': { voice: DVoiceWebSpeech; credentials: DCredentialsNone };
|
||||
@@ -55,6 +56,14 @@ export interface DVoiceElevenLabs {
|
||||
// ttsS?: boolean;
|
||||
}
|
||||
|
||||
export interface DVoiceInworld {
|
||||
dialect: 'inworld';
|
||||
ttsModel?: 'inworld-tts-1.5-max' | 'inworld-tts-1.5-mini';
|
||||
ttsVoiceId?: string; // e.g., 'Alex', 'Ashley', 'Dennis'
|
||||
ttsTemperature?: number; // 0-2, default 1.1 (controls expressiveness)
|
||||
ttsSpeakingRate?: number; // 0.5-1.5, default 1.0
|
||||
}
|
||||
|
||||
// type LocalAITTSBackend = | 'coqui' | 'bark' | 'piper' | 'transformers-musicgen' | 'vall-e-x'
|
||||
export interface DVoiceLocalAI {
|
||||
dialect: 'localai';
|
||||
|
||||
@@ -3,8 +3,9 @@ import type { ModelVendorId } from '~/modules/llms/vendors/vendors.registry';
|
||||
import type { DSpeexEngineAny, DSpeexVendorType } from './speex.types';
|
||||
import type { ISpeexVendor, ISpeexVendorAny } from './ISpeexVendor';
|
||||
|
||||
// vendor imports (will be implemented as stubs initially)
|
||||
// vendor imports
|
||||
import { SpeexVendorElevenLabs } from './vendors/elevenlabs.vendor';
|
||||
import { SpeexVendorInworld } from './vendors/inworld.vendor';
|
||||
import { SpeexVendorLocalAI } from './vendors/localai.vendor';
|
||||
import { SpeexVendorOpenAI } from './vendors/openai.vendor';
|
||||
import { SpeexVendorWebSpeech } from './vendors/webspeech.vendor';
|
||||
@@ -14,6 +15,7 @@ import { SpeexVendorWebSpeech } from './vendors/webspeech.vendor';
|
||||
|
||||
const _SPEEX_VENDOR_REGISTRY: { [key in DSpeexVendorType]: ISpeexVendor<key> } = {
|
||||
elevenlabs: SpeexVendorElevenLabs,
|
||||
inworld: SpeexVendorInworld,
|
||||
localai: SpeexVendorLocalAI,
|
||||
openai: SpeexVendorOpenAI,
|
||||
webspeech: SpeexVendorWebSpeech,
|
||||
|
||||
+31
@@ -0,0 +1,31 @@
|
||||
import type { ISpeexVendor } from '../ISpeexVendor';
|
||||
import { SPEEX_DEFAULTS } from '../speex.config';
|
||||
|
||||
|
||||
/**
 * Speex vendor descriptor for Inworld: a cloud TTS provider reached through the RPC protocol,
 * authenticated with an API key.
 */
export const SpeexVendorInworld: ISpeexVendor<'inworld'> = {
  vendorType: 'inworld',
  name: 'Inworld',
  protocol: 'rpc',
  location: 'cloud',
  priority: 15, // between ElevenLabs (10) and LocalAI (20)

  // not derived from any LLM vendor configuration
  autoFromLlmVendorIds: undefined,

  capabilities: {
    streaming: true,
    voiceListing: true,
    speedControl: true,
    pitchControl: false,
  },

  /** Fresh credentials: API-key type with an empty key. */
  getDefaultCredentials: () => {
    return {
      type: 'api-key',
      apiKey: '',
    };
  },

  /** Fresh voice: the default Inworld model and voice from SPEEX_DEFAULTS. */
  getDefaultVoice: () => {
    return {
      dialect: 'inworld',
      ttsModel: SPEEX_DEFAULTS.INWORLD_MODEL,
      ttsVoiceId: SPEEX_DEFAULTS.INWORLD_VOICE,
    };
  },
};
|
||||
Reference in New Issue
Block a user