Speex: +Inworld

This commit is contained in:
Enrico Ros
2026-01-27 23:49:14 -08:00
parent 91539346ee
commit bae691e33e
9 changed files with 387 additions and 8 deletions
+21 -4
View File
@@ -37,7 +37,7 @@ async function _getSpeexCsfModule() {
// --- /CSF
type _DSpeexEngineRPC = DSpeexEngine<'elevenlabs'> | DSpeexEngine<'localai'> | DSpeexEngine<'openai'>;
type _DSpeexEngineRPC = DSpeexEngine<'elevenlabs'> | DSpeexEngine<'inworld'> | DSpeexEngine<'localai'> | DSpeexEngine<'openai'>;
/**
@@ -177,8 +177,9 @@ function _buildRPCWireAccess({ credentials: c, vendorType }: _DSpeexEngineRPC):
case 'api-key':
switch (vendorType) {
case 'elevenlabs':
case 'inworld':
return {
dialect: 'elevenlabs',
dialect: vendorType,
apiKey: c.apiKey,
...(c.apiHost && { apiHost: c.apiHost }),
};
@@ -203,7 +204,8 @@ function _buildRPCWireAccess({ credentials: c, vendorType }: _DSpeexEngineRPC):
if (!service) return null;
switch (vendorType) {
case 'elevenlabs':
// no linking for ElevenLabs - we shall NOT be here
case 'inworld':
// no linking for ElevenLabs or Inworld - we shall NOT be here
return null;
case 'openai':
@@ -240,7 +242,22 @@ function _shouldUseCSF({ credentials: c, vendorType }: _DSpeexEngineRPC): boolea
switch (c.type) {
case 'api-key':
// Auto-enable CSF for local URLs (LocalAI typically runs locally)
return vendorType === 'localai' && isLocalUrl(c.apiHost);
switch (vendorType) {
case 'inworld':
return false; // Inworld has blocked CORS policy - never CSF
case 'localai':
return isLocalUrl(c.apiHost);
default:
const _exhaustiveCheck: never = vendorType;
// fallthrough
case 'elevenlabs':
case 'openai':
break;
}
// elevenlabs/openai api-key access falls through the switch above: no CSF
return false;
case 'llms-service':
const service = findModelsServiceOrNull(c.serviceId);
@@ -15,6 +15,7 @@ export type SpeexSpeechParticle =
export type SpeexWire_Access = z.infer<typeof SpeexWire.Access_schema>;
export type SpeexWire_Access_ElevenLabs = z.infer<typeof SpeexWire.AccessElevenLabs_schema>;
export type SpeexWire_Access_Inworld = z.infer<typeof SpeexWire.AccessInworld_schema>;
export type SpeexWire_Access_OpenAI = z.infer<typeof SpeexWire.AccessOpenAI_schema>;
export type SpeexWire_Voice = z.infer<typeof SpeexWire.Voice_schema>;
@@ -39,6 +40,12 @@ export namespace SpeexWire {
apiHost: z.string().optional(),
});
export const AccessInworld_schema = z.object({
dialect: z.literal('inworld'),
apiKey: z.string(), // base64-encoded API key from Inworld Portal
apiHost: z.string().optional(), // defaults to api.inworld.ai
});
export const AccessOpenAI_schema = z.object({
dialect: z.enum(['localai', 'openai']),
apiKey: z.string().optional(), // openai: required, localai: optional
@@ -47,7 +54,7 @@ export namespace SpeexWire {
});
export const Access_schema = z.discriminatedUnion('dialect',
[AccessElevenLabs_schema, AccessOpenAI_schema],
[AccessElevenLabs_schema, AccessInworld_schema, AccessOpenAI_schema],
);
@@ -59,6 +66,14 @@ export namespace SpeexWire {
ttsVoiceId: z.string().optional(),
});
export const VoiceInworld_schema = z.object({
dialect: z.literal('inworld'),
ttsModel: z.enum(['inworld-tts-1.5-max', 'inworld-tts-1.5-mini']).optional(),
ttsVoiceId: z.string().optional(),
ttsTemperature: z.number().min(0).max(2).optional(),
ttsSpeakingRate: z.number().min(0.5).max(1.5).optional(),
});
export const VoiceLocalAI_schema = z.object({
dialect: z.literal('localai'),
ttsBackend: z.string().optional(), // e.g., 'coqui', 'bark', 'piper', 'vall-e-x'
@@ -75,7 +90,7 @@ export namespace SpeexWire {
});
export const Voice_schema = z.discriminatedUnion('dialect',
[VoiceElevenLabs_schema, VoiceLocalAI_schema, VoiceOpenAI_schema],
[VoiceElevenLabs_schema, VoiceInworld_schema, VoiceLocalAI_schema, VoiceOpenAI_schema],
);
@@ -0,0 +1,290 @@
/**
* Inworld AI TTS Synthesizer
*
* Implements Inworld's Text-to-Speech API:
* - Non-streaming: POST /tts/v1/voice
* - Streaming: POST /tts/v1/voice:stream (newline-delimited JSON)
* - Authentication: Basic auth with base64-encoded API key
*
* API Reference: https://docs.inworld.ai/api-reference/ttsAPI/texttospeech/synthesize-speech
*/
import * as z from 'zod/v4';
import { fetchJsonOrTRPCThrow, fetchResponseOrTRPCThrow } from '~/server/trpc/trpc.router.fetchers';
import type { SpeexSpeechParticle, SpeexWire_Access_Inworld, SpeexWire_ListVoices_Output } from './rpc.wiretypes';
import type { SynthesizeBackendFn } from './synthesize.core';
import { SPEEX_DEBUG, SPEEX_DEFAULTS } from '../../speex.config';
/**
 * Wire schemas for Inworld's TTS synthesize endpoint: the request body, the
 * non-streaming response, and the newline-delimited streaming chunks.
 */
export namespace InworldWire_TTS_Synthesize {

  /// Request Schema
  // API Reference: https://docs.inworld.ai/api-reference/ttsAPI/texttospeech/synthesize-speech
  // Note: Request schemas are intentionally loose - server validates constraints

  // audio output options; the `.or(z.string())` escape hatches keep us forward-compatible with values added server-side
  const _AudioConfig_schema = z.object({
    audioEncoding: z.enum(['LINEAR16', 'MP3', 'OGG_OPUS', 'ALAW', 'MULAW', 'FLAC']).or(z.string()).optional(),
    sampleRateHertz: z.number().optional(), // 8000-48000 Hz
    speakingRate: z.number().optional(), // 0.5x to 1.5x speed
    bitRate: z.number().optional(), // for compressed formats
  });

  export type Request = z.infer<typeof Request_schema>;
  export const Request_schema = z.object({
    text: z.string(), // max 2000 chars per request
    voiceId: z.string(),
    modelId: z.string(), // e.g., 'inworld-tts-1.5-max', 'inworld-tts-1.5-mini'
    temperature: z.number().optional(), // 0-2, default 1.1
    applyTextNormalization: z.enum(['ON', 'OFF', 'UNSPECIFIED']).or(z.string()).optional(),
    audioConfig: _AudioConfig_schema.optional(),
    timestampType: z.enum(['WORD', 'CHARACTER', 'UNSPECIFIED']).or(z.string()).optional(),
  });

  /// Response Schema (non-streaming)

  // billing/accounting info attached to responses
  const _Usage_schema = z.object({
    processedCharactersCount: z.number().optional(),
    modelId: z.string().optional(),
  });

  export type Response = z.infer<typeof Response_schema>;
  export const Response_schema = z.object({
    audioContent: z.string(), // base64-encoded audio (max 16MB)
    usage: _Usage_schema.optional(),
    timestampInfo: z.object({
      wordAlignment: z.unknown().optional(),
      characterAlignment: z.unknown().optional(),
    }).optional(),
    phoneticDetails: z.array(z.unknown()).optional(), // TTS 1.5 models only
  });

  /// Streaming Chunk Schema (newline-delimited JSON, wrapped in "result")

  const _StreamChunkResult_schema = z.object({
    audioContent: z.string().optional(), // base64-encoded audio chunk
    usage: _Usage_schema.optional(),
    timestampInfo: z.object({
      wordAlignment: z.unknown().optional(),
    }).optional(),
  });

  export type StreamChunk = z.infer<typeof StreamChunk_schema>;
  export const StreamChunk_schema = z.object({
    result: _StreamChunkResult_schema,
  });

}
/**
 * Wire schemas for Inworld's voice-listing endpoint (GET /voices/v1/voices).
 */
export namespace InworldWire_TTS_ListVoices {

  // Voice resource from Voices API
  // API Reference: https://docs.inworld.ai/api-reference/voices/list-voices-in-a-workspace
  // Note: Workspace ID can be omitted from path - derived from API key
  const _Voice_schema = z.object({
    name: z.string().optional(), // Resource name: workspaces/{workspace}/voices/{voice}
    langCode: z.string().optional(), // we won't restrict to known codes
    displayName: z.string().optional(), // Human-readable name (required in API, optional for parsing)
    description: z.string().optional(),
    tags: z.array(z.string()).optional(), // e.g., ['male', 'energetic', 'expressive']
    voiceId: z.string(), // Globally unique: {workspace}__{voice}
  });

  export type Response = z.infer<typeof Response_schema>;
  export const Response_schema = z.object({
    voices: z.array(_Voice_schema),
  });

}
/**
 * Picks the Inworld TTS model from the caller's priority hint.
 * - 'fast' -> mini model (lowest latency)
 * - 'quality' -> max model (highest quality)
 * - 'balanced'/undefined -> fast for English, quality for everything else
 */
function _selectModel(priority: 'fast' | 'balanced' | 'quality' | undefined, languageCode: string | undefined): string {
  switch (priority) {
    case 'fast':
      return SPEEX_DEFAULTS.INWORLD_MODEL_FAST; // lowest latency
    case 'quality':
      return SPEEX_DEFAULTS.INWORLD_MODEL; // highest quality
    default: {
      // 'balanced' (or unspecified): English gets the fast model, other languages the quality one
      const isEnglish = languageCode?.toLowerCase() === 'en';
      return isEnglish ? SPEEX_DEFAULTS.INWORLD_MODEL_FAST : SPEEX_DEFAULTS.INWORLD_MODEL;
    }
  }
}
/**
 * Synthesize speech through Inworld's TTS API, as an async generator of speech particles.
 *
 * Truncates over-long text (Inworld caps requests at 2000 chars), POSTs either the
 * streaming (`:stream`, NDJSON) or the whole-audio endpoint, and relays the audio
 * particles produced by the respective response handler. Fetch and audio failures
 * are reported as 'error' particles rather than thrown.
 */
export const synthesizeInworld: SynthesizeBackendFn<SpeexWire_Access_Inworld> = async function* (params) {
  const { access, text: inputText, voice, streaming, languageCode, priority, signal } = params;

  if (access.dialect !== 'inworld' || voice.dialect !== 'inworld')
    throw new Error('Mismatched dialect in Inworld synthesize');

  // safety check: trim text that's too long (Inworld max is 2000 chars)
  // NOTE: we shall make sure the caller 'chunker' is aware of the 2000 max
  const maxLen = SPEEX_DEFAULTS.INWORLD_TTS_MAX_LEN;
  const overLimit = inputText.length > maxLen;
  const text = overLimit ? inputText.slice(0, maxLen) : inputText;
  if (overLimit)
    yield { t: 'log', level: 'info', message: `Text truncated to ${SPEEX_DEFAULTS.INWORLD_TTS_MAX_LEN} characters (Inworld limit)` };

  // build the request: auth/url + JSON body
  const apiPath = streaming ? '/tts/v1/voice:stream' : '/tts/v1/voice';
  const { headers, url } = _inworldAccess(access, apiPath);
  const body: InworldWire_TTS_Synthesize.Request = {
    text,
    voiceId: voice.ttsVoiceId || SPEEX_DEFAULTS.INWORLD_VOICE,
    modelId: voice.ttsModel || _selectModel(priority, languageCode),
    ...(voice.ttsTemperature !== undefined && { temperature: voice.ttsTemperature }),
    audioConfig: {
      audioEncoding: 'MP3', // MP3 for browser MediaSource compatibility
      sampleRateHertz: 48000, // also the service default
      ...(voice.ttsSpeakingRate !== undefined && { speakingRate: voice.ttsSpeakingRate }),
    },
    // applyTextNormalization: ... // defaults to automatically detecting whether to apply text normalization
  };

  // fetch - any failure here is surfaced as an 'error' particle and ends the generator
  let response: Response;
  try {
    if (SPEEX_DEBUG) console.log(`[Speex][Inworld] POST (stream=${streaming})`, { url, headers: { ...headers, Authorization: '[REDACTED]' }, body });
    response = await fetchResponseOrTRPCThrow({ url, method: 'POST', headers, body, signal, name: 'Inworld' });
  } catch (error: any) {
    yield { t: 'error', e: `Inworld fetch failed: ${error.message || 'Unknown error'}` };
    return;
  }

  // relay the streaming or whole-audio particles
  try {
    if (streaming)
      yield* _streamInworldChunks(response, text.length);
    else
      yield* _returnInworldWhole(response, text.length);
  } catch (error: any) {
    yield { t: 'error', e: `Inworld audio error: ${error.message || 'Unknown error'}` };
  }
};
/**
 * Process a streaming TTS response: newline-delimited JSON, each line wrapping a
 * base64 audio fragment under "result". Yields one 'audio' particle per fragment
 * and a final 'done' particle with character/byte accounting.
 *
 * FIX: on stream end, the decoder is flushed (`decoder.decode()` without
 * `stream: true`) before processing the remaining buffer — bytes of an
 * incomplete multi-byte UTF-8 sequence buffered by the streaming decoder were
 * previously dropped silently.
 */
async function* _streamInworldChunks(response: Response, textLength: number): AsyncGenerator<SpeexSpeechParticle> {
  if (!response.body) throw new Error('Inworld streaming response has no body');

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let buffer = '';
  let totalBytes = 0;

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (value) buffer += decoder.decode(value, { stream: true });
      // on stream end: flush any bytes the decoder buffered, and add a newline so the final line is processed below
      if (done) buffer += decoder.decode() + '\n';

      // JSON: process complete lines
      const lines = buffer.split('\n');
      buffer = lines.pop() || ''; // Keep incomplete line in buffer
      for (const line of lines) {
        const trimmed = line.trim();
        if (!trimmed) continue;
        try {
          const parseResult = InworldWire_TTS_Synthesize.StreamChunk_schema.safeParse(JSON.parse(trimmed));
          if (!parseResult.success) {
            if (SPEEX_DEBUG) console.warn('[Speex][Inworld] Invalid streaming chunk:', parseResult.error.message, trimmed.slice(0, 100));
            continue;
          }
          const { result } = parseResult.data;
          if (result.audioContent) {
            const audioBytes = Math.ceil(result.audioContent.length * 3 / 4); // Approximate base64 decoded size (ignores '=' padding)
            totalBytes += audioBytes;
            yield {
              t: 'audio',
              chunk: true,
              base64: result.audioContent,
              contentType: 'audio/mpeg',
            };
          }
        } catch {
          // Ignore parse errors (partial/malformed chunks)
        }
      }
      if (done) break;
    }
    yield { t: 'done', chars: textLength, audioBytes: totalBytes };
  } finally {
    // always release the lock, even if the consumer abandons the generator early
    reader.releaseLock();
  }
}
/**
 * Process a non-streaming TTS response: one JSON document carrying the full
 * base64 audio. Emits a single whole-audio particle, then a 'done' particle
 * with character/byte accounting.
 */
async function* _returnInworldWhole(response: Response, textLength: number): AsyncGenerator<SpeexSpeechParticle> {
  const payload = await response.json();
  const parsed = InworldWire_TTS_Synthesize.Response_schema.parse(payload);

  // approximate decoded size from the base64 length
  const approxBytes = Math.ceil(parsed.audioContent.length * 3 / 4);

  yield {
    t: 'audio',
    chunk: false,
    base64: parsed.audioContent,
    contentType: 'audio/mpeg',
    characterCost: parsed.usage?.processedCharactersCount,
  };
  yield { t: 'done', chars: textLength, audioBytes: approxBytes };
}
/**
 * List available voices from Inworld.
 * API: GET /voices/v1/voices (workspace derived from API key)
 *
 * Maps wire voices into Speex voice options and guarantees the configured
 * default voice is present in the returned list.
 */
export async function listVoicesInworld(access: SpeexWire_Access_Inworld): Promise<SpeexWire_ListVoices_Output> {
  const { headers, url } = _inworldAccess(access, '/voices/v1/voices');

  const json = await fetchJsonOrTRPCThrow({ url, headers, name: 'Inworld' });
  const { voices: wireVoices } = InworldWire_TTS_ListVoices.Response_schema.parse(json);

  // wire voice -> Speex voice option
  const voices = wireVoices.map(({ voiceId, displayName, description, tags }) => ({
    id: voiceId,
    name: displayName || voiceId,
    description: description || undefined,
    category: tags?.join(', ') || undefined,
  }));

  // ensure default voice is in the list
  const defaultVoiceId = SPEEX_DEFAULTS.INWORLD_VOICE;
  if (!voices.some(v => v.id === defaultVoiceId)) {
    console.error(`[Speex][Inworld] Default voice "${defaultVoiceId}" not found in voice list, adding it manually.`);
    voices.unshift({
      id: defaultVoiceId,
      name: defaultVoiceId,
      description: 'Default voice',
      category: undefined,
    });
  }

  return { voices };
}
// Helpers
/**
 * Build the auth headers and full URL for an Inworld API call.
 *
 * @param access validated Inworld credentials (apiKey required, apiHost optional)
 * @param apiPath absolute API path, e.g. '/tts/v1/voice'
 * @throws Error when the API key is missing/blank
 */
function _inworldAccess(access: SpeexWire_Access_Inworld, apiPath: string): { headers: HeadersInit; url: string } {
  const apiKey = (access.apiKey || '').trim();
  if (!apiKey)
    throw new Error('Missing Inworld API key');

  // normalize the host: apply the default, ensure a scheme, avoid '//' at the join point
  const rawHost = (access.apiHost || 'api.inworld.ai').trim();
  const withScheme = rawHost.startsWith('http') ? rawHost : `https://${rawHost}`;
  const base = (withScheme.endsWith('/') && apiPath.startsWith('/')) ? withScheme.slice(0, -1) : withScheme;

  const headers: HeadersInit = {
    'Authorization': `Basic ${apiKey}`, // Inworld API key is already base64-encoded
    'Content-Type': 'application/json',
  };
  return { headers, url: base + apiPath };
}
@@ -10,6 +10,7 @@
import type { SpeexSpeechParticle, SpeexWire_Access, SpeexWire_ListVoices_Output, SpeexWire_Synthesize_Input, SpeexWire_Voice } from './rpc.wiretypes';
import { listVoicesElevenLabs, synthesizeElevenLabs } from './synthesize-elevenlabs';
import { listVoicesInworld, synthesizeInworld } from './synthesize-inworld';
import { listVoicesLocalAIOrThrow, listVoicesOpenAI, synthesizeOpenAIProtocol } from './synthesize-openai';
@@ -46,6 +47,10 @@ export async function* speexRpcCoreSynthesize(input: SpeexWire_Synthesize_Input,
yield* synthesizeElevenLabs({ access, text, voice, streaming, languageCode, priority, signal });
break;
case 'inworld':
yield* synthesizeInworld({ access, text, voice, streaming, languageCode, priority, signal });
break;
case 'localai':
case 'openai':
yield* synthesizeOpenAIProtocol({ access, text, voice, streaming, languageCode, priority, signal });
@@ -68,6 +73,9 @@ export async function speexRpcCoreListVoices(access: SpeexWire_Access): Promise<
case 'elevenlabs':
return await listVoicesElevenLabs(access);
case 'inworld':
return await listVoicesInworld(access);
case 'openai':
return { voices: listVoicesOpenAI() };
+1
View File
@@ -191,6 +191,7 @@ export function speakRawText_withHandle(
switch (effectiveEngine.vendorType) {
// RPC providers: route through speex.router RPC
case 'elevenlabs':
case 'inworld':
case 'openai':
case 'localai': {
+6
View File
@@ -22,4 +22,10 @@ export const SPEEX_DEFAULTS = {
// LocalAI - kokoro is a high-quality neural TTS
LOCALAI_MODEL: 'kokoro',
// Inworld - high-quality, low-latency TTS with voice cloning
INWORLD_MODEL: 'inworld-tts-1.5-max', // best quality (~200ms latency, $10/1M chars)
INWORLD_MODEL_FAST: 'inworld-tts-1.5-mini', // fastest (<100ms latency, $5/1M chars)
INWORLD_VOICE: 'Alex', // default voice
INWORLD_TTS_MAX_LEN: 2000, // max chars per TTS request - as of 2026-01-27 it's 2000
} as const;
+10 -1
View File
@@ -7,7 +7,7 @@ import type { SpeexWire_VoiceOption } from './protocols/rpc/rpc.wiretypes';
// Speex Vendor Types (supported TTS providers)
export type DSpeexVendorType = 'elevenlabs' | 'localai' | 'openai' | 'webspeech';
export type DSpeexVendorType = 'elevenlabs' | 'inworld' | 'localai' | 'openai' | 'webspeech';
// Speex Engines - instances of TTS Vendors Types - persisted in store-module-speex
@@ -33,6 +33,7 @@ export type SpeexEngineId = string; // agiUuidV4('speex.engine.instance')
// helper for mapping credentials and voice types to the engine type
interface _TypeMap extends Record<DSpeexVendorType, { voice: unknown; credentials: unknown }> {
'elevenlabs': { voice: DVoiceElevenLabs; credentials: DCredentialsApiKey };
'inworld': { voice: DVoiceInworld; credentials: DCredentialsApiKey };
'localai': { voice: DVoiceLocalAI; credentials: DCredentialsLLMSService | DCredentialsApiKey };
'openai': { voice: DVoiceOpenAI; credentials: DCredentialsLLMSService | DCredentialsApiKey };
'webspeech': { voice: DVoiceWebSpeech; credentials: DCredentialsNone };
@@ -55,6 +56,14 @@ export interface DVoiceElevenLabs {
// ttsS?: boolean;
}
/** Voice configuration for the Inworld TTS vendor — persisted with the engine instance. */
export interface DVoiceInworld {
  dialect: 'inworld';
  ttsModel?: 'inworld-tts-1.5-max' | 'inworld-tts-1.5-mini'; // quality vs fast model
  ttsVoiceId?: string; // e.g., 'Alex', 'Ashley', 'Dennis'
  ttsTemperature?: number; // 0-2, default 1.1 (controls expressiveness)
  ttsSpeakingRate?: number; // 0.5-1.5, default 1.0
}
// type LocalAITTSBackend = | 'coqui' | 'bark' | 'piper' | 'transformers-musicgen' | 'vall-e-x'
export interface DVoiceLocalAI {
dialect: 'localai';
+3 -1
View File
@@ -3,8 +3,9 @@ import type { ModelVendorId } from '~/modules/llms/vendors/vendors.registry';
import type { DSpeexEngineAny, DSpeexVendorType } from './speex.types';
import type { ISpeexVendor, ISpeexVendorAny } from './ISpeexVendor';
// vendor imports (will be implemented as stubs initially)
// vendor imports
import { SpeexVendorElevenLabs } from './vendors/elevenlabs.vendor';
import { SpeexVendorInworld } from './vendors/inworld.vendor';
import { SpeexVendorLocalAI } from './vendors/localai.vendor';
import { SpeexVendorOpenAI } from './vendors/openai.vendor';
import { SpeexVendorWebSpeech } from './vendors/webspeech.vendor';
@@ -14,6 +15,7 @@ import { SpeexVendorWebSpeech } from './vendors/webspeech.vendor';
const _SPEEX_VENDOR_REGISTRY: { [key in DSpeexVendorType]: ISpeexVendor<key> } = {
elevenlabs: SpeexVendorElevenLabs,
inworld: SpeexVendorInworld,
localai: SpeexVendorLocalAI,
openai: SpeexVendorOpenAI,
webspeech: SpeexVendorWebSpeech,
+31
View File
@@ -0,0 +1,31 @@
import type { ISpeexVendor } from '../ISpeexVendor';
import { SPEEX_DEFAULTS } from '../speex.config';
/**
 * Inworld AI TTS vendor descriptor, registered in the Speex vendor registry.
 * Cloud provider routed through the speex RPC protocol, authenticated by API key only.
 */
export const SpeexVendorInworld: ISpeexVendor<'inworld'> = {
  vendorType: 'inworld',
  name: 'Inworld',
  protocol: 'rpc', // synthesis happens server-side through the speex RPC router
  location: 'cloud',
  priority: 15, // between ElevenLabs (10) and LocalAI (20)
  autoFromLlmVendorIds: undefined, // no auto-configuration from LLM services — Inworld needs its own key
  capabilities: {
    streaming: true, // NDJSON chunked audio
    voiceListing: true, // via the Voices API
    speedControl: true, // ttsSpeakingRate
    pitchControl: false,
  },
  // credentials start empty; the user pastes the (base64) Portal API key
  getDefaultCredentials: () => ({
    type: 'api-key',
    apiKey: '',
  }),
  // default to the quality model and the stock default voice
  getDefaultVoice: () => ({
    dialect: 'inworld',
    ttsModel: SPEEX_DEFAULTS.INWORLD_MODEL,
    ttsVoiceId: SPEEX_DEFAULTS.INWORLD_VOICE,
  }),
};