diff --git a/src/modules/speex/protocols/rpc/rpc.client.ts b/src/modules/speex/protocols/rpc/rpc.client.ts index 3a19ef33e..eb7b4383e 100644 --- a/src/modules/speex/protocols/rpc/rpc.client.ts +++ b/src/modules/speex/protocols/rpc/rpc.client.ts @@ -37,7 +37,7 @@ async function _getSpeexCsfModule() { // --- /CSF -type _DSpeexEngineRPC = DSpeexEngine<'elevenlabs'> | DSpeexEngine<'localai'> | DSpeexEngine<'openai'>; +type _DSpeexEngineRPC = DSpeexEngine<'elevenlabs'> | DSpeexEngine<'inworld'> | DSpeexEngine<'localai'> | DSpeexEngine<'openai'>; /** @@ -177,8 +177,9 @@ function _buildRPCWireAccess({ credentials: c, vendorType }: _DSpeexEngineRPC): case 'api-key': switch (vendorType) { case 'elevenlabs': + case 'inworld': return { - dialect: 'elevenlabs', + dialect: vendorType, apiKey: c.apiKey, ...(c.apiHost && { apiHost: c.apiHost }), }; @@ -203,7 +204,8 @@ function _buildRPCWireAccess({ credentials: c, vendorType }: _DSpeexEngineRPC): if (!service) return null; switch (vendorType) { case 'elevenlabs': - // no linking for ElevenLabs - we shall NOT be here + case 'inworld': + // no linking for ElevenLabs or Inworld - we shall NOT be here return null; case 'openai': @@ -240,7 +242,22 @@ function _shouldUseCSF({ credentials: c, vendorType }: _DSpeexEngineRPC): boolea switch (c.type) { case 'api-key': // Auto-enable CSF for local URLs (LocalAI typically runs locally) - return vendorType === 'localai' && isLocalUrl(c.apiHost); + switch (vendorType) { + case 'inworld': + return false; // Inworld has blocked CORS policy - never CSF + + case 'localai': + return isLocalUrl(c.apiHost); + + default: + const _exhaustiveCheck: never = vendorType; + // fallthrough + case 'elevenlabs': + case 'openai': + break; + } + // NOTE: we should have a switch or something + return false; case 'llms-service': const service = findModelsServiceOrNull(c.serviceId); diff --git a/src/modules/speex/protocols/rpc/rpc.wiretypes.ts b/src/modules/speex/protocols/rpc/rpc.wiretypes.ts index de27b6511..619dce3a3 100644 --- a/src/modules/speex/protocols/rpc/rpc.wiretypes.ts +++ b/src/modules/speex/protocols/rpc/rpc.wiretypes.ts @@ -15,6 +15,7 @@ export type SpeexSpeechParticle = export type SpeexWire_Access = z.infer; export type SpeexWire_Access_ElevenLabs = z.infer; +export type SpeexWire_Access_Inworld = z.infer; export type SpeexWire_Access_OpenAI = z.infer; export type SpeexWire_Voice = z.infer; @@ -39,6 +40,12 @@ export namespace SpeexWire { apiHost: z.string().optional(), }); + export const AccessInworld_schema = z.object({ + dialect: z.literal('inworld'), + apiKey: z.string(), // base64-encoded API key from Inworld Portal + apiHost: z.string().optional(), // defaults to api.inworld.ai + }); + export const AccessOpenAI_schema = z.object({ dialect: z.enum(['localai', 'openai']), apiKey: z.string().optional(), // openai: required, localai: optional @@ -47,7 +54,7 @@ export namespace SpeexWire { }); export const Access_schema = z.discriminatedUnion('dialect', - [AccessElevenLabs_schema, AccessOpenAI_schema], + [AccessElevenLabs_schema, AccessInworld_schema, AccessOpenAI_schema], ); @@ -59,6 +66,14 @@ export namespace SpeexWire { ttsVoiceId: z.string().optional(), }); + export const VoiceInworld_schema = z.object({ + dialect: z.literal('inworld'), + ttsModel: z.enum(['inworld-tts-1.5-max', 'inworld-tts-1.5-mini']).optional(), + ttsVoiceId: z.string().optional(), + ttsTemperature: z.number().min(0).max(2).optional(), + ttsSpeakingRate: z.number().min(0.5).max(1.5).optional(), + }); + export const VoiceLocalAI_schema = z.object({ dialect: z.literal('localai'), ttsBackend: z.string().optional(), // e.g., 'coqui', 'bark', 'piper', 'vall-e-x' @@ -75,7 +90,7 @@ export namespace SpeexWire { }); export const Voice_schema = z.discriminatedUnion('dialect', - [VoiceElevenLabs_schema, VoiceLocalAI_schema, VoiceOpenAI_schema], + [VoiceElevenLabs_schema, VoiceInworld_schema, VoiceLocalAI_schema, VoiceOpenAI_schema], ); diff --git a/src/modules/speex/protocols/rpc/synthesize-inworld.ts b/src/modules/speex/protocols/rpc/synthesize-inworld.ts new file mode 100644 index 000000000..74b98d84e --- /dev/null +++ b/src/modules/speex/protocols/rpc/synthesize-inworld.ts @@ -0,0 +1,290 @@ +/** + * Inworld AI TTS Synthesizer + * + * Implements Inworld's Text-to-Speech API: + * - Non-streaming: POST /tts/v1/voice + * - Streaming: POST /tts/v1/voice:stream (newline-delimited JSON) + * - Authentication: Basic auth with base64-encoded API key + * + * API Reference: https://docs.inworld.ai/api-reference/ttsAPI/texttospeech/synthesize-speech + */ + +import * as z from 'zod/v4'; + +import { fetchJsonOrTRPCThrow, fetchResponseOrTRPCThrow } from '~/server/trpc/trpc.router.fetchers'; + +import type { SpeexSpeechParticle, SpeexWire_Access_Inworld, SpeexWire_ListVoices_Output } from './rpc.wiretypes'; +import type { SynthesizeBackendFn } from './synthesize.core'; +import { SPEEX_DEBUG, SPEEX_DEFAULTS } from '../../speex.config'; + + +export namespace InworldWire_TTS_Synthesize { + + /// Request Schema + // API Reference: https://docs.inworld.ai/api-reference/ttsAPI/texttospeech/synthesize-speech + // Note: Request schemas are intentionally loose - server validates constraints + + const _AudioConfig_schema = z.object({ + audioEncoding: z.enum(['LINEAR16', 'MP3', 'OGG_OPUS', 'ALAW', 'MULAW', 'FLAC']).or(z.string()).optional(), + sampleRateHertz: z.number().optional(), // 8000-48000 Hz + speakingRate: z.number().optional(), // 0.5x to 1.5x speed + bitRate: z.number().optional(), // for compressed formats + }); + + export type Request = z.infer; + export const Request_schema = z.object({ + text: z.string(), // max 2000 chars per request + voiceId: z.string(), + modelId: z.string(), // e.g., 'inworld-tts-1.5-max', 'inworld-tts-1.5-mini' + temperature: z.number().optional(), // 0-2, default 1.1 + applyTextNormalization: z.enum(['ON', 'OFF', 'UNSPECIFIED']).or(z.string()).optional(), + audioConfig: _AudioConfig_schema.optional(), + timestampType: z.enum(['WORD', 'CHARACTER', 'UNSPECIFIED']).or(z.string()).optional(), + }); + + + /// Response Schema (non-streaming) + + const _Usage_schema = z.object({ + processedCharactersCount: z.number().optional(), + modelId: z.string().optional(), + }); + + export type Response = z.infer; + export const Response_schema = z.object({ + audioContent: z.string(), // base64-encoded audio (max 16MB) + usage: _Usage_schema.optional(), + timestampInfo: z.object({ + wordAlignment: z.unknown().optional(), + characterAlignment: z.unknown().optional(), + }).optional(), + phoneticDetails: z.array(z.unknown()).optional(), // TTS 1.5 models only + }); + + /// Streaming Chunk Schema (newline-delimited JSON, wrapped in "result") + + const _StreamChunkResult_schema = z.object({ + audioContent: z.string().optional(), // base64-encoded audio chunk + usage: _Usage_schema.optional(), + timestampInfo: z.object({ + wordAlignment: z.unknown().optional(), + }).optional(), + }); + + export type StreamChunk = z.infer; + export const StreamChunk_schema = z.object({ + result: _StreamChunkResult_schema, + }); + +} + +export namespace InworldWire_TTS_ListVoices { + + // Voice resource from Voices API + // API Reference: https://docs.inworld.ai/api-reference/voices/list-voices-in-a-workspace + // Note: Workspace ID can be omitted from path - derived from API key + + const _Voice_schema = z.object({ + name: z.string().optional(), // Resource name: workspaces/{workspace}/voices/{voice} + langCode: z.string().optional(), // we won't restrict to known codes + displayName: z.string().optional(), // Human-readable name (required in API, optional for parsing) + description: z.string().optional(), + tags: z.array(z.string()).optional(), // e.g., ['male', 'energetic', 'expressive'] + voiceId: z.string(), // Globally unique: {workspace}__{voice} + }); + + export type Response = z.infer; + export const Response_schema = z.object({ + voices: z.array(_Voice_schema), + }); + +} + + +function _selectModel(priority: 'fast' | 'balanced' | 'quality' | undefined, languageCode: string | undefined): string { + const fast = SPEEX_DEFAULTS.INWORLD_MODEL_FAST; + const quality = SPEEX_DEFAULTS.INWORLD_MODEL; + return priority === 'fast' ? fast // lowest latency + : priority === 'quality' ? quality // highest quality + : languageCode?.toLowerCase() === 'en' ? fast : quality; // 'balanced'/undefined +} + + +export const synthesizeInworld: SynthesizeBackendFn = async function* (params) { + const { access, text: inputText, voice, streaming, languageCode, priority, signal } = params; + if (access.dialect !== 'inworld' || voice.dialect !== 'inworld') + throw new Error('Mismatched dialect in Inworld synthesize'); + + // safety check: trim text that's too long (Inworld max is 2000 chars) + // NOTE: we shall make sure the caller 'chunker' is aware of the 2000 max + let text = inputText; + if (text.length > SPEEX_DEFAULTS.INWORLD_TTS_MAX_LEN) { + text = text.slice(0, SPEEX_DEFAULTS.INWORLD_TTS_MAX_LEN); + yield { t: 'log', level: 'info', message: `Text truncated to ${SPEEX_DEFAULTS.INWORLD_TTS_MAX_LEN} characters (Inworld limit)` }; + } + + // request + const { headers, url } = _inworldAccess(access, streaming ? '/tts/v1/voice:stream' : '/tts/v1/voice'); + const body: InworldWire_TTS_Synthesize.Request = { + text, + voiceId: voice.ttsVoiceId || SPEEX_DEFAULTS.INWORLD_VOICE, + modelId: voice.ttsModel || _selectModel(priority, languageCode), + ...(voice.ttsTemperature !== undefined && { temperature: voice.ttsTemperature }), + audioConfig: { + audioEncoding: 'MP3', // MP3 for browser MediaSource compatibility + sampleRateHertz: 48000, // also default + ...(voice.ttsSpeakingRate !== undefined && { speakingRate: voice.ttsSpeakingRate }), + }, + // applyTextNormalization: ... // defaults to automatically detecting whether to apply text normalization + } as const; + + // fetch + let response: Response; + try { + if (SPEEX_DEBUG) console.log(`[Speex][Inworld] POST (stream=${streaming})`, { url, headers: { ...headers, Authorization: '[REDACTED]' }, body }); + response = await fetchResponseOrTRPCThrow({ url, method: 'POST', headers, body, signal, name: 'Inworld' }); + } catch (error: any) { + yield { t: 'error', e: `Inworld fetch failed: ${error.message || 'Unknown error'}` }; + return; + } + + // stream back S/NS response + try { + yield* streaming + ? _streamInworldChunks(response, text.length) + : _returnInworldWhole(response, text.length); + } catch (error: any) { + yield { t: 'error', e: `Inworld audio error: ${error.message || 'Unknown error'}` }; + } +}; + + +/** Process streaming response (newline-delimited JSON chunks). */ +async function* _streamInworldChunks(response: Response, textLength: number): AsyncGenerator { + if (!response.body) throw new Error('Inworld streaming response has no body'); + + const reader = response.body.getReader(); + const decoder = new TextDecoder(); + let buffer = ''; + let totalBytes = 0; + + try { + while (true) { + const { done, value } = await reader.read(); + if (value) buffer += decoder.decode(value, { stream: true }); + if (done) buffer += '\n'; // on stream end, add newline to flush any remaining buffer + + // JSON: process complete lines + const lines = buffer.split('\n'); + buffer = lines.pop() || ''; // Keep incomplete line in buffer + + for (const line of lines) { + const trimmed = line.trim(); + if (!trimmed) continue; + + try { + const parseResult = InworldWire_TTS_Synthesize.StreamChunk_schema.safeParse(JSON.parse(trimmed)); + if (!parseResult.success) { + if (SPEEX_DEBUG) console.warn('[Speex][Inworld] Invalid streaming chunk:', parseResult.error.message, trimmed.slice(0, 100)); + continue; + } + + const { result } = parseResult.data; + if (result.audioContent) { + const audioBytes = Math.ceil(result.audioContent.length * 3 / 4); // Approximate base64 decoded size + totalBytes += audioBytes; + yield { + t: 'audio', + chunk: true, + base64: result.audioContent, + contentType: 'audio/mpeg', + }; + } + } catch { + // Ignore parse errors (partial/malformed chunks) + } + } + + if (done) break; + } + + yield { t: 'done', chars: textLength, audioBytes: totalBytes }; + + } finally { + reader.releaseLock(); + } +} + +/** Process non-streaming response (single JSON with base64 audio). */ +async function* _returnInworldWhole(response: Response, textLength: number): AsyncGenerator { + const json = InworldWire_TTS_Synthesize.Response_schema.parse(await response.json()); + + const audioBytes = Math.ceil(json.audioContent.length * 3 / 4); + + yield { + t: 'audio', + chunk: false, + base64: json.audioContent, + contentType: 'audio/mpeg', + characterCost: json.usage?.processedCharactersCount, + }; + + yield { t: 'done', chars: textLength, audioBytes }; +} + + +/** + * List available voices from Inworld. + * API: GET /voices/v1/voices (workspace derived from API key) + */ +export async function listVoicesInworld(access: SpeexWire_Access_Inworld): Promise { + const { headers, url } = _inworldAccess(access, '/voices/v1/voices'); + + const voicesResponse = InworldWire_TTS_ListVoices.Response_schema.parse( + await fetchJsonOrTRPCThrow({ url, headers, name: 'Inworld' }), + ); + + const voices = voicesResponse.voices.map(voice => ({ + id: voice.voiceId, + name: voice.displayName || voice.voiceId, + description: voice.description || undefined, + category: voice.tags?.join(', ') || undefined, + })); + + // ensure default voice is in the list + const defaultVoiceId = SPEEX_DEFAULTS.INWORLD_VOICE; + if (!voices.some(v => v.id === defaultVoiceId)) { + console.error(`[Speex][Inworld] Default voice "${defaultVoiceId}" not found in voice list, adding it manually.`); + voices.unshift({ + id: defaultVoiceId, + name: defaultVoiceId, + description: 'Default voice', + category: undefined, + }); + } + + return { voices }; +} + + +// Helpers + +function _inworldAccess(access: SpeexWire_Access_Inworld, apiPath: string): { headers: HeadersInit; url: string } { + const apiKey = (access.apiKey || '').trim(); + if (!apiKey) + throw new Error('Missing Inworld API key'); + + let host = (access.apiHost || 'api.inworld.ai').trim(); + if (!host.startsWith('http')) + host = `https://${host}`; + if (host.endsWith('/') && apiPath.startsWith('/')) + host = host.slice(0, -1); + + return { + headers: { + 'Authorization': `Basic ${apiKey}`, // Inworld API key is already base64-encoded + 'Content-Type': 'application/json', + }, + url: host + apiPath, + }; +} diff --git a/src/modules/speex/protocols/rpc/synthesize.core.ts b/src/modules/speex/protocols/rpc/synthesize.core.ts index 61bad3f4e..96c1f8251 100644 --- a/src/modules/speex/protocols/rpc/synthesize.core.ts +++ b/src/modules/speex/protocols/rpc/synthesize.core.ts @@ -10,6 +10,7 @@ import type { SpeexSpeechParticle, SpeexWire_Access, SpeexWire_ListVoices_Output, SpeexWire_Synthesize_Input, SpeexWire_Voice } from './rpc.wiretypes'; import { listVoicesElevenLabs, synthesizeElevenLabs } from './synthesize-elevenlabs'; +import { listVoicesInworld, synthesizeInworld } from './synthesize-inworld'; import { listVoicesLocalAIOrThrow, listVoicesOpenAI, synthesizeOpenAIProtocol } from './synthesize-openai'; @@ -46,6 +47,10 @@ export async function* speexRpcCoreSynthesize(input: SpeexWire_Synthesize_Input, yield* synthesizeElevenLabs({ access, text, voice, streaming, languageCode, priority, signal }); break; + case 'inworld': + yield* synthesizeInworld({ access, text, voice, streaming, languageCode, priority, signal }); + break; + case 'localai': case 'openai': yield* synthesizeOpenAIProtocol({ access, text, voice, streaming, languageCode, priority, signal }); @@ -68,6 +73,9 @@ export async function speexRpcCoreListVoices(access: SpeexWire_Access): Promise< case 'elevenlabs': return await listVoicesElevenLabs(access); + case 'inworld': + return await listVoicesInworld(access); + case 'openai': return { voices: listVoicesOpenAI() }; diff --git a/src/modules/speex/speex.client.ts b/src/modules/speex/speex.client.ts index 2c69e1f5f..cb987fed4 100644 --- a/src/modules/speex/speex.client.ts +++ b/src/modules/speex/speex.client.ts @@ -191,6 +191,7 @@ export function speakRawText_withHandle( switch (effectiveEngine.vendorType) { // RPC providers: route through speex.router RPC case 'elevenlabs': + case 'inworld': case 'openai': case 'localai': { diff --git a/src/modules/speex/speex.config.ts b/src/modules/speex/speex.config.ts index 0dce418ab..9fad9bece 100644 --- a/src/modules/speex/speex.config.ts +++ b/src/modules/speex/speex.config.ts @@ -22,4 +22,10 @@ export const SPEEX_DEFAULTS = { // LocalAI - kokoro is a high-quality neural TTS LOCALAI_MODEL: 'kokoro', + // Inworld - high-quality, low-latency TTS with voice cloning + INWORLD_MODEL: 'inworld-tts-1.5-max', // best quality (~200ms latency, $10/1M chars) + INWORLD_MODEL_FAST: 'inworld-tts-1.5-mini', // fastest (<100ms latency, $5/1M chars) + INWORLD_VOICE: 'Alex', // default voice + INWORLD_TTS_MAX_LEN: 2000, // max chars per TTS request - as of 2026-01-27 it's 2000 + } as const; \ No newline at end of file diff --git a/src/modules/speex/speex.types.ts b/src/modules/speex/speex.types.ts index 4ac160e3f..a656611b4 100644 --- a/src/modules/speex/speex.types.ts +++ b/src/modules/speex/speex.types.ts @@ -7,7 +7,7 @@ import type { SpeexWire_VoiceOption } from './protocols/rpc/rpc.wiretypes'; // Speex Vendor Types (supported TTS providers) -export type DSpeexVendorType = 'elevenlabs' | 'localai' | 'openai' | 'webspeech'; +export type DSpeexVendorType = 'elevenlabs' | 'inworld' | 'localai' | 'openai' | 'webspeech'; // Speex Engines - instances of TTS Vendors Types - persisted in store-module-speex @@ -33,6 +33,7 @@ export type SpeexEngineId = string; // agiUuidV4('speex.engine.instance') // helper for mapping credentials and voice types to the engine type interface _TypeMap extends Record { 'elevenlabs': { voice: DVoiceElevenLabs; credentials: DCredentialsApiKey }; + 'inworld': { voice: DVoiceInworld; credentials: DCredentialsApiKey }; 'localai': { voice: DVoiceLocalAI; credentials: DCredentialsLLMSService | DCredentialsApiKey }; 'openai': { voice: DVoiceOpenAI; credentials: DCredentialsLLMSService | DCredentialsApiKey }; 'webspeech': { voice: DVoiceWebSpeech; credentials: DCredentialsNone }; @@ -55,6 +56,14 @@ export interface DVoiceElevenLabs { // ttsS?: boolean; } +export interface DVoiceInworld { + dialect: 'inworld'; + ttsModel?: 'inworld-tts-1.5-max' | 'inworld-tts-1.5-mini'; + ttsVoiceId?: string; // e.g., 'Alex', 'Ashley', 'Dennis' + ttsTemperature?: number; // 0-2, default 1.1 (controls expressiveness) + ttsSpeakingRate?: number; // 0.5-1.5, default 1.0 +} + // type LocalAITTSBackend = | 'coqui' | 'bark' | 'piper' | 'transformers-musicgen' | 'vall-e-x' export interface DVoiceLocalAI { dialect: 'localai'; diff --git a/src/modules/speex/speex.vendors-registry.ts b/src/modules/speex/speex.vendors-registry.ts index a2296db4d..1d93486c9 100644 --- a/src/modules/speex/speex.vendors-registry.ts +++ b/src/modules/speex/speex.vendors-registry.ts @@ -3,8 +3,9 @@ import type { ModelVendorId } from '~/modules/llms/vendors/vendors.registry'; import type { DSpeexEngineAny, DSpeexVendorType } from './speex.types'; import type { ISpeexVendor, ISpeexVendorAny } from './ISpeexVendor'; -// vendor imports (will be implemented as stubs initially) +// vendor imports import { SpeexVendorElevenLabs } from './vendors/elevenlabs.vendor'; +import { SpeexVendorInworld } from './vendors/inworld.vendor'; import { SpeexVendorLocalAI } from './vendors/localai.vendor'; import { SpeexVendorOpenAI } from './vendors/openai.vendor'; import { SpeexVendorWebSpeech } from './vendors/webspeech.vendor'; @@ -14,6 +15,7 @@ import { SpeexVendorWebSpeech } from './vendors/webspeech.vendor'; const _SPEEX_VENDOR_REGISTRY: { [key in DSpeexVendorType]: ISpeexVendor } = { elevenlabs: SpeexVendorElevenLabs, + inworld: SpeexVendorInworld, localai: SpeexVendorLocalAI, openai: SpeexVendorOpenAI, webspeech: SpeexVendorWebSpeech, diff --git a/src/modules/speex/vendors/inworld.vendor.ts b/src/modules/speex/vendors/inworld.vendor.ts new file mode 100644 index 000000000..ccdeb9167 --- /dev/null +++ b/src/modules/speex/vendors/inworld.vendor.ts @@ -0,0 +1,31 @@ +import type { ISpeexVendor } from '../ISpeexVendor'; +import { SPEEX_DEFAULTS } from '../speex.config'; + + +export const SpeexVendorInworld: ISpeexVendor<'inworld'> = { + vendorType: 'inworld', + name: 'Inworld', + protocol: 'rpc', + location: 'cloud', + priority: 15, // between ElevenLabs (10) and LocalAI (20) + + autoFromLlmVendorIds: undefined, + + capabilities: { + streaming: true, + voiceListing: true, + speedControl: true, + pitchControl: false, + }, + + getDefaultCredentials: () => ({ + type: 'api-key', + apiKey: '', + }), + + getDefaultVoice: () => ({ + dialect: 'inworld', + ttsModel: SPEEX_DEFAULTS.INWORLD_MODEL, + ttsVoiceId: SPEEX_DEFAULTS.INWORLD_VOICE, + }), +};