big-agi/src/modules/speex/speex.client.ts
/**
* Speex - Speech Synthesis Module
*
* Centralized speech synthesis with provider abstraction.
* Supports multiple TTS engines: ElevenLabs, OpenAI, LocalAI, Web Speech.
*
* Future: NorthBridge integration for single-place queuing across TTS engines and ASR.
*/
import { AudioAutoPlayer } from '~/common/util/audio/AudioAutoPlayer';
import { useUIPreferencesStore } from '~/common/stores/store-ui';
import type { DSpeexEngineAny, SpeexSpeakTextOptions, SpeexSpeakTextResult, SpeexSynthesizeOptions, SpeexSynthesizeResult, SpeexVoiceSelector } from './speex.types';
import { speexFindEngineById, speexFindGlobalEngine, speexFindValidEngineByType } from './store-module-speex';
import { speex_splitTextIntoChunks, speex_textApplyCharLimit, speex_textCleanupUnspoken } from './speex.processing';
import { SPEEX_DEBUG } from './speex.config';
import { speexSynthesize_RPC } from './protocols/rpc/rpc.client';
import { speexSynthesize_WebSpeech, speexSynthesize_WebSpeechStop } from './protocols/webspeech/webspeech.client';
interface _ChunkedCallbacks {
onChunkStart?: (progress: _ChunkedProgress) => void;
onChunkEnd?: (progress: _ChunkedProgress) => void;
onChunkError?: (error: Error, progress: _ChunkedProgress) => void;
onComplete?: (aborted: boolean) => void;
}
interface _ChunkedProgress {
chunkIndex: number; // current chunk (0-based)
totalChunks: number; // total chunks
currentChunkStart: string; // first ~100 chars of current chunk (for display)
}
/**
* Speaks text with automatic chunking, preprocessing, and abort support.
* Synthesizes and plays each chunk sequentially, waiting for playback to complete before the next.
* Breaks on error to avoid wasting API credits on repeated failures.
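*
* @example
* // minimal usage sketch, assuming an engineId-style selector (see SpeexVoiceSelector) and a caller-owned AbortController
* ```typescript
* const ac = new AbortController();
* const result = await speakText(
*   'A long passage to read aloud...',
*   { engineId: 'my-engine-id' },  // hypothetical selector value
*   { maxChunkLength: 500 },       // opt into chunking
*   ac.signal,                     // call ac.abort() to stop mid-run
*   { onChunkEnd: (p) => console.log(`spoke chunk ${p.chunkIndex + 1} of ${p.totalChunks}`) },
* );
* ```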
*/
export async function speakText(
inputText: string,
voiceSelector: SpeexVoiceSelector,
options?: SpeexSpeakTextOptions & SpeexSynthesizeOptions,
signal?: AbortSignal,
chunkedCallbacks?: _ChunkedCallbacks,
): Promise<SpeexSpeakTextResult> {
// preprocess text unless disabled
if (!options?.disableUnspeakable)
inputText = speex_textCleanupUnspoken(inputText);
if (!options?.disableCharLimit)
inputText = speex_textApplyCharLimit(inputText);
// chunk text only when a maximum chunk length is requested
const chunks = !options?.maxChunkLength ? [inputText]
: speex_splitTextIntoChunks(inputText, options.maxChunkLength);
if (!chunks.length) {
chunkedCallbacks?.onComplete?.(false);
return { success: true, aborted: false, chunksSpoken: 0, totalChunks: 0 };
}
let chunksSpoken = 0;
let currentHandle: _SpeexSpeakHandle | null = null;
let firstError: { errorType: SpeexSpeakTextResult['errorType'], errorText: string } | undefined;
// wire up abort to stop current playback
const onAbort = () => currentHandle?.stop();
signal?.addEventListener('abort', onAbort);
try {
for (let i = 0; i < chunks.length && !signal?.aborted; i++) {
const chunkText = chunks[i];
const progress: _ChunkedProgress = {
chunkIndex: i,
totalChunks: chunks.length,
currentChunkStart: chunkText.slice(0, 100),
};
chunkedCallbacks?.onChunkStart?.(progress);
currentHandle = speakRawText_withHandle(chunkText, voiceSelector, options);
// wait for both playback and synthesis to complete
const [playbackCompleted, synthesisResult] = await Promise.all([
currentHandle.playbackComplete, // resolves to a boolean (true if playback finished)
currentHandle.synthesisComplete, // resolves to the SpeexSynthesizeResult
]);
currentHandle = null;
// check for synthesis errors - break to avoid wasting credits on repeated failures
if (!synthesisResult.success) {
firstError = firstError || {
errorType: synthesisResult.errorType,
errorText: synthesisResult.errorText,
};
chunkedCallbacks?.onChunkError?.(new Error(synthesisResult.errorText), progress);
break;
}
// check if stopped or aborted
if (!playbackCompleted || signal?.aborted)
break;
chunksSpoken++;
chunkedCallbacks?.onChunkEnd?.(progress);
}
} finally {
signal?.removeEventListener('abort', onAbort);
}
const aborted = signal?.aborted ?? false;
chunkedCallbacks?.onComplete?.(aborted);
return {
success: !firstError && !aborted && chunksSpoken === chunks.length,
aborted,
chunksSpoken,
totalChunks: chunks.length,
...firstError,
};
}
/**
* Handle returned by speakRawText_withHandle() for controlled playback.
* Allows waiting for playback completion and stopping mid-playback.
*/
interface _SpeexSpeakHandle {
readonly synthesisComplete: Promise<SpeexSynthesizeResult>;
readonly playbackComplete: Promise<boolean>;
/** Stops both synthesis and playback immediately */
stop(): void;
}
/**
* Speak text with a handle for controlled playback.
*
* Returns a _SpeexSpeakHandle that allows:
* - Awaiting synthesis completion (synthesisComplete)
* - Awaiting playback completion (playbackComplete)
* - Stopping both synthesis and playback mid-stream (stop())
*
* @example
* ```typescript
* // Fire and forget (ignores the handle); voiceSelector is a SpeexVoiceSelector
* speakRawText_withHandle('Hello world', voiceSelector);
*
* // Wait for playback to complete
* const handle = speakRawText_withHandle('Hello world', voiceSelector);
* await handle.playbackComplete;
*
* // Stop early
* const handle2 = speakRawText_withHandle('Long text...', voiceSelector);
* handle2.stop();
* ```
*/
export function speakRawText_withHandle(
rawText: string, // this won't be processed - use speakText for chunking, cleanup, etc.
voiceSelector: SpeexVoiceSelector,
rpcOptions?: SpeexSynthesizeOptions,
): _SpeexSpeakHandle {
// resolve engine from voice selector
const engine = _engineFromSelector(voiceSelector);
if (!engine)
return {
synthesisComplete: Promise.resolve({ success: false, errorType: 'tts-no-engine', errorText: 'No TTS engine configured. Please configure a TTS engine in Settings.' } satisfies SpeexSynthesizeResult),
playbackComplete: Promise.resolve(false), // no engine = not completed
stop: () => {
},
};
// apply voice override from selector (merge with engine defaults)
const effectiveEngine = _engineApplyVoiceOverride(engine, voiceSelector);
if (SPEEX_DEBUG) console.log(`[Speex] speakRawText: Using effective engine ${effectiveEngine.engineId} (vendor: ${effectiveEngine.vendorType})`, { length: rawText.length, voiceSelector });
const {
rpcDisableStreaming = false,
disablePlayback = false,
disableLivePlayback = false,
rpcReturnAudio = false,
languageCode = _getUIPreferenceLanguageCode(),
priority,
} = rpcOptions || {};
let isStopped = false;
switch (effectiveEngine.vendorType) {
// RPC providers: route through speex.router RPC
case 'elevenlabs':
case 'openai':
case 'localai': {
const abortController = new AbortController();
let audioPlayer: AudioAutoPlayer | undefined;
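// player factory for speexSynthesize_RPC; we keep the reference so stop() and the waitForPlaybackEnd() call below can reach it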
const createAudioPlayer = disablePlayback ? undefined
: () => audioPlayer = new AudioAutoPlayer(disableLivePlayback);
// deferred resolver for the playback-completion promise, so stop() and normal completion can both resolve it without racing the awaiter
let playbackCompleteResolve: (completed: boolean) => void;
const playbackComplete = new Promise<boolean>(resolve => playbackCompleteResolve = resolve);
const synthesisComplete = speexSynthesize_RPC(
effectiveEngine,
rawText,
{ dataStreaming: !rpcDisableStreaming, languageCode, returnAudioBuffer: rpcReturnAudio, priority },
abortController,
createAudioPlayer,
).then(async (result) => {
// wait for playback to complete (unless stopped already)
if (!isStopped && audioPlayer)
await audioPlayer.waitForPlaybackEnd();
// resolves the playback completion promise
playbackCompleteResolve(!isStopped);
// return synthesis result for the synthesis completion promise
return result;
}).catch((error) => {
// ensure playbackComplete resolves even on unexpected errors
playbackCompleteResolve(false);
return { success: false, errorType: 'tts-exception', errorText: error?.message || 'Unexpected synthesis error' } satisfies SpeexSynthesizeResult;
});
// _SpeexSpeakHandle
return {
synthesisComplete,
playbackComplete, // resolves just a tad earlier than synthesisComplete
stop: () => {
isStopped = true;
abortController.abort();
audioPlayer?.stop();
playbackCompleteResolve(false);
},
};
}
// Web Speech: client-only, no RPC
case 'webspeech': {
// with playback disabled there is nothing to do here, since the Web Speech API is playback-centric
if (disablePlayback) {
return {
synthesisComplete: Promise.resolve({ success: false, errorType: 'tts-playback-disabled', errorText: 'Playback is disabled for Web Speech synthesis.' } satisfies SpeexSynthesizeResult),
playbackComplete: Promise.resolve(false), // playback disabled = not completed
stop: () => {
// no-op
},
};
}
const synthesisComplete = speexSynthesize_WebSpeech(rawText, effectiveEngine.voice);
// playbackComplete: true if finished normally, false if stopped or error
const playbackComplete = synthesisComplete.then(() => !isStopped).catch(() => false);
// _SpeexSpeakHandle
return {
synthesisComplete,
playbackComplete,
stop: () => {
isStopped = true;
speexSynthesize_WebSpeechStop();
},
};
}
}
}
// -- Private helpers --
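// Selector resolution order (illustrative; exact selector shapes are defined in speex.types):
//   { engineId: '...' }          -> that specific engine, if found
//   { voice: { dialect: ... } }  -> first valid engine of that dialect/vendor type
//   null / undefined             -> the global engine (active or priority-ranked)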
function _engineFromSelector(selector: SpeexVoiceSelector): DSpeexEngineAny | null {
if (selector) {
// A. most specific selector: engineId
if ('engineId' in selector && selector.engineId) {
const engine = speexFindEngineById(selector.engineId, false /* force through */);
if (engine) return engine;
}
// B. voice.dialect - find first matching engine that's probably valid
if ('voice' in selector && selector.voice?.dialect) {
const engine = speexFindValidEngineByType(selector.voice.dialect);
if (engine) return engine;
}
}
// C. fall back to global engine (active or priority-ranked)
return speexFindGlobalEngine();
}
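// Note: the selector's voice fields shallow-merge over the engine's voice defaults, so a selector that
// sets only a single voice field keeps the engine's remaining voice settings intact.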
function _engineApplyVoiceOverride(engine: DSpeexEngineAny, selector: SpeexVoiceSelector): DSpeexEngineAny {
return (!selector || !('voice' in selector) || !selector.voice) ? engine : {
...engine,
voice: { ...engine.voice, ...selector.voice },
} as DSpeexEngineAny;
}
// extract base language code (e.g., 'en-US' -> 'en', 'fr' -> 'fr')
function _getUIPreferenceLanguageCode(): string | undefined {
const { preferredLanguage } = useUIPreferencesStore.getState();
return preferredLanguage?.split('-')[0]?.toLowerCase() || undefined;
}