mirror of
https://github.com/enricoros/big-AGI.git
synced 2026-05-11 14:10:15 -07:00
308 lines
11 KiB
TypeScript
308 lines
11 KiB
TypeScript
/**
 * Speex - Speech Synthesis Module
 *
 * Centralized speech synthesis with provider abstraction.
 * Supports multiple TTS engines: ElevenLabs, OpenAI, LocalAI, Web Speech.
 *
 * Future: NorthBridge integration for single-place queuing across TTS(s) and ASR.
 */
|
|
|
|
import { AudioAutoPlayer } from '~/common/util/audio/AudioAutoPlayer';
|
|
import { useUIPreferencesStore } from '~/common/stores/store-ui';
|
|
|
|
import type { DSpeexEngineAny, SpeexSpeakTextOptions, SpeexSpeakTextResult, SpeexSynthesizeOptions, SpeexSynthesizeResult, SpeexVoiceSelector } from './speex.types';
|
|
import { speexFindEngineById, speexFindGlobalEngine, speexFindValidEngineByType } from './store-module-speex';
|
|
import { speex_splitTextIntoChunks, speex_textApplyCharLimit, speex_textCleanupUnspoken } from './speex.processing';
|
|
|
|
import { SPEEX_DEBUG } from './speex.config';
|
|
import { speexSynthesize_RPC } from './protocols/rpc/rpc.client';
|
|
import { speexSynthesize_WebSpeech, speexSynthesize_WebSpeechStop } from './protocols/webspeech/webspeech.client';
|
|
|
|
|
|
/**
 * Optional progress hooks accepted by speakText(). Every member may be omitted.
 */
interface _ChunkedCallbacks {

  /** Invoked for each chunk right before it is handed over for synthesis. */
  onChunkStart?: (progress: _ChunkedProgress) => void;

  /** Invoked after a chunk has been synthesized and played back without error, stop or abort. */
  onChunkEnd?: (progress: _ChunkedProgress) => void;

  /** Invoked when the synthesis of a chunk reports a failure; the Error carries the reported error text. */
  onChunkError?: (error: Error, progress: _ChunkedProgress) => void;

  /** Invoked when the run ends; `aborted` mirrors the state of the abort signal (false if there was nothing to speak). */
  onComplete?: (aborted: boolean) => void;

}
|
|
|
|
/**
 * Snapshot of the chunked-speech progress, passed to the per-chunk callbacks.
 */
interface _ChunkedProgress {

  /** Index of the current chunk (0-based). */
  chunkIndex: number;

  /** Total number of chunks in this run. */
  totalChunks: number;

  /** Beginning of the current chunk (at most the first 100 characters), for display. */
  currentChunkStart: string;

}
|
|
|
|
|
|
/**
 * Speaks text with automatic chunking, preprocessing, and abort support.
 *
 * Chunks are processed sequentially: the next chunk starts only after both the synthesis and
 * the playback promises of the current chunk have settled. The loop breaks on the first
 * synthesis error, to avoid wasting API credits on repeated failures.
 *
 * @param inputText the text to speak; cleaned up and length-limited unless disabled via `options`
 * @param voiceSelector selects the engine and/or the voice used for every chunk
 * @param options preprocessing toggles (`disableUnspeakable`, `disableCharLimit`), chunking
 *   (`maxChunkLength`: a falsy value disables chunking), plus the synthesis options forwarded to each chunk
 * @param signal optional abort signal: stops the chunk being spoken and prevents further chunks from starting
 * @param chunkedCallbacks optional per-chunk progress callbacks
 * @returns a summary of the run: `success` is true only if there was no error, no abort, and every chunk was spoken
 */
export async function speakText(
  inputText: string,
  voiceSelector: SpeexVoiceSelector,
  options?: SpeexSpeakTextOptions & SpeexSynthesizeOptions,
  signal?: AbortSignal,
  chunkedCallbacks?: _ChunkedCallbacks,
): Promise<SpeexSpeakTextResult> {

  // preprocess text unless disabled
  if (!options?.disableUnspeakable)
    inputText = speex_textCleanupUnspoken(inputText);
  if (!options?.disableCharLimit)
    inputText = speex_textApplyCharLimit(inputText);

  // chunk text unless disabled
  // NOTE: with chunking disabled, blank text (e.g. everything was removed by the cleanup) yields NO chunks
  //       rather than a single empty chunk, so that an empty request is never sent to the engine
  const chunks = options?.maxChunkLength ? speex_splitTextIntoChunks(inputText, options.maxChunkLength /* 500 if missing */)
    : inputText.trim() ? [inputText]
      : [];

  // nothing to speak: complete right away
  if (!chunks.length) {
    chunkedCallbacks?.onComplete?.(false);
    return { success: true, aborted: false, chunksSpoken: 0, totalChunks: 0 };
  }

  let chunksSpoken = 0;
  let currentHandle: _SpeexSpeakHandle | null = null;
  let firstError: { errorType: SpeexSpeakTextResult['errorType'], errorText: string } | undefined;

  // wire up abort to stop current playback
  const onAbort = () => currentHandle?.stop();
  signal?.addEventListener('abort', onAbort);

  try {
    for (let i = 0; i < chunks.length && !signal?.aborted; i++) {
      const chunkText = chunks[i];
      const progress: _ChunkedProgress = {
        chunkIndex: i,
        totalChunks: chunks.length,
        currentChunkStart: chunkText.slice(0, 100),
      };
      chunkedCallbacks?.onChunkStart?.(progress);

      currentHandle = speakRawText_withHandle(chunkText, voiceSelector, options);

      // wait for both playback and synthesis to complete
      const [playbackCompleted, synthesisResult] = await Promise.all([
        currentHandle.playbackComplete, // for a boolean
        currentHandle.synthesisComplete, // for the SpeexSpeakResult
      ]);

      // the chunk has settled: a late abort has nothing left to stop
      currentHandle = null;

      // check for synthesis errors - break to avoid wasting credits on repeated failures
      if (!synthesisResult.success) {
        firstError = firstError || {
          errorType: synthesisResult.errorType,
          errorText: synthesisResult.errorText,
        };
        chunkedCallbacks?.onChunkError?.(new Error(synthesisResult.errorText), progress);
        break;
      }

      // check if stopped or aborted
      if (!playbackCompleted || signal?.aborted)
        break;

      chunksSpoken++;
      chunkedCallbacks?.onChunkEnd?.(progress);
    }
  } finally {
    // always detach from the caller's signal, even if a callback or the synthesis throws
    signal?.removeEventListener('abort', onAbort);
  }

  const aborted = signal?.aborted ?? false;
  chunkedCallbacks?.onComplete?.(aborted);

  return {
    success: !firstError && !aborted && chunksSpoken === chunks.length,
    aborted,
    chunksSpoken,
    totalChunks: chunks.length,
    ...firstError, // adds errorType/errorText only when a chunk failed
  };
}
|
|
|
|
|
|
/**
 * Handle returned by speakRawText_withHandle() for controlled playback.
 * Lets the caller await the synthesis outcome, await the end of playback, and stop mid-way.
 */
interface _SpeexSpeakHandle {

  /** Settles with the synthesis result (success, or failure with an error type and text). */
  readonly synthesisComplete: Promise<SpeexSynthesizeResult>;

  /** Settles with `true` when not stopped, and `false` when stopped or when nothing could be played. */
  readonly playbackComplete: Promise<boolean>;

  /** Stops both synthesis and playback immediately */
  stop(): void;

}
|
|
|
|
/**
 * Speak text with a handle for controlled playback.
 *
 * Returns a _SpeexSpeakHandle that allows:
 * - Awaiting synthesis completion (synthesisComplete)
 * - Awaiting playback completion (playbackComplete)
 * - Stopping both synthesis and playback mid-stream (stop())
 *
 * The returned promises do not reject: failures (no engine, playback disabled for Web Speech,
 * unsupported engine type, unexpected exceptions) are reported through the resolved values.
 *
 * @param rawText text to speak as-is (no cleanup, no char limit, no chunking)
 * @param voiceSelector selects the engine (by id, by voice dialect, or the global one) and optionally overrides the voice
 * @param rpcOptions synthesis/playback options; the language code defaults to the UI preferred language
 * @returns the handle for this single utterance
 *
 * @example
 * ```typescript
 * // Fire and forget (ignores handle)
 * speakRawText_withHandle('Hello world', voiceSelector);
 *
 * // Wait for playback to complete
 * const handle = speakRawText_withHandle('Hello world', voiceSelector);
 * await handle.playbackComplete;
 *
 * // Stop early
 * const handle = speakRawText_withHandle('Long text...', voiceSelector);
 * handle.stop();
 * ```
 */
export function speakRawText_withHandle(
  rawText: string, // this won't be processed - use speakText for chunking, cleanup, etc.
  voiceSelector: SpeexVoiceSelector,
  rpcOptions?: SpeexSynthesizeOptions,
): _SpeexSpeakHandle {

  // resolve engine from voice selector
  const engine = _engineFromSelector(voiceSelector);
  if (!engine)
    return _failedSpeakHandle({ success: false, errorType: 'tts-no-engine', errorText: 'No TTS engine configured. Please configure a TTS engine in Settings.' });

  // apply voice override from selector (merge with engine defaults)
  const effectiveEngine = _engineApplyVoiceOverride(engine, voiceSelector);
  if (SPEEX_DEBUG) console.log(`[Speex] speakRawText: Using effective engine ${effectiveEngine.engineId} (vendor: ${effectiveEngine.vendorType})`, { length: rawText.length, voiceSelector });

  const {
    rpcDisableStreaming = false,
    disablePlayback = false,
    disableLivePlayback = false,
    rpcReturnAudio = false,
    languageCode = _getUIPreferenceLanguageCode(),
    priority,
  } = rpcOptions || {};


  // set by stop(), read by the completion handlers to report 'not completed'
  let isStopped = false;

  switch (effectiveEngine.vendorType) {
    // RPC providers: route through speex.router RPC
    case 'elevenlabs':
    case 'openai':
    case 'localai': {

      const abortController = new AbortController();
      // the player is created lazily, through the factory passed to the RPC client (no factory = no playback)
      let audioPlayer: AudioAutoPlayer | undefined;
      const createAudioPlayer = disablePlayback ? undefined
        : () => audioPlayer = new AudioAutoPlayer(disableLivePlayback);

      // deferred resolver for the promise (so stop and the end don't race with awaiting the promise)
      let playbackCompleteResolve: (completed: boolean) => void;
      const playbackComplete = new Promise<boolean>(resolve => playbackCompleteResolve = resolve);

      const synthesisComplete = speexSynthesize_RPC(
        effectiveEngine,
        rawText,
        { dataStreaming: !rpcDisableStreaming, languageCode, returnAudioBuffer: rpcReturnAudio, priority },
        abortController,
        createAudioPlayer,
      ).then(async (result) => {

        // wait for playback to complete (unless stopped already)
        if (!isStopped && audioPlayer)
          await audioPlayer.waitForPlaybackEnd();

        // resolves the playback completion promise (no effect if stop() has resolved it already)
        playbackCompleteResolve(!isStopped);

        // return synthesis result for the synthesis completion promise
        return result;

      }).catch((error) => {
        // ensure playbackComplete resolves even on unexpected errors
        playbackCompleteResolve(false);
        return { success: false, errorType: 'tts-exception', errorText: error?.message || 'Unexpected synthesis error' } satisfies SpeexSynthesizeResult;
      });

      // _SpeexSpeakHandle
      return {
        synthesisComplete,
        playbackComplete, // resolves just a tad earlier than synthesisComplete
        stop: () => {
          isStopped = true;
          abortController.abort();
          audioPlayer?.stop();
          playbackCompleteResolve(false);
        },
      };
    }

    // Web Speech: client-only, no RPC
    case 'webspeech': {

      // if we disable playback, we have nothing to do here, really, as Web Speech API is playback-centric
      if (disablePlayback)
        return _failedSpeakHandle({ success: false, errorType: 'tts-playback-disabled', errorText: 'Playback is disabled for Web Speech synthesis.' });

      const synthesisComplete = speexSynthesize_WebSpeech(rawText, effectiveEngine.voice);
      // playbackComplete: true if finished normally, false if stopped or error
      const playbackComplete = synthesisComplete.then(() => !isStopped).catch(() => false);

      // _SpeexSpeakHandle
      return {
        synthesisComplete,
        playbackComplete,
        stop: () => {
          isStopped = true;
          speexSynthesize_WebSpeechStop();
        },
      };
    }

    // Unrecognized engine type at runtime (e.g. an engine object that does not match the known vendors):
    // report a failure through the handle rather than returning undefined, which would crash the callers.
    // NOTE: when adding a vendor type, add its case above - this fallback is not a substitute for it.
    default:
      return _failedSpeakHandle({ success: false, errorType: 'tts-exception', errorText: `Unsupported TTS engine type: ${engine.vendorType}` });
  }
}

/** Builds a handle for an utterance that failed before starting: both promises are already settled, and stop() is a no-op. */
function _failedSpeakHandle(result: SpeexSynthesizeResult): _SpeexSpeakHandle {
  return {
    synthesisComplete: Promise.resolve(result),
    playbackComplete: Promise.resolve(false), // nothing was played = not completed
    stop: () => {
      // no-op
    },
  };
}
|
|
|
|
|
|
// -- Private helpers --
|
|
|
|
/**
 * Resolves the engine for a voice selector, trying from the most to the least specific source:
 * explicit engine id, then the voice dialect, then the global engine.
 */
function _engineFromSelector(selector: SpeexVoiceSelector): DSpeexEngineAny | null {

  // A. most specific selector: engineId
  const engineById = (selector && 'engineId' in selector && selector.engineId)
    ? speexFindEngineById(selector.engineId, false /* force through */)
    : null;
  if (engineById)
    return engineById;

  // B. voice.dialect - find first matching engine that's probably valid
  const engineByDialect = (selector && 'voice' in selector && selector.voice?.dialect)
    ? speexFindValidEngineByType(selector.voice.dialect)
    : null;
  if (engineByDialect)
    return engineByDialect;

  // C. fall back to global engine (active or priority-ranked)
  return speexFindGlobalEngine();
}
|
|
|
|
/**
 * Returns the engine with the selector's voice properties layered over the engine's own voice.
 * The engine is returned untouched when the selector carries no voice.
 */
function _engineApplyVoiceOverride(engine: DSpeexEngineAny, selector: SpeexVoiceSelector): DSpeexEngineAny {

  // no override requested
  if (!selector || !('voice' in selector) || !selector.voice)
    return engine;

  // selector properties win over the engine defaults
  const mergedVoice = { ...engine.voice, ...selector.voice };
  return { ...engine, voice: mergedVoice } as DSpeexEngineAny;
}
|
|
|
|
/**
 * Reads the preferred language from the UI preferences store and returns its base
 * language code, lowercased (e.g., 'en-US' -> 'en', 'fr' -> 'fr'), or undefined if unset.
 */
function _getUIPreferenceLanguageCode(): string | undefined {
  const preferred = useUIPreferencesStore.getState().preferredLanguage;
  if (!preferred)
    return undefined;

  // keep only what precedes the first dash
  const [baseCode] = preferred.split('-');
  return baseCode?.toLowerCase() || undefined;
}
|