mirror of
https://github.com/enricoros/big-AGI.git
synced 2026-05-10 21:50:14 -07:00
Speex: fix elevenlabs
This commit is contained in:
@@ -144,6 +144,7 @@ function ElevenLabsConfig({ engine, onUpdate, mode }: {
|
||||
engine={engine}
|
||||
voiceId={voice.ttsVoiceId ?? null}
|
||||
onVoiceChange={handleVoiceChange}
|
||||
autoPreview
|
||||
/>
|
||||
</FormControl>
|
||||
|
||||
|
||||
@@ -3,7 +3,8 @@ import { useQuery } from '@tanstack/react-query';
|
||||
|
||||
import { CircularProgress, Option, Select } from '@mui/joy';
|
||||
import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
|
||||
import RecordVoiceOverTwoToneIcon from '@mui/icons-material/RecordVoiceOverTwoTone';
|
||||
|
||||
import { AudioPlayer } from '~/common/util/audio/AudioPlayer';
|
||||
|
||||
import type { DSpeexEngineAny, SpeexListVoiceOption } from '../speex.types';
|
||||
import { speexListVoices_RPC } from '../protocols/rpc/rpc.client';
|
||||
@@ -24,11 +25,24 @@ export function SpeexVoiceDropdown(props: {
|
||||
// external state - module
|
||||
const { voices, isLoading, error } = useSpeexVoices(engine);
|
||||
|
||||
// track user-initiated voice changes for preview (not initial load or voice list changes)
|
||||
const [userSelectedVoiceId, setUserSelectedVoiceId] = React.useState<string | null>(null);
|
||||
|
||||
|
||||
// [effect] auto-preview: play voice sample only when user explicitly selects a voice
|
||||
const selectedVoice = userSelectedVoiceId ? voices.find(v => v.id === userSelectedVoiceId) : null;
|
||||
const previewUrl = (autoPreview && selectedVoice?.previewUrl) || null;
|
||||
React.useEffect(() => {
|
||||
if (previewUrl)
|
||||
void AudioPlayer.playUrl(previewUrl);
|
||||
}, [previewUrl]);
|
||||
|
||||
|
||||
// handlers
|
||||
|
||||
const handleVoiceChange = React.useCallback((_event: unknown, value: string | null) => {
|
||||
if (value) onVoiceChange(value);
|
||||
setUserSelectedVoiceId(value);
|
||||
value && onVoiceChange(value);
|
||||
}, [onVoiceChange]);
|
||||
|
||||
|
||||
@@ -44,7 +58,7 @@ export function SpeexVoiceDropdown(props: {
|
||||
: voices.length === 0 ? 'No voices available'
|
||||
: 'Select a voice'
|
||||
}
|
||||
startDecorator={<RecordVoiceOverTwoToneIcon />}
|
||||
// startDecorator={<RecordVoiceOverTwoToneIcon />}
|
||||
endDecorator={isLoading && <CircularProgress size='sm' />}
|
||||
indicator={<KeyboardArrowDownIcon />}
|
||||
slotProps={{
|
||||
@@ -53,7 +67,7 @@ export function SpeexVoiceDropdown(props: {
|
||||
}}
|
||||
>
|
||||
{voices.map(voice => (
|
||||
<Option key={voice.id} value={voice.id}>
|
||||
<Option key={voice.id} value={voice.id} label={voice.name}>
|
||||
{voice.name}
|
||||
{voice.description && <span style={{ opacity: 0.6, marginLeft: 8 }}>({voice.description})</span>}
|
||||
</Option>
|
||||
|
||||
@@ -6,12 +6,14 @@
|
||||
*/
|
||||
|
||||
import { apiAsync, apiStream } from '~/common/util/trpc.client';
|
||||
import { convert_Base64_To_UInt8Array, convert_UInt8Array_To_Base64 } from '~/common/util/blobUtils';
|
||||
import { findModelsServiceOrNull } from '~/common/stores/llms/store-llms';
|
||||
|
||||
import type { DLocalAIServiceSettings } from '~/modules/llms/vendors/localai/localai.vendor';
|
||||
import type { DOpenAIServiceSettings } from '~/modules/llms/vendors/openai/openai.vendor';
|
||||
|
||||
import { AudioLivePlayer } from '~/common/util/audio/AudioLivePlayer';
|
||||
import { AudioPlayer } from '~/common/util/audio/AudioPlayer';
|
||||
|
||||
import type { DSpeexEngine, SpeexSpeakResult } from '../../speex.types';
|
||||
import type { SpeexWire_Access, SpeexWire_ListVoices_Output, SpeexWire_Voice } from './rpc.wiretypes';
|
||||
@@ -28,9 +30,10 @@ export async function speexSynthesize_RPC(
|
||||
text: string,
|
||||
options: {
|
||||
streaming: boolean;
|
||||
languageCode?: string;
|
||||
priority?: 'fast' | 'balanced' | 'quality';
|
||||
playback: boolean;
|
||||
returnAudio: boolean;
|
||||
languageCode?: string
|
||||
},
|
||||
callbacks?: {
|
||||
onStart?: () => void;
|
||||
@@ -63,7 +66,7 @@ export async function speexSynthesize_RPC(
|
||||
|
||||
// call the streaming RPC - whether the backend will stream in chunks or as a whole
|
||||
const particleStream = await apiStream.speex.synthesize.mutate(
|
||||
{ access, text, voice, streaming: options.streaming, languageCode: options.languageCode },
|
||||
{ access, text, voice, streaming: options.streaming, languageCode: options.languageCode, priority: options.priority },
|
||||
{ signal: abortController.signal },
|
||||
);
|
||||
|
||||
@@ -78,12 +81,16 @@ export async function speexSynthesize_RPC(
|
||||
|
||||
case 'audio':
|
||||
// Decode base64 to ArrayBuffer
|
||||
// const audioBuffer = convert_Base64_To_UInt8Array(particle.base64, 'speexSynthesize_RPC audio chunk'); // preload conversion
|
||||
const audioBuffer = _base64ToArrayBuffer(particle.base64);
|
||||
const audioBuffer = convert_Base64_To_UInt8Array(particle.base64, 'speex.rpc.client').buffer;
|
||||
|
||||
// Playback
|
||||
if (options.playback)
|
||||
audioPlayer?.enqueueChunk(audioBuffer);
|
||||
// Playback: streaming uses AudioLivePlayer for chunked playback,
|
||||
// non-streaming uses AudioPlayer for single-buffer playback
|
||||
if (options.playback) {
|
||||
if (particle.chunk)
|
||||
audioPlayer?.enqueueChunk(audioBuffer);
|
||||
else
|
||||
void AudioPlayer.playBuffer(audioBuffer); // fire-and-forget for whole audio
|
||||
}
|
||||
|
||||
// Accumulate for return
|
||||
if (options.returnAudio)
|
||||
@@ -93,6 +100,10 @@ export async function speexSynthesize_RPC(
|
||||
callbacks?.onChunk?.(audioBuffer);
|
||||
break;
|
||||
|
||||
case 'log':
|
||||
console.log(`[Speex] (${particle.level})`, particle.message);
|
||||
break;
|
||||
|
||||
case 'done':
|
||||
audioPlayer?.endPlayback();
|
||||
break;
|
||||
@@ -117,7 +128,7 @@ export async function speexSynthesize_RPC(
|
||||
combined.set(new Uint8Array(chunk), offset);
|
||||
offset += chunk.byteLength;
|
||||
}
|
||||
result.audioBase64 = _arrayBufferToBase64(combined.buffer);
|
||||
result.audioBase64 = convert_UInt8Array_To_Base64(combined, 'speex.rpc.client');
|
||||
}
|
||||
|
||||
return result;
|
||||
@@ -212,25 +223,3 @@ function _buildRPCWireAccess({ credentials: c, vendorType }: _DSpeexEngineRPC):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Private: Helpers
|
||||
|
||||
// TODO: use `blobUtils.ts` functions instead?
|
||||
|
||||
function _base64ToArrayBuffer(base64: string): ArrayBuffer {
|
||||
const binaryString = atob(base64);
|
||||
const bytes = new Uint8Array(binaryString.length);
|
||||
for (let i = 0; i < binaryString.length; i++) {
|
||||
bytes[i] = binaryString.charCodeAt(i);
|
||||
}
|
||||
return bytes.buffer;
|
||||
}
|
||||
|
||||
function _arrayBufferToBase64(buffer: ArrayBuffer): string {
|
||||
const bytes = new Uint8Array(buffer);
|
||||
let binary = '';
|
||||
for (let i = 0; i < bytes.byteLength; i++) {
|
||||
binary += String.fromCharCode(bytes[i]);
|
||||
}
|
||||
return btoa(binary);
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@ interface SynthesizeBackendFnParams<TSpeexAccess extends SpeexWire_Access> {
|
||||
voice: SpeexWire_Voice;
|
||||
streaming: boolean;
|
||||
languageCode?: string;
|
||||
priority?: 'fast' | 'balanced' | 'quality';
|
||||
signal?: AbortSignal;
|
||||
}
|
||||
|
||||
@@ -26,17 +27,18 @@ export const speexRouter = createTRPCRouter({
|
||||
synthesize: edgeProcedure
|
||||
.input(SpeexWire.Synthesize_input_schema)
|
||||
.mutation(async function* ({ input, ctx }): AsyncGenerator<SpeexSpeechParticle> {
|
||||
const { access, text, voice, streaming, languageCode } = input;
|
||||
const { access, text, voice, streaming, languageCode, priority } = input;
|
||||
|
||||
try {
|
||||
yield { t: 'start' };
|
||||
switch (access.dialect) {
|
||||
case 'elevenlabs':
|
||||
yield* synthesizeElevenLabs({ access, text, voice, streaming, languageCode, signal: ctx.reqSignal });
|
||||
yield* synthesizeElevenLabs({ access, text, voice, streaming, languageCode, priority, signal: ctx.reqSignal });
|
||||
break;
|
||||
|
||||
case 'localai':
|
||||
case 'openai':
|
||||
yield* synthesizeOpenAIProtocol({ access, text, voice, streaming, languageCode, signal: ctx.reqSignal });
|
||||
yield* synthesizeOpenAIProtocol({ access, text, voice, streaming, languageCode, priority, signal: ctx.reqSignal });
|
||||
break;
|
||||
|
||||
default:
|
||||
|
||||
@@ -8,6 +8,7 @@ export type SpeexSpeechParticle =
|
||||
| { t: 'start' }
|
||||
| { t: 'audio'; base64: string; chunk: boolean; contentType?: string; characterCost?: number; ttsLatencyMs?: number }
|
||||
| { t: 'done'; durationMs?: number; chars?: number }
|
||||
| { t: 'log'; level: 'info', message: string }
|
||||
| { t: 'error'; e: string }
|
||||
;
|
||||
|
||||
@@ -86,6 +87,7 @@ export namespace SpeexWire {
|
||||
voice: SpeexWire.Voice_schema,
|
||||
streaming: z.boolean().default(true),
|
||||
languageCode: z.string().optional(), // ISO language code (e.g., 'en', 'fr') for model selection fallback
|
||||
priority: z.enum(['fast', 'balanced', 'quality']).optional(), // Hint for speed vs quality tradeoff
|
||||
});
|
||||
|
||||
|
||||
@@ -96,7 +98,12 @@ export namespace SpeexWire {
|
||||
name: z.string(),
|
||||
description: z.string().optional(),
|
||||
previewUrl: z.string().optional(),
|
||||
category: z.string().optional(),
|
||||
category: z.string().optional(), // e.g., 'premade', 'cloned', 'professional'
|
||||
// Voice labels (flattened for simplicity)
|
||||
// gender: z.string().optional(), // e.g., 'male', 'female', 'neutral'
|
||||
// accent: z.string().optional(), // e.g., 'american', 'british', 'australian'
|
||||
// age: z.string().optional(), // e.g., 'young', 'middle_aged', 'old'
|
||||
// language: z.string().optional(), // e.g., 'en', 'es', 'multilingual'
|
||||
});
|
||||
|
||||
export const ListVoices_input_schema = z.object({
|
||||
|
||||
@@ -10,35 +10,37 @@ import { returnAudioWholeOrThrow, streamAudioChunksOrThrow } from './rpc.streami
|
||||
// configuration
|
||||
const SAFETY_TEXT_LENGTH = 1000;
|
||||
const MIN_CHUNK_SIZE = 4096;
|
||||
const MODEL_FAST = 'eleven_turbo_v2_5'; // Fastest, English-optimized
|
||||
const MODEL_QUALITY = 'eleven_multilingual_v2'; // Highest quality, multilingual
|
||||
const DEFAULT_VOICE_ID = '21m00Tcm4TlvDq8ikWAM'; // Rachel
|
||||
const DEFAULT_MODEL_ENGLISH = 'eleven_turbo_v2_5';
|
||||
const DEFAULT_MODEL_MULTILINGUAL = 'eleven_multilingual_v2';
|
||||
|
||||
|
||||
const _selectModelForLanguage = (languageCode: string | undefined): string =>
|
||||
languageCode?.toLowerCase() === 'en' ? DEFAULT_MODEL_ENGLISH : DEFAULT_MODEL_MULTILINGUAL;
|
||||
const _selectModel = (priority: 'fast' | 'balanced' | 'quality' | undefined, languageCode: string | undefined): string => {
|
||||
return priority === 'fast' ? MODEL_FAST // lowest latency, best for real-time use cases like calls
|
||||
: priority === 'quality' ? MODEL_QUALITY // multilingual v2 (highest quality)
|
||||
: languageCode?.toLowerCase() === 'en' ? MODEL_FAST : MODEL_QUALITY; // 'balanced'/undefined: English → turbo, non-English → multilingual
|
||||
};
|
||||
|
||||
|
||||
export const synthesizeElevenLabs: SynthesizeBackendFn<SpeexWire_Access_ElevenLabs> = async function* (params) {
|
||||
|
||||
// destructure and validate
|
||||
const { access, text: inputText, voice, streaming, languageCode, signal } = params;
|
||||
const { access, text: inputText, voice, streaming, languageCode, priority, signal } = params;
|
||||
if (access.dialect !== 'elevenlabs' || voice.dialect !== 'elevenlabs')
|
||||
throw new Error('Mismatched dialect in ElevenLabs synthesize');
|
||||
|
||||
|
||||
// safety check: trim text that's too long
|
||||
let text = inputText;
|
||||
if (text.length > SAFETY_TEXT_LENGTH)
|
||||
if (text.length > SAFETY_TEXT_LENGTH) {
|
||||
text = text.slice(0, SAFETY_TEXT_LENGTH);
|
||||
|
||||
// -> log.info
|
||||
yield { t: 'log', level: 'info', message: `Text truncated to ${SAFETY_TEXT_LENGTH} characters` };
|
||||
}
|
||||
|
||||
// build request - narrow to elevenlabs dialect for type safety
|
||||
const voiceId = (voice.dialect === 'elevenlabs' ? voice.ttsVoiceId : undefined) || DEFAULT_VOICE_ID;
|
||||
|
||||
// Model selection: use explicit model if provided, otherwise auto-select based on language
|
||||
const explicitModel = voice.dialect === 'elevenlabs' ? voice.ttsModel : undefined;
|
||||
const model = explicitModel || _selectModelForLanguage(languageCode);
|
||||
const voiceId = voice.ttsVoiceId || env.ELEVENLABS_VOICE_ID || DEFAULT_VOICE_ID;
|
||||
const model = voice.ttsModel || _selectModel(priority, languageCode);
|
||||
|
||||
const path = `/v1/text-to-speech/${voiceId}${streaming ? '/stream' : ''}`;
|
||||
const { headers, url } = _elevenlabsAccess(access, path);
|
||||
@@ -46,7 +48,7 @@ export const synthesizeElevenLabs: SynthesizeBackendFn<SpeexWire_Access_ElevenLa
|
||||
const body: ElevenLabsWire.TTS_Request = {
|
||||
text,
|
||||
model_id: model,
|
||||
};
|
||||
} as const;
|
||||
|
||||
// Fetch
|
||||
let response: Response;
|
||||
@@ -101,6 +103,11 @@ export async function listVoicesElevenLabs(access: SpeexWire_Access_ElevenLabs):
|
||||
description: voice.description || undefined,
|
||||
previewUrl: voice.preview_url || undefined,
|
||||
category: voice.category,
|
||||
// Flatten labels for UI display
|
||||
// gender: voice.labels?.gender || undefined,
|
||||
// accent: voice.labels?.accent || undefined,
|
||||
// age: voice.labels?.age || undefined,
|
||||
// language: voice.labels?.language || undefined,
|
||||
})),
|
||||
};
|
||||
}
|
||||
@@ -142,6 +149,30 @@ function _elevenlabsAccess(access: SpeexWire_Access_ElevenLabs, apiPath: string)
|
||||
|
||||
namespace ElevenLabsWire {
|
||||
|
||||
// export type VoicesList = z.infer<typeof VoicesList_schema>;
|
||||
export const VoicesList_schema = z.object({
|
||||
voices: z.array(z.object({
|
||||
voice_id: z.string(),
|
||||
name: z.string(),
|
||||
category: z.enum(['premade', 'cloned', 'professional']).or(z.string()),
|
||||
labels: z.looseObject({
|
||||
gender: z.enum(['male', 'female', 'neutral']).or(z.string()).nullish(),
|
||||
accent: z.string().nullish(),
|
||||
age: z.string().nullish(),
|
||||
language: z.string().nullish(),
|
||||
}),
|
||||
description: z.string().nullish(),
|
||||
preview_url: z.string().nullish(),
|
||||
settings: z.object({
|
||||
stability: z.number(),
|
||||
similarity_boost: z.number(),
|
||||
}).nullish(),
|
||||
// high_quality_base_model_ids: z.array(z.string()).nullish(),
|
||||
is_owner: z.boolean().nullish(),
|
||||
is_legacy: z.boolean().nullish(),
|
||||
})),
|
||||
});
|
||||
|
||||
export type TTS_Request = z.infer<typeof TTS_Request_schema>;
|
||||
export const TTS_Request_schema = z.object({
|
||||
text: z.string(),
|
||||
@@ -152,20 +183,4 @@ namespace ElevenLabsWire {
|
||||
}).optional(),
|
||||
});
|
||||
|
||||
// export type VoicesList = z.infer<typeof VoicesList_schema>;
|
||||
export const VoicesList_schema = z.object({
|
||||
voices: z.array(z.object({
|
||||
voice_id: z.string(),
|
||||
name: z.string(),
|
||||
category: z.string(),
|
||||
labels: z.record(z.string(), z.string()),
|
||||
description: z.string(),
|
||||
preview_url: z.string(),
|
||||
settings: z.object({
|
||||
stability: z.number(),
|
||||
similarity_boost: z.number(),
|
||||
}),
|
||||
})),
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
@@ -59,7 +59,7 @@ export const synthesizeOpenAIProtocol: SynthesizeBackendFn<SpeexWire_Access_Open
|
||||
const headers: HeadersInit = {
|
||||
'Content-Type': 'application/json',
|
||||
...(!apiKey ? {} : { 'Authorization': `Bearer ${apiKey}` }),
|
||||
...(!access.orgId ? {} : { 'OpenAI-Organization': access.orgId }),
|
||||
...(!access.apiOrgId ? {} : { 'OpenAI-Organization': access.apiOrgId }),
|
||||
};
|
||||
|
||||
// request.body
|
||||
|
||||
@@ -32,9 +32,10 @@ type _Speak_Callbacks = {
|
||||
export async function speakText(inputText: string, voiceSelector: _Speak_VoiceSelector, options?: SpeexSpeakOptions, callbacks?: _Speak_Callbacks): Promise<SpeexSpeakResult> {
|
||||
|
||||
const streaming = options?.streaming ?? true;
|
||||
const languageCode = options?.languageCode ?? _getUIPreferenceLanguageCode();
|
||||
const priority = options?.priority;
|
||||
const playback = options?.playback ?? true;
|
||||
const returnAudio = options?.returnAudio ?? !streaming;
|
||||
const languageCode = options?.languageCode ?? _getUIPreferenceLanguageCode();
|
||||
|
||||
// resolve engine from voice selector
|
||||
const engine = _engineFromSelector(voiceSelector);
|
||||
@@ -50,7 +51,7 @@ export async function speakText(inputText: string, voiceSelector: _Speak_VoiceSe
|
||||
case 'elevenlabs':
|
||||
case 'openai':
|
||||
case 'localai':
|
||||
return speexSynthesize_RPC(effectiveEngine, inputText, { streaming, playback, returnAudio, languageCode }, callbacks);
|
||||
return speexSynthesize_RPC(effectiveEngine, inputText, { streaming, playback, returnAudio, languageCode, priority }, callbacks);
|
||||
|
||||
// Web Speech: client-only, no RPC
|
||||
case 'webspeech':
|
||||
|
||||
@@ -108,10 +108,12 @@ export type SpeexListVoiceOption = SpeexWire_VoiceOption;
|
||||
export type SpeexSpeakOptions = {
|
||||
label?: string; // For NorthBridge queue display
|
||||
personaUid?: string; // For NorthBridge queue icon / controls (if the audio came from a persona)
|
||||
// core options
|
||||
streaming?: boolean; // Streaming defaults to True
|
||||
languageCode?: string; // ISO language code (e.g., 'en', 'fr') - auto-detected from preferredLanguage if not provided
|
||||
priority?: 'fast' | 'balanced' | 'quality'; // Hint for speed vs quality tradeoff: 'fast' = low latency (turbo models), 'quality' = highest quality
|
||||
playback?: boolean; // Play audio (default: true)
|
||||
returnAudio?: boolean; // Accumulate full audio buffer in result, even if streaming (for save/download)
|
||||
languageCode?: string; // ISO language code (e.g., 'en', 'fr') - auto-detected from preferredLanguage if not provided
|
||||
}
|
||||
|
||||
export type SpeexSpeakResult = {
|
||||
|
||||
@@ -225,7 +225,11 @@ export const useSpeexStore = create<SpeexStore>()(persist(
|
||||
isAutoDetected: true,
|
||||
isAutoLinked: false,
|
||||
credentials: { type: 'api-key', apiKey: apiKey.trim() },
|
||||
voice: { dialect: 'elevenlabs', ttsModel: 'eleven_multilingual_v2', ttsVoiceId: voiceId || undefined },
|
||||
voice: {
|
||||
dialect: 'elevenlabs',
|
||||
ttsModel: 'eleven_multilingual_v2',
|
||||
...((typeof voiceId === 'string' && voiceId.trim()) ? { ttsVoiceId: voiceId.trim() } : {}),
|
||||
},
|
||||
});
|
||||
console.log('[DEV] Speex: Migrated legacy ElevenLabs configuration');
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user