diff --git a/pages/api/elevenlabs/speech.ts b/pages/api/elevenlabs/speech.ts index f2105eb8b..18b847698 100644 --- a/pages/api/elevenlabs/speech.ts +++ b/pages/api/elevenlabs/speech.ts @@ -56,9 +56,13 @@ export async function postToElevenLabs(apiKey: string, api export default async function handler(req: NextRequest) { try { - const { apiKey = '', text, voiceId: userVoiceId } = (await req.json()) as ElevenLabs.API.TextToSpeech.RequestBody; + const { apiKey = '', text, voiceId: userVoiceId, nonEnglish } = (await req.json()) as ElevenLabs.API.TextToSpeech.RequestBody; const voiceId = userVoiceId || process.env.ELEVENLABS_VOICE_ID || '21m00Tcm4TlvDq8ikWAM'; - const response = await postToElevenLabs(apiKey, `/v1/text-to-speech/${voiceId}`, { text }); + const requestPayload: ElevenLabs.Wire.TextToSpeech.Request = { + text: text, + ...(nonEnglish ? { model_id: 'eleven_multilingual_v1' } : {}), + }; + const response = await postToElevenLabs(apiKey, `/v1/text-to-speech/${voiceId}`, requestPayload); const audioBuffer: ElevenLabs.API.TextToSpeech.Response = await response.arrayBuffer(); return new NextResponse(audioBuffer, { status: 200, headers: { 'Content-Type': 'audio/mpeg' } }); } catch (error) { diff --git a/src/apps/settings/UISettings.tsx b/src/apps/settings/UISettings.tsx index 9262e70b2..9fe20f5a5 100644 --- a/src/apps/settings/UISettings.tsx +++ b/src/apps/settings/UISettings.tsx @@ -127,7 +127,7 @@ export function UISettings() { - + Language diff --git a/src/modules/elevenlabs/ElevenlabsSettings.tsx b/src/modules/elevenlabs/ElevenlabsSettings.tsx index 1eaaa672c..b57987a52 100644 --- a/src/modules/elevenlabs/ElevenlabsSettings.tsx +++ b/src/modules/elevenlabs/ElevenlabsSettings.tsx @@ -1,6 +1,5 @@ import * as React from 'react'; import { shallow } from 'zustand/shallow'; -import { useQuery } from '@tanstack/react-query'; import { Box, CircularProgress, FormControl, FormHelperText, FormLabel, IconButton, Input, Option, Radio, RadioGroup, Select, Stack } from '@mui/joy'; import KeyIcon from '@mui/icons-material/Key'; @@ -13,8 +12,7 @@ import { Section } from '@/common/components/Section'; import { settingsGap } from '@/common/theme'; import { useSettingsStore } from '@/common/state/store-settings'; -import { ElevenLabs } from './elevenlabs.types'; -import { isValidElevenLabsApiKey, requireUserKeyElevenLabs } from './elevenlabs.client'; +import { isValidElevenLabsApiKey, requireUserKeyElevenLabs, useElevenLabsVoices } from './elevenlabs.client'; export function ElevenlabsSettings() { @@ -31,16 +29,7 @@ export function ElevenlabsSettings() { const requiresKey = requireUserKeyElevenLabs; const isValidKey = apiKey ? isValidElevenLabsApiKey(apiKey) : !requiresKey; - // load voices, if the server has a key, or the user provided one - const { data: voicesData, isLoading: loadingVoices } = useQuery(['voices', apiKey], { - enabled: isValidKey, - queryFn: () => fetch('/api/elevenlabs/voices', { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ ...(apiKey ? { apiKey: apiKey } : {}) }), - }).then(res => res.json() as Promise), - staleTime: 1000 * 60 * 5, // 5 minutes - }); + const { voicesData, loadingVoices } = useElevenLabsVoices(apiKey, isValidKey); const handleToggleApiKeyVisibility = () => setShowApiKeyValue(!showApiKeyValue); diff --git a/src/modules/elevenlabs/elevenlabs.client.ts b/src/modules/elevenlabs/elevenlabs.client.ts index 661a6b01f..987cc10b4 100644 --- a/src/modules/elevenlabs/elevenlabs.client.ts +++ b/src/modules/elevenlabs/elevenlabs.client.ts @@ -1,4 +1,5 @@ import { ElevenLabs } from './elevenlabs.types'; +import { useQuery } from '@tanstack/react-query'; import { useSettingsStore } from '@/common/state/store-settings'; @@ -10,11 +11,11 @@ export const isValidElevenLabsApiKey = (apiKey?: string) => !!apiKey && apiKey.t export async function speakText(text: string) { if (!(text?.trim())) return; - const { elevenLabsApiKey, elevenLabsVoiceId } = useSettingsStore.getState(); - + const { elevenLabsApiKey, elevenLabsVoiceId, preferredLanguage } = useSettingsStore.getState(); try { // NOTE: hardcoded 1000 as a failsafe, since the API will take very long and consume lots of credits for longer texts - const audioBuffer = await callElevenlabsSpeech(text.slice(0, 1000), elevenLabsApiKey, elevenLabsVoiceId); + const nonEnglish = !(preferredLanguage.toLowerCase().startsWith('en')); + const audioBuffer = await callElevenlabsSpeech(text.slice(0, 1000), elevenLabsApiKey, elevenLabsVoiceId, nonEnglish); const audioContext = new AudioContext(); const bufferSource = audioContext.createBufferSource(); bufferSource.buffer = await audioContext.decodeAudioData(audioBuffer); @@ -26,11 +27,12 @@ export async function speakText(text: string) { } -async function callElevenlabsSpeech(text: string, elevenLabsApiKey: string, elevenLabsVoiceId: string): Promise { +async function callElevenlabsSpeech(text: string, elevenLabsApiKey: string, elevenLabsVoiceId: string, nonEnglish: boolean): Promise { const payload: ElevenLabs.API.TextToSpeech.RequestBody = { apiKey: elevenLabsApiKey, text, voiceId: elevenLabsVoiceId, + nonEnglish, }; const response = await fetch('/api/elevenlabs/speech', { @@ -46,3 +48,17 @@ async function callElevenlabsSpeech(text: string, elevenLabsApiKey: string, elev return await response.arrayBuffer(); } + + +export function useElevenLabsVoices(apiKey: string, isEnabled: boolean) { + const { data: voicesData, isLoading: loadingVoices } = useQuery(['elevenlabs-voices', apiKey], { + enabled: isEnabled, + queryFn: () => fetch('/api/elevenlabs/voices', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ ...(apiKey ? { apiKey } : {}) }), + }).then(res => res.json() as Promise), + staleTime: 1000 * 60 * 5, // 5 minutes + }); + return { voicesData, loadingVoices }; +} diff --git a/src/modules/elevenlabs/elevenlabs.types.ts b/src/modules/elevenlabs/elevenlabs.types.ts index 38302d7fd..b85535003 100644 --- a/src/modules/elevenlabs/elevenlabs.types.ts +++ b/src/modules/elevenlabs/elevenlabs.types.ts @@ -8,6 +8,7 @@ export namespace ElevenLabs { apiKey?: string; text: string; voiceId?: string; + nonEnglish: boolean; } export type Response = ArrayBuffer; @@ -36,6 +37,7 @@ export namespace ElevenLabs { export namespace TextToSpeech { export interface Request { text: string; + model_id?: 'eleven_monolingual_v1' | string; voice_settings?: { stability: number; similarity_boost: number;