fix(speex): preprocess text to remove code blocks and clean URLs for TTS

- Add preprocessTextForSpeech() function to filter out:
  - Code blocks (mermaid, sql, csv, etc.)
  - HTML/SVG blocks
  - Attachment references
  - HTTP/HTTPS prefixes from URLs
- Add preprocessText option to SpeexSpeakOptions (default: true)
- All existing TTS calls automatically use preprocessing

Fixes #741

Co-authored-by: Enrico Ros <enricoros@users.noreply.github.com>
2026-05-10 21:50:14 -07:00 · 2025-11-30 23:22:42 +00:00
2 changed files with 40 additions and 2 deletions
@@ -15,6 +15,39 @@ import { speexSynthesize_RPC } from './protocols/rpc/rpc.client';
 import { speexSynthesize_WebSpeech } from './protocols/webspeech/webspeech.client';


+// Text Preprocessing for Speech
+
+/**
+ * Strips content that should not be read aloud from `text` before TTS synthesis.
+ *
+ * Steps, applied in this order:
+ * 1. Remove fenced code blocks: everything from a triple backtick up to the next
+ *    triple backtick (non-greedy), regardless of the language tag (mermaid, sql, csv, ...).
+ * 2. Remove `<!DOCTYPE html>` ... `</html>` spans (case-insensitive) and
+ *    `<svg` ... `</svg>` spans (case-sensitive) that appear outside of fences.
+ * 3. Remove markdown image syntax `![alt](target)` when it sits on a single line.
+ * 4. Drop the "http://" and "https://" scheme prefixes, keeping the rest of each URL.
+ * 5. Collapse runs of 3+ newlines to two newlines, then runs of 3+ whitespace
+ *    characters to a single space.
+ * 6. Trim leading and trailing whitespace.
+ *
+ * NOTE(review): an opening fence without a matching closing fence is not matched by
+ * step 1, so its content stays in the output - confirm whether partially streamed
+ * text can reach this function.
+ * NOTE(review): `\s` in step 5 also matches newlines, so a paragraph break with
+ * adjacent spaces (3+ whitespace characters in total) is flattened to one space -
+ * confirm this is intended.
+ *
+ * @param text - Raw text that is about to be spoken.
+ * @returns The cleaned text; an empty string when nothing speakable remains.
+ */
+function preprocessTextForSpeech(text: string): string {
+  let processed = text;
+
+  // Remove fenced code blocks (mermaid, sql, csv, etc.), including the fence lines and language tag
+  // Non-greedy match from a triple backtick to the next triple backtick; an unclosed fence is left as-is
+  processed = processed.replace(/```[\s\S]*?```/g, '');
+
+  // Remove standalone HTML/SVG blocks that are not inside triple backticks (the svg pattern is case-sensitive)
+  processed = processed.replace(/<!DOCTYPE html>[\s\S]*?<\/html>/gi, '');
+  processed = processed.replace(/<svg[\s\S]*?<\/svg>/g, '');
+
+  // Remove markdown image syntax ![alt](target); plain [text](url) links are not matched by this pattern
+  processed = processed.replace(/!\[.*?\]\(.*?\)/g, '');
+
+  // Clean URLs - remove http:// and https:// prefixes for more natural speech
+  processed = processed.replace(/https?:\/\//gi, '');
+
+  // Collapse 3+ newlines to two, then any run of 3+ whitespace characters (newlines included) to one space
+  processed = processed.replace(/\n{3,}/g, '\n\n');
+  processed = processed.replace(/\s{3,}/g, ' ');
+
+  return processed.trim();
+}
+
+
 // Speech Synthesis API

 export async function speakText(
@@ -34,6 +67,10 @@ export async function speakText(
  const priority = options?.priority;
  const playback = options?.playback ?? true;
  const returnAudio = options?.returnAudio ?? false;
+  const preprocessText = options?.preprocessText ?? true;
+
+  // Preprocess text to remove code blocks, attachments, and clean URLs
+  const processedText = preprocessText ? preprocessTextForSpeech(inputText) : inputText;

  // resolve engine from voice selector
  const engine = _engineFromSelector(voiceSelector);
@@ -49,11 +86,11 @@ export async function speakText(
      case 'elevenlabs':
      case 'openai':
      case 'localai':
-        return speexSynthesize_RPC(effectiveEngine, inputText, { streaming, playback, returnAudio, languageCode, priority }, callbacks);
+        return speexSynthesize_RPC(effectiveEngine, processedText, { streaming, playback, returnAudio, languageCode, priority }, callbacks);

      // Web Speech: client-only, no RPC
      case 'webspeech':
-        return speexSynthesize_WebSpeech(inputText, effectiveEngine.voice as DVoiceWebSpeech, callbacks);
+        return speexSynthesize_WebSpeech(processedText, effectiveEngine.voice as DVoiceWebSpeech, callbacks);
    }
  } catch (error) {
    callbacks?.onError?.(error instanceof Error ? error : new Error(String(error)));
@@ -133,6 +133,7 @@ export type SpeexSpeakOptions = {
  priority?: 'fast' | 'balanced' | 'quality'; // Hint for speed vs quality tradeoff: 'fast' = low latency (turbo models), 'quality' = highest quality
  playback?: boolean;       // Play audio (default: true)
  returnAudio?: boolean;    // Accumulate full audio buffer in result, even if streaming (for save/download)
+  preprocessText?: boolean; // Preprocess text to remove code blocks, clean URLs, etc. (default: true)
 }

 export type SpeexSpeakResult = {