From 9537ce59e8a26bc346b41f2b744fcf429f4db77b Mon Sep 17 00:00:00 2001
From: Enrico Ros <enrico.ros@gmail.com>
Date: Mon, 20 Apr 2026 13:18:55 -0700
Subject: [PATCH] LLM: cap initial max response to 128k

---
 src/modules/llms/llm.client.ts | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/modules/llms/llm.client.ts b/src/modules/llms/llm.client.ts
index 943a0d274..da71e4be1 100644
--- a/src/modules/llms/llm.client.ts
+++ b/src/modules/llms/llm.client.ts
@@ -18,6 +18,12 @@ import { findServiceAccessOrThrow } from './vendors/vendor.helpers';
  */
 export const LLMS_VARIANT_SEPARATOR = '::' as const;
 
+// Cap for the *initial* llmResponseTokens default, to avoid runaway defaults on huge-context models.
+// The model's maxOutputTokens is unchanged (the vendor-reported cap stays authoritative); users can still
+// raise llmResponseTokens via the slider up to maxOutputTokens. On reset, this capped initial value is used.
+const _INITIAL_RESPONSE_TOKENS_CAP = 128_000;
+
+
 function _clientIdWithVariant(id: string, idVariant?: string): string {
   return !idVariant ? id
     : idVariant.startsWith(LLMS_VARIANT_SEPARATOR) ? `${id}${idVariant}`
@@ -86,8 +92,9 @@ function _createDLLMFromModelDescription(d: ModelDescriptionSchema, service: DMo
   const contextTokens = d.contextWindow || null;
   const maxOutputTokens = d.maxCompletionTokens || (contextTokens ? Math.round(contextTokens / 2) : null); // fallback to half context window
 
-  // initial (user overridable) response tokens setting: equal to the max, if the max is given, or to 1/8th of the context window (when max is set to 1/2 of context)
-  const llmResponseTokens = !maxOutputTokens ? null : !d.maxCompletionTokens ? Math.round(maxOutputTokens / 4) : d.maxCompletionTokens;
+  // initial (user overridable) response tokens setting: equal to the max, if the max is given (0/missing count as not given, same as for maxOutputTokens), or to 1/8th of the context window (when max is set to 1/2 of context); clamped to cap
+  const llmResponseTokens = !maxOutputTokens ? null
+    : Math.min(d.maxCompletionTokens || Math.round(maxOutputTokens / 4), _INITIAL_RESPONSE_TOKENS_CAP);
 
 
   // DLLM is a fundamental type in our application