From 4912a032509c44eb37235e31f05976ffbc04470d Mon Sep 17 00:00:00 2001 From: Enrico Ros Date: Tue, 10 Feb 2026 13:22:47 -0800 Subject: [PATCH] LLMs: Anthropic: Fast mode research preview --- src/common/stores/llms/llms.parameters.ts | 9 +++++++++ src/modules/aix/client/aix.client.ts | 3 ++- src/modules/aix/server/api/aix.wiretypes.ts | 1 + .../adapters/anthropic.messageCreate.ts | 4 ++++ .../chatGenerate/chatGenerate.dispatch.ts | 1 + .../dispatch/wiretypes/anthropic.wiretypes.ts | 6 ++++++ .../llms/models-modal/LLMParametersEditor.tsx | 15 +++++++++++++++ .../llms/server/anthropic/anthropic.access.ts | 8 ++++++-- .../llms/server/anthropic/anthropic.models.ts | 5 +++-- src/modules/llms/server/llm.server.types.ts | 1 + 10 files changed, 48 insertions(+), 5 deletions(-) diff --git a/src/common/stores/llms/llms.parameters.ts b/src/common/stores/llms/llms.parameters.ts index 0846ae9d9..9d33a98c8 100644 --- a/src/common/stores/llms/llms.parameters.ts +++ b/src/common/stores/llms/llms.parameters.ts @@ -159,6 +159,15 @@ export const DModelParameterRegistry = { // No initialValue - undefined means high effort (default, equivalent to omitting the parameter) }), + llmVndAntInfSpeed: _enumDef({ + label: 'Fast Mode', + type: 'enum', + description: 'Accelerated inference (~2.5x faster output) at 6x pricing. Preview access required.', + values: ['fast'], + enumPriceMultiplier: { fast: 6 }, + // No initialValue - undefined means standard speed (omitted from request) + }), + llmVndAntSkills: { label: 'Document Skills', type: 'string', diff --git a/src/modules/aix/client/aix.client.ts b/src/modules/aix/client/aix.client.ts index 8898a4869..389dd312f 100644 --- a/src/modules/aix/client/aix.client.ts +++ b/src/modules/aix/client/aix.client.ts @@ -50,7 +50,7 @@ export function aixCreateModelFromLLMOptions( // destructure input with the overrides const { llmRef, llmTemperature, llmResponseTokens, llmTopP, llmForceNoStream, - llmVndAnt1MContext, llmVndAntSkills, llmVndAntThinkingBudget, llmVndAntWebFetch, llmVndAntWebSearch, llmVndAntEffort, llmVndAntEffortMax, + llmVndAnt1MContext, llmVndAntInfSpeed, llmVndAntSkills, llmVndAntThinkingBudget, llmVndAntWebFetch, llmVndAntWebSearch, llmVndAntEffort, llmVndAntEffortMax, llmVndGeminiAspectRatio, llmVndGeminiImageSize, llmVndGeminiCodeExecution, llmVndGeminiComputerUse, llmVndGeminiGoogleSearch, llmVndGeminiMediaResolution, llmVndGeminiShowThoughts, llmVndGeminiThinkingBudget, llmVndGeminiThinkingLevel, llmVndGeminiThinkingLevel4, llmVndMoonReasoningEffort, // -> mapped to vndOaiReasoningEffort below // llmVndMoonshotWebSearch, @@ -108,6 +108,7 @@ export function aixCreateModelFromLLMOptions( ...(llmForceNoStream ? { forceNoStream: true } : {}), ...(llmVndAntThinkingBudget !== undefined ? { vndAntThinkingBudget: llmVndAntThinkingBudget === -1 ? 'adaptive' as const : llmVndAntThinkingBudget } : {}), ...(llmVndAnt1MContext ? { vndAnt1MContext: llmVndAnt1MContext } : {}), + ...(llmVndAntInfSpeed === 'fast' ? { vndAntInfSpeed: 'fast' } : {}), ...(llmVndAntSkills ? { vndAntSkills: llmVndAntSkills } : {}), ...(llmVndAntWebFetch === 'auto' ? { vndAntWebFetch: llmVndAntWebFetch } : {}), ...(llmVndAntWebSearch === 'auto' ? { vndAntWebSearch: llmVndAntWebSearch } : {}), diff --git a/src/modules/aix/server/api/aix.wiretypes.ts b/src/modules/aix/server/api/aix.wiretypes.ts index 89b012fed..e68a8c2c4 100644 --- a/src/modules/aix/server/api/aix.wiretypes.ts +++ b/src/modules/aix/server/api/aix.wiretypes.ts @@ -452,6 +452,7 @@ export namespace AixWire_API { // Anthropic vndAnt1MContext: z.boolean().optional(), vndAntEffort: z.enum(['low', 'medium', 'high', 'max']).optional(), + vndAntInfSpeed: z.enum(['fast']).optional(), vndAntSkills: z.string().optional(), vndAntThinkingBudget: z.number().or(z.literal('adaptive')).nullable().optional(), vndAntToolSearch: z.enum(['regex', 'bm25']).optional(), // Tool Search Tool variant diff --git a/src/modules/aix/server/dispatch/chatGenerate/adapters/anthropic.messageCreate.ts b/src/modules/aix/server/dispatch/chatGenerate/adapters/anthropic.messageCreate.ts index 668efc061..d0caf6ee8 100644 --- a/src/modules/aix/server/dispatch/chatGenerate/adapters/anthropic.messageCreate.ts +++ b/src/modules/aix/server/dispatch/chatGenerate/adapters/anthropic.messageCreate.ts @@ -193,6 +193,10 @@ export function aixToAnthropicMessageCreate(model: AixAPI_Model, _chatGenerate: console.warn('[Anthropic] Structured output_config.format may conflict with web_fetch citations'); } + // [Anthropic, fast-mode-2026-02-01] Fast inference mode (preview/waitlist) + if (model.vndAntInfSpeed === 'fast') + payload.speed = 'fast'; + // --- Tools --- // Allow/deny auto-adding hosted tools when custom tools are present diff --git a/src/modules/aix/server/dispatch/chatGenerate/chatGenerate.dispatch.ts b/src/modules/aix/server/dispatch/chatGenerate/chatGenerate.dispatch.ts index 7e363fe59..e000abe85 100644 --- a/src/modules/aix/server/dispatch/chatGenerate/chatGenerate.dispatch.ts +++ b/src/modules/aix/server/dispatch/chatGenerate/chatGenerate.dispatch.ts @@ -65,6 +65,7 @@ export function createChatGenerateDispatch(access: AixAPI_Access, model: AixAPI_ vndAnt1MContext: model.vndAnt1MContext === true, vndAntEffort: !!model.vndAntEffort, enableSkills: !!model.vndAntSkills, + enableFastMode: model.vndAntInfSpeed === 'fast', enableStrictOutputs: !!model.strictJsonOutput || !!model.strictToolInvocations, // [Anthropic, 2025-11-13] for both JSON output and grammar-constrained tool invocations inputs enableToolSearch: !!model.vndAntToolSearch, enableProgrammaticToolCalling: usesProgrammaticToolCalling, diff --git a/src/modules/aix/server/dispatch/wiretypes/anthropic.wiretypes.ts b/src/modules/aix/server/dispatch/wiretypes/anthropic.wiretypes.ts index b07eb92ea..0ee108e60 100644 --- a/src/modules/aix/server/dispatch/wiretypes/anthropic.wiretypes.ts +++ b/src/modules/aix/server/dispatch/wiretypes/anthropic.wiretypes.ts @@ -896,6 +896,12 @@ export namespace AnthropicWire_API_Message_Create { * */ top_p: z.number().optional(), + /** + * [Anthropic, fast-mode-2026-02-01] Accelerated inference mode. + * Preview/waitlist. Only supported on Claude Opus 4.6. + */ + speed: z.enum(['fast']).optional(), + /** * [Anthropic, 2026-02-01] Geographic region for model inference. * - "global": default, inference may run in any available geography diff --git a/src/modules/llms/models-modal/LLMParametersEditor.tsx b/src/modules/llms/models-modal/LLMParametersEditor.tsx index 65d8b38ea..627eea343 100644 --- a/src/modules/llms/models-modal/LLMParametersEditor.tsx +++ b/src/modules/llms/models-modal/LLMParametersEditor.tsx @@ -250,6 +250,7 @@ export function LLMParametersEditor(props: { llmVndAnt1MContext, llmVndAntEffort, llmVndAntEffortMax, + llmVndAntInfSpeed, llmVndAntSkills, llmVndAntThinkingBudget, llmVndAntWebFetch, @@ -467,6 +468,20 @@ export function LLMParametersEditor(props: { /> )} + {/* Anthropic Fast Mode - currently hidden via parameterSpec.hidden */} + {showParam('llmVndAntInfSpeed') && ( + { + if (!checked) onRemoveParameter('llmVndAntInfSpeed'); + else onChangeParameter({ llmVndAntInfSpeed: 'fast' }); + }} + /> + )} + {isExtra && showParam('llmVndAntSkills') && ( )} diff --git a/src/modules/llms/server/anthropic/anthropic.access.ts b/src/modules/llms/server/anthropic/anthropic.access.ts index 4a115dc6a..1a53657dd 100644 --- a/src/modules/llms/server/anthropic/anthropic.access.ts +++ b/src/modules/llms/server/anthropic/anthropic.access.ts @@ -85,6 +85,7 @@ export type AnthropicHeaderOptions = { vndAntEffort?: boolean; // [Anthropic, effort-2025-11-24] enableSkills?: boolean; enableCodeExecution?: boolean; + enableFastMode?: boolean; // [Anthropic, fast-mode-2026-02-01] enableStrictOutputs?: boolean; // [Anthropic, 2025-11-13] Structured Outputs (JSON outputs & strict tool use) enableToolSearch?: boolean; // [Anthropic, 2025-11-24] Tool Search Tool enableProgrammaticToolCalling?: boolean; // [Anthropic, 2025-11-24] Programmatic Tool Calling (allowed_callers, input_examples) @@ -165,9 +166,12 @@ function _anthropicHeaders(options?: AnthropicHeaderOptions): Record200K: $10/$37.50 (with 1M context enabled) // Cache pricing also tiered: write 1.25× input, read 0.10× input chatPrice: { @@ -359,6 +359,7 @@ export function llmsAntCreatePlaceholderModel(model: AnthropicWire_API_Models_Li const _ORT_ANT_IF_ALLOWLIST: ReadonlySet = new Set([ LLM_IF_OAI_Chat, LLM_IF_OAI_Vision, LLM_IF_OAI_Fn, LLM_IF_OAI_Reasoning, ] as const); +// NOTE: llmVndAntInfSpeed intentionally NOT included - fast mode not available through OpenRouter const _ORT_ANT_PARAM_ALLOWLIST: ReadonlySet = new Set([ 'llmVndAntEffort', 'llmVndAntEffortMax', 'llmVndAntThinkingBudget', diff --git a/src/modules/llms/server/llm.server.types.ts b/src/modules/llms/server/llm.server.types.ts index 995aa7c3b..90fbd85c1 100644 --- a/src/modules/llms/server/llm.server.types.ts +++ b/src/modules/llms/server/llm.server.types.ts @@ -80,6 +80,7 @@ const ModelParameterSpec_schema = z.object({ 'llmVndAnt1MContext', 'llmVndAntEffort', 'llmVndAntEffortMax', + 'llmVndAntInfSpeed', 'llmVndAntSkills', 'llmVndAntThinkingBudget', 'llmVndAntWebFetch',