From b11cac4328ec57b8122d525cee240fbd8d7e7b60 Mon Sep 17 00:00:00 2001 From: Enrico Ros Date: Thu, 29 Jan 2026 22:13:04 -0800 Subject: [PATCH] LLMs: sync CB --- .../llms/server/anthropic/anthropic.models.ts | 25 +++++----- .../llms/server/gemini/gemini.models.ts | 24 ++++----- .../server/openai/models/deepseek.models.ts | 4 +- .../server/openai/models/mistral.models.ts | 12 ++--- .../server/openai/models/moonshot.models.ts | 8 +-- .../server/openai/models/openai.models.ts | 49 ++++++++++--------- .../llms/server/openai/models/xai.models.ts | 16 +++--- 7 files changed, 70 insertions(+), 68 deletions(-) diff --git a/src/modules/llms/server/anthropic/anthropic.models.ts b/src/modules/llms/server/anthropic/anthropic.models.ts index d99ba83f1..dc7952432 100644 --- a/src/modules/llms/server/anthropic/anthropic.models.ts +++ b/src/modules/llms/server/anthropic/anthropic.models.ts @@ -37,6 +37,7 @@ const _hardcodedAnthropicVariants: ModelVariantMap = { description: 'Claude Opus 4.5 with extended thinking mode for complex reasoning and agentic workflows', interfaces: [...IF_4_R, LLM_IF_ANT_ToolsSearch], parameterSpecs: [...ANT_PAR_WEB_THINKING, { paramId: 'llmVndAntEffort' }, { paramId: 'llmVndAntSkills' }], + benchmark: { cbaElo: 1468 }, // claude-opus-4-5-20251101-thinking-32k maxCompletionTokens: 32000, }, @@ -47,7 +48,7 @@ const _hardcodedAnthropicVariants: ModelVariantMap = { maxCompletionTokens: 64000, interfaces: [...IF_4_R, LLM_IF_ANT_ToolsSearch], parameterSpecs: [...ANT_PAR_WEB_THINKING, { paramId: 'llmVndAnt1MContext' }, { paramId: 'llmVndAntSkills' }], - benchmark: { cbaElo: 1451 + 1 }, // FALLBACK-UNTIL-AVAILABLE: claude-opus-4-1-20250805-thinking-16k + 1 + benchmark: { cbaElo: 1450 }, // claude-sonnet-4-5-20250929-thinking-32k }, 'claude-haiku-4-5-20251001': { @@ -67,7 +68,7 @@ const _hardcodedAnthropicVariants: ModelVariantMap = { maxCompletionTokens: 32000, interfaces: IF_4_R, parameterSpecs: ANT_PAR_WEB_THINKING, - benchmark: { cbaElo: 1451 }, // claude-opus-4-1-20250805-thinking-16k + benchmark: { cbaElo: 1448 }, // claude-opus-4-1-20250805-thinking-16k }, // Claude 4 models with thinking variants @@ -79,7 +80,7 @@ const _hardcodedAnthropicVariants: ModelVariantMap = { maxCompletionTokens: 32000, interfaces: IF_4_R, parameterSpecs: ANT_PAR_WEB_THINKING, - benchmark: { cbaElo: 1420 }, // claude-opus-4-20250514-thinking-16k + benchmark: { cbaElo: 1424 }, // claude-opus-4-20250514-thinking-16k }, 'claude-sonnet-4-20250514': { @@ -100,7 +101,7 @@ const _hardcodedAnthropicVariants: ModelVariantMap = { maxCompletionTokens: 64000, interfaces: IF_4_R, parameterSpecs: ANT_PAR_WEB_THINKING, - benchmark: { cbaElo: 1385 }, // claude-3-7-sonnet-20250219-thinking-32k + benchmark: { cbaElo: 1389 }, // claude-3-7-sonnet-20250219-thinking-32k }, } as const; @@ -122,6 +123,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo interfaces: [...IF_4, LLM_IF_ANT_ToolsSearch], parameterSpecs: [...ANT_PAR_WEB, { paramId: 'llmVndAntEffort' }], chatPrice: { input: 5, output: 25, cache: { cType: 'ant-bp', read: 0.50, write: 6.25, duration: 300 } }, + benchmark: { cbaElo: 1466 }, // claude-opus-4-5-20251101 }, { id: 'claude-sonnet-4-5-20250929', // Active @@ -143,7 +145,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo duration: 300, }, }, - benchmark: { cbaElo: 1438 + 1 }, // FALLBACK-UNTIL-AVAILABLE: claude-opus-4-1-20250805 + 1 + benchmark: { cbaElo: 1450 }, // claude-sonnet-4-5-20250929 }, { id: 'claude-haiku-4-5-20251001', // Active @@ -154,6 +156,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo interfaces: IF_4, parameterSpecs: [...ANT_PAR_WEB, { paramId: 'llmVndAntSkills' }], chatPrice: { input: 1, output: 5, cache: { cType: 'ant-bp', read: 0.10, write: 1.25, duration: 300 } }, + benchmark: { cbaElo: 1403 }, // claude-haiku-4-5-20251001 }, // Claude 4.1 models @@ -166,7 +169,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo interfaces: IF_4, parameterSpecs: ANT_PAR_WEB, chatPrice: { input: 15, output: 75, cache: { cType: 'ant-bp', read: 1.50, write: 18.75, duration: 300 } }, - benchmark: { cbaElo: 1438 }, // claude-opus-4-1-20250805 + benchmark: { cbaElo: 1445 }, // claude-opus-4-1-20250805 }, // Claude 4 models @@ -180,7 +183,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo interfaces: IF_4, parameterSpecs: ANT_PAR_WEB, chatPrice: { input: 15, output: 75, cache: { cType: 'ant-bp', read: 1.50, write: 18.75, duration: 300 } }, - benchmark: { cbaElo: 1411 }, // claude-opus-4-20250514 + benchmark: { cbaElo: 1414 }, // claude-opus-4-20250514 }, { id: 'claude-sonnet-4-20250514', // Active @@ -202,7 +205,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo duration: 300, }, }, - benchmark: { cbaElo: 1386 }, // claude-sonnet-4-20250514 + benchmark: { cbaElo: 1390 }, // claude-sonnet-4-20250514 }, // Claude 3.7 models @@ -215,7 +218,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo interfaces: IF_4, parameterSpecs: ANT_PAR_WEB, chatPrice: { input: 3, output: 15, cache: { cType: 'ant-bp', read: 0.30, write: 3.75, duration: 300 } }, - benchmark: { cbaElo: 1369 }, // claude-3-7-sonnet-20250219 + benchmark: { cbaElo: 1372 }, // claude-3-7-sonnet-20250219 hidden: true, // deprecated isLegacy: true, }, @@ -232,7 +235,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo interfaces: IF_4, parameterSpecs: ANT_PAR_WEB, chatPrice: { input: 0.80, output: 4.00, cache: { cType: 'ant-bp', read: 0.08, write: 1.00, duration: 300 } }, - benchmark: { cbaElo: 1319, cbaMmlu: 75.2 }, // claude-3-5-haiku-20241022 + benchmark: { cbaElo: 1324 }, // claude-3-5-haiku-20241022 hidden: true, // deprecated isLegacy: true, }, @@ -248,7 +251,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo maxCompletionTokens: 4096, interfaces: IF_4, chatPrice: { input: 0.25, output: 1.25, cache: { cType: 'ant-bp', read: 0.03, write: 0.30, duration: 300 } }, - benchmark: { cbaElo: 1263, cbaMmlu: 75.1 }, + benchmark: { cbaElo: 1262 }, // claude-3-haiku-20240307 }, // Legacy/Retired models diff --git a/src/modules/llms/server/gemini/gemini.models.ts b/src/modules/llms/server/gemini/gemini.models.ts index 84ff6cc01..88525216a 100644 --- a/src/modules/llms/server/gemini/gemini.models.ts +++ b/src/modules/llms/server/gemini/gemini.models.ts @@ -172,7 +172,7 @@ const _knownGeminiModels: ({ { paramId: 'llmVndGeminiGoogleSearch' }, // { paramId: 'llmVndGeminiComputerUse' }, // we don't have the logic to handle this yet ], - benchmark: { cbaElo: 1490 }, // gemini-3-pro + benchmark: { cbaElo: 1487 }, // gemini-3-pro }, // 3.0 Pro Image Preview - Released November 20, 2025 @@ -221,7 +221,7 @@ const _knownGeminiModels: ({ { paramId: 'llmVndGeminiGoogleSearch' }, // { paramId: 'llmVndGeminiComputerUse' }, // we don't have the logic to handle this yet ], - benchmark: { cbaElo: 1480 }, // gemini-3-flash + benchmark: { cbaElo: 1471 }, // gemini-3-flash }, /// Generation 2.5 @@ -237,7 +237,7 @@ const _knownGeminiModels: ({ { paramId: 'llmVndGeminiThinkingBudget', rangeOverride: [128, 32768] /* does not support 0 which would turn thinking off */ }, { paramId: 'llmVndGeminiGoogleSearch' }, ], - benchmark: { cbaElo: 1451 }, // gemini-2.5-pro + benchmark: { cbaElo: 1450 }, // gemini-2.5-pro }, // REMOVED MODELS (no longer returned by API as of Jan 8, 2026): @@ -289,7 +289,7 @@ const _knownGeminiModels: ({ { paramId: 'llmVndGeminiThinkingBudget' }, { paramId: 'llmVndGeminiGoogleSearch' }, ], - benchmark: { cbaElo: 1406 + 2 }, // gemini-2.5-flash-preview-09-2025 - the +2 is to be on top of the non-preview 2.5-flash (1407) + benchmark: { cbaElo: 1405 }, // gemini-2.5-flash-preview-09-2025 }, // 2.5 Flash { @@ -303,7 +303,7 @@ const _knownGeminiModels: ({ { paramId: 'llmVndGeminiThinkingBudget' }, { paramId: 'llmVndGeminiGoogleSearch' }, ], - benchmark: { cbaElo: 1407 }, // gemini-2.5-flash (updated from CSV) + benchmark: { cbaElo: 1409 }, // gemini-2.5-flash }, // REMOVED MODELS (no longer returned by API as of Nov 20, 2025): @@ -398,7 +398,7 @@ const _knownGeminiModels: ({ { paramId: 'llmVndGeminiThinkingBudget' }, { paramId: 'llmVndGeminiGoogleSearch' }, ], - benchmark: { cbaElo: 1380 }, // gemini-2.5-flash-lite-preview-09-2025 (no-thinking variant) + benchmark: { cbaElo: 1379 }, // gemini-2.5-flash-lite-preview-09-2025-no-thinking }, // 2.5 Flash-Lite - Released July 2025 { @@ -463,7 +463,7 @@ const _knownGeminiModels: ({ chatPrice: gemini20FlashPricing, interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Vision, LLM_IF_OAI_Fn, LLM_IF_OAI_Json, LLM_IF_GEM_CodeExecution], parameterSpecs: [{ paramId: 'llmVndGeminiGoogleSearch' }], - benchmark: { cbaElo: 1360 }, // gemini-2.0-flash-001 + benchmark: { cbaElo: 1361 }, // gemini-2.0-flash-001 }, { id: 'models/gemini-2.0-flash', @@ -473,7 +473,7 @@ const _knownGeminiModels: ({ chatPrice: gemini20FlashPricing, interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Vision, LLM_IF_OAI_Fn, LLM_IF_OAI_Json, LLM_IF_GEM_CodeExecution], parameterSpecs: [{ paramId: 'llmVndGeminiGoogleSearch' }], - benchmark: { cbaElo: 1360 }, // gemini-2.0-flash + benchmark: { cbaElo: 1361 }, // gemini-2.0-flash }, // 2.0 Flash Lite @@ -529,7 +529,7 @@ const _knownGeminiModels: ({ isPreview: true, interfaces: [LLM_IF_OAI_Chat, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0], chatPrice: geminiExpFree, // Free tier only according to pricing page - benchmark: { cbaElo: 1311 }, // Estimating based on comparable models + benchmark: { cbaElo: 1319 }, // gemma-3n-e4b-it }, { id: 'models/gemma-3n-e2b-it', @@ -547,7 +547,7 @@ const _knownGeminiModels: ({ isPreview: true, interfaces: [LLM_IF_OAI_Chat, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0], chatPrice: geminiExpFree, // Pricing page indicates free tier only - benchmark: { cbaElo: 1341 }, + benchmark: { cbaElo: 1365 }, // gemma-3-27b-it // hidden: true, // Keep visible if it's a distinct offering }, { @@ -556,7 +556,7 @@ const _knownGeminiModels: ({ isPreview: true, interfaces: [LLM_IF_OAI_Chat, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0], chatPrice: geminiExpFree, - benchmark: { cbaElo: 1321 }, + benchmark: { cbaElo: 1342 }, // gemma-3-12b-it }, { hidden: true, // keep larger model @@ -564,7 +564,7 @@ const _knownGeminiModels: ({ isPreview: true, interfaces: [LLM_IF_OAI_Chat, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0], chatPrice: geminiExpFree, - benchmark: { cbaElo: 1275 }, + benchmark: { cbaElo: 1303 }, // gemma-3-4b-it }, { hidden: true, // keep larger model diff --git a/src/modules/llms/server/openai/models/deepseek.models.ts b/src/modules/llms/server/openai/models/deepseek.models.ts index 66cf8bb38..b606d4ede 100644 --- a/src/modules/llms/server/openai/models/deepseek.models.ts +++ b/src/modules/llms/server/openai/models/deepseek.models.ts @@ -19,7 +19,7 @@ const _knownDeepseekChatModels: ManualMappings = [ interfaces: [...IF_3, LLM_IF_OAI_Reasoning], maxCompletionTokens: 32768, // default, max: 65536 chatPrice: { input: 0.28, output: 0.42, cache: { cType: 'oai-ac', read: 0.028 } }, - benchmark: { cbaElo: 1418 }, // deepseek-r1-0528 + benchmark: { cbaElo: 1412 }, //deepseek-v3.2-exp-thinking }, { idPrefix: 'deepseek-chat', @@ -29,7 +29,7 @@ const _knownDeepseekChatModels: ManualMappings = [ interfaces: IF_3, maxCompletionTokens: 8192, // default is 4096, max is 8192 chatPrice: { input: 0.28, output: 0.42, cache: { cType: 'oai-ac', read: 0.028 } }, - benchmark: { cbaElo: 1419 }, // deepseek-v3.1-thinking + benchmark: { cbaElo: 1420 }, // deepseek-v3.2 }, ]; diff --git a/src/modules/llms/server/openai/models/mistral.models.ts b/src/modules/llms/server/openai/models/mistral.models.ts index 370e31937..69173c563 100644 --- a/src/modules/llms/server/openai/models/mistral.models.ts +++ b/src/modules/llms/server/openai/models/mistral.models.ts @@ -25,16 +25,16 @@ const _knownMistralModelDetails: Record = { // Premier models - Mistral 3 (Dec 2025) - 'mistral-large-2512': { chatPrice: { input: 0.5, output: 1.5 } }, // Mistral Large 3 - MoE 41B active / 675B total + 'mistral-large-2512': { chatPrice: { input: 0.5, output: 1.5 }, benchmark: { cbaElo: 1414 } }, // Mistral Large 3 - MoE 41B active / 675B total 'mistral-large-2411': { chatPrice: { input: 2, output: 6 }, benchmark: { cbaElo: 1305 }, hidden: true }, // older version 'mistral-large-latest': { chatPrice: { input: 0.5, output: 1.5 }, hidden: true }, // → 2512 - 'mistral-medium-2508': { chatPrice: { input: 0.4, output: 2 } }, // Mistral Medium 3 - 'mistral-medium-2505': { chatPrice: { input: 0.4, output: 2 }, benchmark: { cbaElo: 1383 }, hidden: true }, // older version + 'mistral-medium-2508': { chatPrice: { input: 0.4, output: 2 }, benchmark: { cbaElo: 1412 } }, // Mistral Medium 3 + 'mistral-medium-2505': { chatPrice: { input: 0.4, output: 2 }, benchmark: { cbaElo: 1384 }, hidden: true }, // older version 'mistral-medium-latest': { chatPrice: { input: 0.4, output: 2 }, hidden: true }, // → 2508 'mistral-medium': { chatPrice: { input: 0.4, output: 2 }, hidden: true }, // symlink - 'magistral-medium-2509': { chatPrice: { input: 2, output: 5 } }, // reasoning + 'magistral-medium-2509': { chatPrice: { input: 2, output: 5 }, benchmark: { cbaElo: 1305 } }, // reasoning 'magistral-medium-latest': { chatPrice: { input: 2, output: 5 }, hidden: true }, // symlink 'devstral-2512': { label: 'Devstral 2 (2512)', chatPrice: { input: 0.4, output: 2 } }, // Devstral 2 - 123B coding agents (API returns "Mistral Vibe Cli") @@ -61,7 +61,7 @@ const _knownMistralModelDetails: Record wrong, but still we need a fallback - benchmark: { cbaElo: 1288 }, + // no benchmark: keep this out }, ] as const;