LLMs: sync scores

2026-05-10 21:50:14 -07:00 · 2026-04-20 23:13:56 -07:00
parent c8e7315de3
commit 205fb1bb5b
7 changed files with 71 additions and 66 deletions
@@ -77,7 +77,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
      { paramId: 'llmVndAntEffort', enumValues: ['low', 'medium', 'high', 'xhigh', 'max'] },
      ...ANT_TOOLS_DYNAMIC,
    ],
-    // benchmark: { cbaElo: ... }, // TBD
+    benchmark: { cbaElo: 1504 }, // claude-opus-4-7-thinking
  },

  // Claude 4.6 models with thinking variants
@@ -92,7 +92,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
      { paramId: 'llmVndAntInfSpeed' },
      ...ANT_TOOLS_DYNAMIC,
    ],
-    // benchmark: { cbaElo: ... }, // TBD
+    benchmark: { cbaElo: 1502 }, // claude-opus-4-6-thinking
  },

  'claude-sonnet-4-6': {
@@ -105,7 +105,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
      { paramId: 'llmVndAntEffort', enumValues: ['low', 'medium', 'high', 'max'] },
      ...ANT_TOOLS_DYNAMIC,
    ],
-    // benchmark: { cbaElo: ... }, // TBD
+    benchmark: { cbaElo: 1463 + 1 }, // 1 (thinking) + claude-sonnet-4-6
  },

  // Claude 4.5 models with thinking variants
@@ -119,7 +119,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
      { paramId: 'llmVndAntEffort', enumValues: ['low', 'medium', 'high'] },
      ...ANT_TOOLS,
    ],
-    benchmark: { cbaElo: 1468 }, // claude-opus-4-5-20251101-thinking-32k
+    benchmark: { cbaElo: 1473 }, // claude-opus-4-5-20251101-thinking-32k
    maxCompletionTokens: 32000,
  },

@@ -134,7 +134,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
      { paramId: 'llmVndAnt1MContext' },
      ...ANT_TOOLS,
    ],
-    benchmark: { cbaElo: 1450 }, // claude-sonnet-4-5-20250929-thinking-32k
+    benchmark: { cbaElo: 1452 }, // claude-sonnet-4-5-20250929-thinking-32k
  },

  'claude-haiku-4-5-20251001': {
@@ -147,6 +147,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
      { paramId: 'llmVndAntThinkingBudget' },
      ...ANT_TOOLS,
    ],
+    benchmark: { cbaElo: 1408 + 1 }, // 1 (thinking) + claude-haiku-4-5-20251001
  },

  // Claude 4.1 models with thinking variants
@@ -160,7 +161,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
      { paramId: 'llmVndAntThinkingBudget' },
      ...ANT_TOOLS,
    ],
-    benchmark: { cbaElo: 1448 }, // claude-opus-4-1-20250805-thinking-16k
+    benchmark: { cbaElo: 1449 }, // claude-opus-4-1-20250805-thinking-16k
  },

  // Claude 4 models with thinking variants
@@ -189,7 +190,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
      { paramId: 'llmVndAnt1MContext' },
      ...ANT_TOOLS,
    ],
-    benchmark: { cbaElo: 1400 }, // claude-sonnet-4-20250514-thinking-32k
+    benchmark: { cbaElo: 1399 }, // claude-sonnet-4-20250514-thinking-32k
  },

  // Changes to the thinking variant (same model ID) for the Claude Sonnet 3.7 model
@@ -203,7 +204,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
      { paramId: 'llmVndAntThinkingBudget' },
      ...ANT_TOOLS,
    ],
-    benchmark: { cbaElo: 1389 }, // claude-3-7-sonnet-20250219-thinking-32k
+    benchmark: { cbaElo: 1387 }, // claude-3-7-sonnet-20250219-thinking-32k
  },

 } as const;
@@ -231,7 +232,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    // Breaking changes vs 4.6: extended thinking budgets removed (adaptive-only), temperature/top_p/top_k rejected,
    // thinking content omitted by default, new tokenizer (~1x to 1.35x tokens for same text), no prefill.
    chatPrice: { input: 5, output: 25, cache: { cType: 'ant-bp', read: 0.50, write: 6.25, duration: 300 } },
-    // benchmark: { cbaElo: ... }, // TBD
+    benchmark: { cbaElo: 1497 }, // claude-opus-4-7
  },

  // Claude 4.6 models
@@ -249,7 +250,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    ],
    // Opus 4.6: flat $5/$25 pricing (1M context GA at standard pricing since 2026-03-13, no opt-in required)
    chatPrice: { input: 5, output: 25, cache: { cType: 'ant-bp', read: 0.50, write: 6.25, duration: 300 } },
-    // benchmark: { cbaElo: ... }, // TBD
+    benchmark: { cbaElo: 1496 }, // claude-opus-4-6
  },
  {
    id: 'claude-sonnet-4-6', // Active
@@ -264,7 +265,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    ],
    // Sonnet 4.6: flat $3/$15 pricing (1M context GA at standard pricing since 2026-03-13, no opt-in required)
    chatPrice: { input: 3, output: 15, cache: { cType: 'ant-bp', read: 0.30, write: 3.75, duration: 300 } },
-    // benchmark: { cbaElo: ... }, // TBD
+    benchmark: { cbaElo: 1463 }, // claude-sonnet-4-6
  },

  // Claude 4.5 models
@@ -280,7 +281,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
      ...ANT_TOOLS,
    ],
    chatPrice: { input: 5, output: 25, cache: { cType: 'ant-bp', read: 0.50, write: 6.25, duration: 300 } },
-    benchmark: { cbaElo: 1466 }, // claude-opus-4-5-20251101
+    benchmark: { cbaElo: 1469 }, // claude-opus-4-5-20251101
  },
  {
    id: 'claude-sonnet-4-5-20250929', // Active
@@ -305,7 +306,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
        duration: 300,
      },
    },
-    benchmark: { cbaElo: 1450 }, // claude-sonnet-4-5-20250929
+    benchmark: { cbaElo: 1452 }, // claude-sonnet-4-5-20250929
  },
  {
    id: 'claude-haiku-4-5-20251001', // Active
@@ -316,7 +317,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    interfaces: IF_4,
    parameterSpecs: ANT_TOOLS,
    chatPrice: { input: 1, output: 5, cache: { cType: 'ant-bp', read: 0.10, write: 1.25, duration: 300 } },
-    benchmark: { cbaElo: 1403 }, // claude-haiku-4-5-20251001
+    benchmark: { cbaElo: 1408 }, // claude-haiku-4-5-20251001
  },

  // Claude 4.1 models
@@ -329,7 +330,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    interfaces: IF_4,
    parameterSpecs: ANT_TOOLS,
    chatPrice: { input: 15, output: 75, cache: { cType: 'ant-bp', read: 1.50, write: 18.75, duration: 300 } },
-    benchmark: { cbaElo: 1445 }, // claude-opus-4-1-20250805
+    benchmark: { cbaElo: 1447 }, // claude-opus-4-1-20250805
  },

  // Claude 4 models
@@ -343,7 +344,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    interfaces: IF_4,
    parameterSpecs: ANT_TOOLS,
    chatPrice: { input: 15, output: 75, cache: { cType: 'ant-bp', read: 1.50, write: 18.75, duration: 300 } },
-    benchmark: { cbaElo: 1414 }, // claude-opus-4-20250514
+    benchmark: { cbaElo: 1412 }, // claude-opus-4-20250514
    isLegacy: true,
  },
  {
@@ -370,7 +371,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
        duration: 300,
      },
    },
-    benchmark: { cbaElo: 1390 }, // claude-sonnet-4-20250514
+    benchmark: { cbaElo: 1389 }, // claude-sonnet-4-20250514
    isLegacy: true,
  },

@@ -384,7 +385,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    interfaces: IF_4,
    parameterSpecs: ANT_TOOLS,
    chatPrice: { input: 3, output: 15, cache: { cType: 'ant-bp', read: 0.30, write: 3.75, duration: 300 } },
-    benchmark: { cbaElo: 1372 }, // claude-3-7-sonnet-20250219
+    benchmark: { cbaElo: 1371 }, // claude-3-7-sonnet-20250219
    hidden: true, // retired
    isLegacy: true,
  },
@@ -401,7 +402,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    interfaces: IF_4,
    parameterSpecs: ANT_TOOLS,
    chatPrice: { input: 0.80, output: 4.00, cache: { cType: 'ant-bp', read: 0.08, write: 1.00, duration: 300 } },
-    benchmark: { cbaElo: 1324 }, // claude-3-5-haiku-20241022
+    benchmark: { cbaElo: 1323 }, // claude-3-5-haiku-20241022
    hidden: true, // retired
    isLegacy: true,
  },
@@ -417,7 +418,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    maxCompletionTokens: 4096,
    interfaces: IF_4,
    chatPrice: { input: 0.25, output: 1.25, cache: { cType: 'ant-bp', read: 0.03, write: 0.30, duration: 300 } },
-    benchmark: { cbaElo: 1262 }, // claude-3-haiku-20240307
+    benchmark: { cbaElo: 1260 }, // claude-3-haiku-20240307
    isLegacy: true,
  },

@@ -208,7 +208,7 @@ const _knownGeminiModels: ({
      { paramId: 'llmVndGeminiGoogleSearch' },
      // { paramId: 'llmVndGeminiComputerUse' }, // we don't have the logic to handle this yet
    ],
-    benchmark: undefined, // too new for CBA ELO (released Feb 19, 2026)
+    benchmark: { cbaElo: 1493 }, // gemini-3.1-pro-preview
  },
  // 3.1 Pro (Preview) - Custom Tools variant - Released February 19, 2026
  // Better at prioritizing custom tools for users building with a mix of bash and tools
@@ -225,7 +225,7 @@ const _knownGeminiModels: ({
      { paramId: 'llmVndGeminiCodeExecution' },
      { paramId: 'llmVndGeminiGoogleSearch' },
    ],
-    benchmark: undefined,
+    benchmark: { cbaElo: 1493 - 1 }, // -1 (deprio this variant) + gemini-3.1-pro-preview
  },

  // 3.1 Flash Image Preview - Released February 26, 2026
@@ -259,7 +259,7 @@ const _knownGeminiModels: ({
      { paramId: 'llmVndGeminiCodeExecution' },
      { paramId: 'llmVndGeminiGoogleSearch' },
    ],
-    benchmark: undefined, // too new (released March 3, 2026)
+    benchmark: { cbaElo: 1438 }, // gemini-3.1-flash-lite-preview
  },


@@ -280,7 +280,7 @@ const _knownGeminiModels: ({
      { paramId: 'llmVndGeminiGoogleSearch' },
      // { paramId: 'llmVndGeminiComputerUse' }, // we don't have the logic to handle this yet
    ],
-    benchmark: { cbaElo: 1487 }, // gemini-3-pro
+    benchmark: { cbaElo: 1486 }, // gemini-3-pro
  },

  // 3.0 Pro Image Preview - Released November 20, 2025
@@ -331,7 +331,7 @@ const _knownGeminiModels: ({
      { paramId: 'llmVndGeminiGoogleSearch' },
      // { paramId: 'llmVndGeminiComputerUse' }, // we don't have the logic to handle this yet
    ],
-    benchmark: { cbaElo: 1471 }, // gemini-3-flash
+    benchmark: { cbaElo: 1474 }, // gemini-3-flash
  },

  /// Generation 2.5
@@ -350,7 +350,7 @@ const _knownGeminiModels: ({
      },
      { paramId: 'llmVndGeminiGoogleSearch' },
    ],
-    benchmark: { cbaElo: 1450 }, // gemini-2.5-pro
+    benchmark: { cbaElo: 1448 }, // gemini-2.5-pro
  },

  // REMOVED MODELS (no longer returned by API as of Jan 8, 2026):
@@ -403,7 +403,7 @@ const _knownGeminiModels: ({
      { paramId: 'llmVndGeminiThinkingBudget' },
      { paramId: 'llmVndGeminiGoogleSearch' },
    ],
-    benchmark: { cbaElo: 1409 }, // gemini-2.5-flash
+    benchmark: { cbaElo: 1411 }, // gemini-2.5-flash
  },

  // REMOVED MODELS (no longer returned by API as of Nov 20, 2025):
@@ -559,7 +559,7 @@ const _knownGeminiModels: ({
    deprecated: '2026-06-01',
    chatPrice: gemini20FlashPricing,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Vision, LLM_IF_OAI_Fn, LLM_IF_GEM_CodeExecution],
-    benchmark: { cbaElo: 1361 }, // gemini-2.0-flash-001
+    benchmark: { cbaElo: 1360 }, // gemini-2.0-flash-001
  },
  {
    id: 'models/gemini-2.0-flash',
@@ -568,7 +568,7 @@ const _knownGeminiModels: ({
    // copied from symlink
    chatPrice: gemini20FlashPricing,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Vision, LLM_IF_OAI_Fn, LLM_IF_GEM_CodeExecution],
-    benchmark: { cbaElo: 1361 }, // gemini-2.0-flash
+    benchmark: { cbaElo: 1360 }, // gemini-2.0-flash
  },

  // 2.0 Flash Lite - DEPRECATED: shutdown June 1, 2026 (announced Feb 18, 2026)
@@ -625,6 +625,7 @@ const _knownGeminiModels: ({
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0],
    parameterSpecs: [{ paramId: 'llmVndGemEffort', enumValues: ['minimal', 'high'] }],
    chatPrice: geminiExpFree, // Free tier only according to pricing page
+    benchmark: { cbaElo: 1451 }, // gemma-4-31b
  },
  {
    hidden: true, // smaller MoE variant
@@ -633,6 +634,7 @@ const _knownGeminiModels: ({
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0],
    parameterSpecs: [{ paramId: 'llmVndGemEffort', enumValues: ['minimal', 'high'] }],
    chatPrice: geminiExpFree, // Free tier only according to pricing page
+    benchmark: { cbaElo: 1439 }, // gemma-4-26b-a4b
  },

  // Gemma 3n Model (newer than 3, first seen on the May 2025 update)
@@ -641,7 +643,7 @@ const _knownGeminiModels: ({
    isPreview: true,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0],
    chatPrice: geminiExpFree, // Free tier only according to pricing page
-    benchmark: { cbaElo: 1319 }, // gemma-3n-e4b-it
+    benchmark: { cbaElo: 1318 }, // gemma-3n-e4b-it
  },
  {
    id: 'models/gemma-3n-e2b-it',
@@ -659,7 +661,7 @@ const _knownGeminiModels: ({
    isPreview: true,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0],
    chatPrice: geminiExpFree, // Pricing page indicates free tier only
-    benchmark: { cbaElo: 1365 }, // gemma-3-27b-it
+    benchmark: { cbaElo: 1366 }, // gemma-3-27b-it
    // hidden: true, // Keep visible if it's a distinct offering
  },
  {
@@ -22,7 +22,7 @@ const _knownDeepseekChatModels: ManualMappings = [
    // ],
    maxCompletionTokens: 32768, // default, max: 65536
    chatPrice: { input: 0.28, output: 0.42, cache: { cType: 'oai-ac', read: 0.028 } },
-    benchmark: { cbaElo: 1412 }, //deepseek-v3.2-exp-thinking
+    benchmark: { cbaElo: 1425 }, // deepseek-v3.2-exp-thinking
  },
  {
    idPrefix: 'deepseek-chat',
@@ -32,7 +32,7 @@ const _knownDeepseekChatModels: ManualMappings = [
    interfaces: IF_3,
    maxCompletionTokens: 8192, // default is 4096, max is 8192
    chatPrice: { input: 0.28, output: 0.42, cache: { cType: 'oai-ac', read: 0.028 } },
-    benchmark: { cbaElo: 1420 }, // deepseek-v3.2
+    benchmark: { cbaElo: 1424 }, // deepseek-v3.2
  },
 ];

@@ -25,16 +25,16 @@ const _knownMistralModelDetails: Record<string, {
 }> = {

  // Premier models - Mistral 3 (Dec 2025)
-  'mistral-large-2512': { chatPrice: { input: 0.5, output: 1.5 }, benchmark: { cbaElo: 1414 } }, // Mistral Large 3 - MoE 41B active / 675B total
+  'mistral-large-2512': { chatPrice: { input: 0.5, output: 1.5 }, benchmark: { cbaElo: 1415 } }, // Mistral Large 3 - MoE 41B active / 675B total
  'mistral-large-2411': { chatPrice: { input: 2, output: 6 }, benchmark: { cbaElo: 1305 }, hidden: true }, // older version
  'mistral-large-latest': { chatPrice: { input: 0.5, output: 1.5 }, hidden: true }, // → 2512

-  'mistral-medium-2508': { chatPrice: { input: 0.4, output: 2 }, benchmark: { cbaElo: 1412 } }, // Mistral Medium 3
-  'mistral-medium-2505': { chatPrice: { input: 0.4, output: 2 }, benchmark: { cbaElo: 1384 }, hidden: true }, // older version
+  'mistral-medium-2508': { chatPrice: { input: 0.4, output: 2 }, benchmark: { cbaElo: 1410 } }, // Mistral Medium 3
+  'mistral-medium-2505': { chatPrice: { input: 0.4, output: 2 }, benchmark: { cbaElo: 1387 }, hidden: true }, // older version
  'mistral-medium-latest': { chatPrice: { input: 0.4, output: 2 }, hidden: true }, // → 2508
  'mistral-medium': { chatPrice: { input: 0.4, output: 2 }, hidden: true }, // symlink

-  'magistral-medium-2509': { chatPrice: { input: 2, output: 5 }, benchmark: { cbaElo: 1305 } }, // reasoning
+  'magistral-medium-2509': { chatPrice: { input: 2, output: 5 }, benchmark: { cbaElo: 1304 } }, // reasoning (leaderboard: magistral-medium-2506 = 1304)
  'magistral-medium-latest': { chatPrice: { input: 2, output: 5 }, hidden: true }, // symlink

  'devstral-2512': { label: 'Devstral 2 (2512)', chatPrice: { input: 0.4, output: 2 } }, // Devstral 2 - 123B coding agents (API returns "Mistral Vibe Cli")
@@ -70,7 +70,7 @@ const _knownMistralModelDetails: Record<string, {

  // Open models
  'mistral-small-2603': { chatPrice: { input: 0.15, output: 0.6 } }, // Mistral Small 4 - 119B hybrid (instruct+reasoning+coding), 256k ctx
-  'mistral-small-2506': { chatPrice: { input: 0.1, output: 0.3 }, benchmark: { cbaElo: 1356 }, hidden: true }, // Mistral Small 3.2
+  'mistral-small-2506': { chatPrice: { input: 0.1, output: 0.3 }, benchmark: { cbaElo: 1357 }, hidden: true }, // Mistral Small 3.2
  'mistral-small-latest': { chatPrice: { input: 0.15, output: 0.6 }, hidden: true }, // → 2603

  'labs-mistral-small-creative': { label: 'Mistral Small Creative', chatPrice: { input: 0.1, output: 0.3 } }, // creative writing, roleplay (Labs)
@@ -42,7 +42,7 @@ const _knownMoonshotModels: ManualMappings = [
    interfaces: IF_K2_5,
    parameterSpecs: _PS_Reasoning,
    chatPrice: { input: 0.60, output: 3.00, cache: { cType: 'oai-ac', read: 0.10 } },
-    benchmark: { cbaElo: 1450 }, // kimi-k2.5-thinking
+    benchmark: { cbaElo: 1451 }, // kimi-k2.5-thinking
  },

  // Kimi K2 Series - Latest Models
@@ -57,7 +57,7 @@ const _knownMoonshotModels: ManualMappings = [
    interfaces: IF_K2_REASON,
    // parameterSpecs: [{ paramId: 'llmVndMoonshotWebSearch' }], // NOT WORKING YET
    chatPrice: { input: 1.15, output: 8.00, cache: { cType: 'oai-ac', read: 0.15 } },
-    benchmark: { cbaElo: 1429 }, // kimi-k2-thinking-turbo
+    benchmark: { cbaElo: 1430 }, // kimi-k2-thinking-turbo
  },
  // Thinking
  {
@@ -107,7 +107,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
      { paramId: 'llmForceNoStream' },
    ],
    chatPrice: { input: 2.5, cache: { cType: 'oai-ac', read: 0.25 }, output: 15 },
-    benchmark: { cbaElo: 1481 }, // gpt-5.4-high (preliminary)
+    benchmark: { cbaElo: 1482 }, // gpt-5.4-high
  },
  {
    idPrefix: 'gpt-5.4',
@@ -156,7 +156,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
      { paramId: 'llmForceNoStream' },
    ],
    chatPrice: { input: 0.75, cache: { cType: 'oai-ac', read: 0.075 }, output: 4.5 },
-    // benchmark: TBD
+    benchmark: { cbaElo: 1458 }, // gpt-5.4-mini-high
  },
  {
    idPrefix: 'gpt-5.4-mini',
@@ -181,7 +181,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
      { paramId: 'llmForceNoStream' },
    ],
    chatPrice: { input: 0.2, cache: { cType: 'oai-ac', read: 0.02 }, output: 1.25 },
-    // benchmark: TBD
+    benchmark: { cbaElo: 1404 }, // gpt-5.4-nano-high
  },
  {
    idPrefix: 'gpt-5.4-nano',
@@ -242,7 +242,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
      { paramId: 'llmVndOaiCodeInterpreter' },
    ],
    chatPrice: { input: 1.75, cache: { cType: 'oai-ac', read: 0.175 }, output: 14 },
-    // benchmark: TBD
+    benchmark: { cbaElo: 1451 }, // gpt-5.3-chat-latest
  },


@@ -265,7 +265,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
      { paramId: 'llmForceNoStream' },
    ],
    chatPrice: { input: 1.75, cache: { cType: 'oai-ac', read: 0.175 }, output: 14 },
-    benchmark: { cbaElo: 1440 }, // gpt-5.2-high
+    benchmark: { cbaElo: 1441 }, // gpt-5.2-high
  },
  {
    idPrefix: 'gpt-5.2',
@@ -306,7 +306,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
      { paramId: 'llmVndOaiCodeInterpreter' },
    ],
    chatPrice: { input: 1.75, cache: { cType: 'oai-ac', read: 0.175 }, output: 14 },
-    // benchmark: TBD
+    benchmark: { cbaElo: 1477 }, // gpt-5.2-chat-latest-20260210
  },

  // GPT-5.2 Pro
@@ -352,7 +352,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
      { paramId: 'llmForceNoStream' },
    ],
    chatPrice: { input: 1.25, cache: { cType: 'oai-ac', read: 0.125 }, output: 10 },
-    benchmark: { cbaElo: 1459 }, // gpt-5.1-high
+    benchmark: { cbaElo: 1455 }, // gpt-5.1-high
  },
  {
    idPrefix: 'gpt-5.1',
@@ -450,7 +450,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
      { paramId: 'llmForceNoStream' }, // non-streaming option for unverified organizations
    ],
    chatPrice: { input: 1.25, cache: { cType: 'oai-ac', read: 0.125 }, output: 10 },
-    benchmark: { cbaElo: 1435 }, // gpt-5-high
+    benchmark: { cbaElo: 1433 }, // gpt-5-high
  },
  {
    idPrefix: 'gpt-5',
@@ -553,7 +553,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    interfaces: [LLM_IF_OAI_Responses, ...IFS_CHAT_CACHE_REASON, LLM_IF_HOTFIX_NoTemperature],
    parameterSpecs: [{ paramId: 'llmVndOaiEffort', enumValues: ['minimal', 'low', 'medium', 'high'] }, { paramId: 'llmVndOaiWebSearchContext' }, { paramId: 'llmVndOaiVerbosity' }, { paramId: 'llmVndOaiImageGeneration' }],
    chatPrice: { input: 0.05, cache: { cType: 'oai-ac', read: 0.005 }, output: 0.4 },
-    benchmark: { cbaElo: 1338 }, // gpt-5-nano-high
+    benchmark: { cbaElo: 1337 }, // gpt-5-nano-high
  },
  {
    idPrefix: 'gpt-5-nano',
@@ -575,7 +575,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
      { paramId: 'llmForceNoStream' },
    ],
    // chatPrice: TBD - unknown pricing
-    benchmark: { cbaElo: 1354 }, // gpt-oss-120b
+    benchmark: { cbaElo: 1353 }, // gpt-oss-120b
  },


@@ -630,7 +630,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    interfaces: IFS_CHAT_CACHE_REASON,
    parameterSpecs: [{ paramId: 'llmVndOaiEffort', enumValues: ['low', 'medium', 'high', 'xhigh'] }],
    chatPrice: { input: 1.1, cache: { cType: 'oai-ac', read: 0.275 }, output: 4.4 },
-    benchmark: { cbaElo: 1391 }, // o4-mini-2025-04-16
+    benchmark: { cbaElo: 1390 }, // o4-mini-2025-04-16
  },
  {
    idPrefix: 'o4-mini',
@@ -683,7 +683,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    interfaces: IFS_CHAT_CACHE_REASON,
    parameterSpecs: [{ paramId: 'llmVndOaiEffort', enumValues: ['low', 'medium', 'high', 'xhigh'] }, { paramId: 'llmForceNoStream' }],
    chatPrice: { input: 2, cache: { cType: 'oai-ac', read: 0.5 }, output: 8 },
-    benchmark: { cbaElo: 1433 }, // o3-2025-04-16
+    benchmark: { cbaElo: 1431 }, // o3-2025-04-16
  },
  {
    idPrefix: 'o3',
@@ -892,7 +892,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 4096,
    interfaces: IFS_CHAT_MIN,
    chatPrice: { input: 5, output: 15 },
-    benchmark: { cbaElo: 1346 }, // gpt-4o-2024-05-13
+    benchmark: { cbaElo: 1345 }, // gpt-4o-2024-05-13
  },
  {
    idPrefix: 'gpt-4o',
@@ -961,7 +961,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 16384,
    interfaces: IFS_CHAT_CACHE,
    chatPrice: { input: 0.15, cache: { cType: 'oai-ac', read: 0.075 }, output: 0.6 },
-    benchmark: { cbaElo: 1318 }, // gpt-4o-mini-2024-07-18
+    benchmark: { cbaElo: 1317 }, // gpt-4o-mini-2024-07-18
  },
  {
    idPrefix: 'gpt-4o-mini',
@@ -1015,7 +1015,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 4096,
    interfaces: IFS_CHAT_MIN,
    chatPrice: { input: 10, output: 30 },
-    benchmark: { cbaElo: 1325 }, // gpt-4-turbo-2024-04-09
+    benchmark: { cbaElo: 1324 }, // gpt-4-turbo-2024-04-09
  },
  {
    idPrefix: 'gpt-4-turbo',
@@ -1031,7 +1031,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 4096,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
    chatPrice: { input: 10, output: 30 },
-    benchmark: { cbaElo: 1314 }, // gpt-4-0125-preview
+    benchmark: { cbaElo: 1313 }, // gpt-4-0125-preview
  },
  {
    idPrefix: 'gpt-4-1106-preview', // GPT-4 Turbo preview model
@@ -1042,7 +1042,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 4096,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
    chatPrice: { input: 10, output: 30 },
-    benchmark: { cbaElo: 1314 }, // gpt-4-1106-preview
+    benchmark: { cbaElo: 1312 }, // gpt-4-1106-preview
  },
  {
    idPrefix: 'gpt-4-turbo-preview',
@@ -1060,7 +1060,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    contextWindow: 8192,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
    chatPrice: { input: 30, output: 60 },
-    benchmark: { cbaElo: 1276 }, // gpt-4-0613
+    benchmark: { cbaElo: 1274 }, // gpt-4-0613
    isLegacy: true,
  },
  {
@@ -1071,7 +1071,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    contextWindow: 8192,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
    chatPrice: { input: 30, output: 60 },
-    benchmark: { cbaElo: 1288 }, // gpt-4-0314
+    benchmark: { cbaElo: 1286 }, // gpt-4-0314
    isLegacy: true,
  },
  {
@@ -1094,7 +1094,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 4096,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
    chatPrice: { input: 0.5, output: 1.5 },
-    benchmark: { cbaElo: 1225 }, // gpt-3.5-turbo-0125
+    benchmark: { cbaElo: 1223 }, // gpt-3.5-turbo-0125
  },
  {
    idPrefix: 'gpt-3.5-turbo-1106',
@@ -1105,7 +1105,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 4096,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
    chatPrice: { input: 1, output: 2 },
-    benchmark: { cbaElo: 1204 }, // gpt-3.5-turbo-1106
+    benchmark: { cbaElo: 1202 }, // gpt-3.5-turbo-1106
  },
  {
    idPrefix: 'gpt-3.5-turbo',
@@ -92,7 +92,7 @@ const _knownXAIChatModels: ManualMappings = [
    interfaces: [...XAI_IF_Vision, LLM_IF_OAI_Reasoning],
    parameterSpecs: XAI_PAR_Reasoning,
    chatPrice: PRICE_420,
-    benchmark: { cbaElo: 1481 }, // grok-4.20-beta-0309-reasoning (CBA name)
+    benchmark: { cbaElo: 1480 }, // grok-4.20-beta-0309-reasoning (CBA name)
  },
  {
    idPrefix: 'grok-4.20-0309-non-reasoning',
@@ -103,7 +103,7 @@ const _knownXAIChatModels: ManualMappings = [
    interfaces: XAI_IF_Vision,
    parameterSpecs: XAI_PAR,
    chatPrice: PRICE_420,
-    benchmark: { cbaElo: 1492 }, // grok-4.20-beta1 (CBA name, preliminary)
+    benchmark: { cbaElo: 1482 }, // grok-4.20-beta1 (CBA name)
  },
  {
    idPrefix: 'grok-4.20-multi-agent-0309',
@@ -118,6 +118,7 @@ const _knownXAIChatModels: ManualMappings = [
      ...XAI_PAR_Reasoning,
    ],
    chatPrice: PRICE_420,
+    benchmark: { cbaElo: 1474 }, // grok-4.20-multi-agent-beta-0309
  },

  // Grok 4.1
@@ -130,7 +131,7 @@ const _knownXAIChatModels: ManualMappings = [
    interfaces: [...XAI_IF_Vision, LLM_IF_OAI_Reasoning],
    parameterSpecs: XAI_PAR_Reasoning,
    chatPrice: PRICE_41,
-    benchmark: { cbaElo: 1430 }, // grok-4-1-fast-reasoning
+    benchmark: { cbaElo: 1432 }, // grok-4-1-fast-reasoning
  },
  {
    idPrefix: 'grok-4-1-fast-non-reasoning',
@@ -141,7 +142,7 @@ const _knownXAIChatModels: ManualMappings = [
    interfaces: XAI_IF_Vision,
    parameterSpecs: XAI_PAR,
    chatPrice: PRICE_41,
-    benchmark: { cbaElo: 1466 }, // grok-4.1
+    benchmark: { cbaElo: 1461 }, // grok-4.1
  },

  // Grok 4
@@ -167,6 +168,7 @@ const _knownXAIChatModels: ManualMappings = [
    interfaces: XAI_IF_Vision,
    parameterSpecs: XAI_PAR,
    chatPrice: PRICE_40,
+    benchmark: { cbaElo: 1421 }, // grok-4-fast-chat
  },
  {
    hidden: true, // yield to 4.20
@@ -191,7 +193,7 @@ const _knownXAIChatModels: ManualMappings = [
    interfaces: XAI_IF_Pre4,
    parameterSpecs: XAI_PAR_Pre4,
    chatPrice: { input: 3, output: 15, cache: { cType: 'oai-ac', read: 0.75 } },
-    benchmark: { cbaElo: 1411 }, // grok-3-preview-02-24
+    benchmark: { cbaElo: 1412 }, // grok-3-preview-02-24
  },
  {
    idPrefix: 'grok-3-mini',