LLMs: sync CB

2026-05-10 21:50:14 -07:00 · 2026-01-29 22:13:04 -08:00
parent f617b06109
commit b11cac4328
7 changed files with 70 additions and 68 deletions
@@ -37,6 +37,7 @@ const _hardcodedAnthropicVariants: ModelVariantMap = {
    description: 'Claude Opus 4.5 with extended thinking mode for complex reasoning and agentic workflows',
    interfaces: [...IF_4_R, LLM_IF_ANT_ToolsSearch],
    parameterSpecs: [...ANT_PAR_WEB_THINKING, { paramId: 'llmVndAntEffort' }, { paramId: 'llmVndAntSkills' }],
+    benchmark: { cbaElo: 1468 }, // claude-opus-4-5-20251101-thinking-32k
    maxCompletionTokens: 32000,
  },

@@ -47,7 +48,7 @@ const _hardcodedAnthropicVariants: ModelVariantMap = {
    maxCompletionTokens: 64000,
    interfaces: [...IF_4_R, LLM_IF_ANT_ToolsSearch],
    parameterSpecs: [...ANT_PAR_WEB_THINKING, { paramId: 'llmVndAnt1MContext' }, { paramId: 'llmVndAntSkills' }],
-    benchmark: { cbaElo: 1451 + 1 }, // FALLBACK-UNTIL-AVAILABLE: claude-opus-4-1-20250805-thinking-16k + 1
+    benchmark: { cbaElo: 1450 }, // claude-sonnet-4-5-20250929-thinking-32k
  },

  'claude-haiku-4-5-20251001': {
@@ -67,7 +68,7 @@ const _hardcodedAnthropicVariants: ModelVariantMap = {
    maxCompletionTokens: 32000,
    interfaces: IF_4_R,
    parameterSpecs: ANT_PAR_WEB_THINKING,
-    benchmark: { cbaElo: 1451 }, // claude-opus-4-1-20250805-thinking-16k
+    benchmark: { cbaElo: 1448 }, // claude-opus-4-1-20250805-thinking-16k
  },

  // Claude 4 models with thinking variants
@@ -79,7 +80,7 @@ const _hardcodedAnthropicVariants: ModelVariantMap = {
    maxCompletionTokens: 32000,
    interfaces: IF_4_R,
    parameterSpecs: ANT_PAR_WEB_THINKING,
-    benchmark: { cbaElo: 1420 }, // claude-opus-4-20250514-thinking-16k
+    benchmark: { cbaElo: 1424 }, // claude-opus-4-20250514-thinking-16k
  },

  'claude-sonnet-4-20250514': {
@@ -100,7 +101,7 @@ const _hardcodedAnthropicVariants: ModelVariantMap = {
    maxCompletionTokens: 64000,
    interfaces: IF_4_R,
    parameterSpecs: ANT_PAR_WEB_THINKING,
-    benchmark: { cbaElo: 1385 }, // claude-3-7-sonnet-20250219-thinking-32k
+    benchmark: { cbaElo: 1389 }, // claude-3-7-sonnet-20250219-thinking-32k
  },

 } as const;
@@ -122,6 +123,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    interfaces: [...IF_4, LLM_IF_ANT_ToolsSearch],
    parameterSpecs: [...ANT_PAR_WEB, { paramId: 'llmVndAntEffort' }],
    chatPrice: { input: 5, output: 25, cache: { cType: 'ant-bp', read: 0.50, write: 6.25, duration: 300 } },
+    benchmark: { cbaElo: 1466 }, // claude-opus-4-5-20251101
  },
  {
    id: 'claude-sonnet-4-5-20250929', // Active
@@ -143,7 +145,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
        duration: 300,
      },
    },
-    benchmark: { cbaElo: 1438 + 1 }, // FALLBACK-UNTIL-AVAILABLE: claude-opus-4-1-20250805 + 1
+    benchmark: { cbaElo: 1450 }, // claude-sonnet-4-5-20250929
  },
  {
    id: 'claude-haiku-4-5-20251001', // Active
@@ -154,6 +156,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    interfaces: IF_4,
    parameterSpecs: [...ANT_PAR_WEB, { paramId: 'llmVndAntSkills' }],
    chatPrice: { input: 1, output: 5, cache: { cType: 'ant-bp', read: 0.10, write: 1.25, duration: 300 } },
+    benchmark: { cbaElo: 1403 }, // claude-haiku-4-5-20251001
  },

  // Claude 4.1 models
@@ -166,7 +169,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    interfaces: IF_4,
    parameterSpecs: ANT_PAR_WEB,
    chatPrice: { input: 15, output: 75, cache: { cType: 'ant-bp', read: 1.50, write: 18.75, duration: 300 } },
-    benchmark: { cbaElo: 1438 }, // claude-opus-4-1-20250805
+    benchmark: { cbaElo: 1445 }, // claude-opus-4-1-20250805
  },

  // Claude 4 models
@@ -180,7 +183,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    interfaces: IF_4,
    parameterSpecs: ANT_PAR_WEB,
    chatPrice: { input: 15, output: 75, cache: { cType: 'ant-bp', read: 1.50, write: 18.75, duration: 300 } },
-    benchmark: { cbaElo: 1411 }, // claude-opus-4-20250514
+    benchmark: { cbaElo: 1414 }, // claude-opus-4-20250514
  },
  {
    id: 'claude-sonnet-4-20250514', // Active
@@ -202,7 +205,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
        duration: 300,
      },
    },
-    benchmark: { cbaElo: 1386 }, // claude-sonnet-4-20250514
+    benchmark: { cbaElo: 1390 }, // claude-sonnet-4-20250514
  },

  // Claude 3.7 models
@@ -215,7 +218,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    interfaces: IF_4,
    parameterSpecs: ANT_PAR_WEB,
    chatPrice: { input: 3, output: 15, cache: { cType: 'ant-bp', read: 0.30, write: 3.75, duration: 300 } },
-    benchmark: { cbaElo: 1369 }, // claude-3-7-sonnet-20250219
+    benchmark: { cbaElo: 1372 }, // claude-3-7-sonnet-20250219
    hidden: true, // deprecated
    isLegacy: true,
  },
@@ -232,7 +235,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    interfaces: IF_4,
    parameterSpecs: ANT_PAR_WEB,
    chatPrice: { input: 0.80, output: 4.00, cache: { cType: 'ant-bp', read: 0.08, write: 1.00, duration: 300 } },
-    benchmark: { cbaElo: 1319, cbaMmlu: 75.2 }, // claude-3-5-haiku-20241022
+    benchmark: { cbaElo: 1324 }, // claude-3-5-haiku-20241022
    hidden: true, // deprecated
    isLegacy: true,
  },
@@ -248,7 +251,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
    maxCompletionTokens: 4096,
    interfaces: IF_4,
    chatPrice: { input: 0.25, output: 1.25, cache: { cType: 'ant-bp', read: 0.03, write: 0.30, duration: 300 } },
-    benchmark: { cbaElo: 1263, cbaMmlu: 75.1 },
+    benchmark: { cbaElo: 1262 }, // claude-3-haiku-20240307
  },

  // Legacy/Retired models
@@ -172,7 +172,7 @@ const _knownGeminiModels: ({
      { paramId: 'llmVndGeminiGoogleSearch' },
      // { paramId: 'llmVndGeminiComputerUse' }, // we don't have the logic to handle this yet
    ],
-    benchmark: { cbaElo: 1490 }, // gemini-3-pro
+    benchmark: { cbaElo: 1487 }, // gemini-3-pro
  },

  // 3.0 Pro Image Preview - Released November 20, 2025
@@ -221,7 +221,7 @@ const _knownGeminiModels: ({
      { paramId: 'llmVndGeminiGoogleSearch' },
      // { paramId: 'llmVndGeminiComputerUse' }, // we don't have the logic to handle this yet
    ],
-    benchmark: { cbaElo: 1480 }, // gemini-3-flash
+    benchmark: { cbaElo: 1471 }, // gemini-3-flash
  },

  /// Generation 2.5
@@ -237,7 +237,7 @@ const _knownGeminiModels: ({
      { paramId: 'llmVndGeminiThinkingBudget', rangeOverride: [128, 32768] /* does not support 0 which would turn thinking off */ },
      { paramId: 'llmVndGeminiGoogleSearch' },
    ],
-    benchmark: { cbaElo: 1451 }, // gemini-2.5-pro
+    benchmark: { cbaElo: 1450 }, // gemini-2.5-pro
  },

  // REMOVED MODELS (no longer returned by API as of Jan 8, 2026):
@@ -289,7 +289,7 @@ const _knownGeminiModels: ({
      { paramId: 'llmVndGeminiThinkingBudget' },
      { paramId: 'llmVndGeminiGoogleSearch' },
    ],
-    benchmark: { cbaElo: 1406 + 2 }, // gemini-2.5-flash-preview-09-2025 - the +2 is to be on top of the non-preview 2.5-flash (1407)
+    benchmark: { cbaElo: 1405 }, // gemini-2.5-flash-preview-09-2025
  },
  // 2.5 Flash
  {
@@ -303,7 +303,7 @@ const _knownGeminiModels: ({
      { paramId: 'llmVndGeminiThinkingBudget' },
      { paramId: 'llmVndGeminiGoogleSearch' },
    ],
-    benchmark: { cbaElo: 1407 }, // gemini-2.5-flash (updated from CSV)
+    benchmark: { cbaElo: 1409 }, // gemini-2.5-flash
  },

  // REMOVED MODELS (no longer returned by API as of Nov 20, 2025):
@@ -398,7 +398,7 @@ const _knownGeminiModels: ({
      { paramId: 'llmVndGeminiThinkingBudget' },
      { paramId: 'llmVndGeminiGoogleSearch' },
    ],
-    benchmark: { cbaElo: 1380 }, // gemini-2.5-flash-lite-preview-09-2025 (no-thinking variant)
+    benchmark: { cbaElo: 1379 }, // gemini-2.5-flash-lite-preview-09-2025-no-thinking
  },
  // 2.5 Flash-Lite - Released July 2025
  {
@@ -463,7 +463,7 @@ const _knownGeminiModels: ({
    chatPrice: gemini20FlashPricing,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Vision, LLM_IF_OAI_Fn, LLM_IF_OAI_Json, LLM_IF_GEM_CodeExecution],
    parameterSpecs: [{ paramId: 'llmVndGeminiGoogleSearch' }],
-    benchmark: { cbaElo: 1360 }, // gemini-2.0-flash-001
+    benchmark: { cbaElo: 1361 }, // gemini-2.0-flash-001
  },
  {
    id: 'models/gemini-2.0-flash',
@@ -473,7 +473,7 @@ const _knownGeminiModels: ({
    chatPrice: gemini20FlashPricing,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Vision, LLM_IF_OAI_Fn, LLM_IF_OAI_Json, LLM_IF_GEM_CodeExecution],
    parameterSpecs: [{ paramId: 'llmVndGeminiGoogleSearch' }],
-    benchmark: { cbaElo: 1360 }, // gemini-2.0-flash
+    benchmark: { cbaElo: 1361 }, // gemini-2.0-flash
  },

  // 2.0 Flash Lite
@@ -529,7 +529,7 @@ const _knownGeminiModels: ({
    isPreview: true,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0],
    chatPrice: geminiExpFree, // Free tier only according to pricing page
-    benchmark: { cbaElo: 1311 }, // Estimating based on comparable models
+    benchmark: { cbaElo: 1319 }, // gemma-3n-e4b-it
  },
  {
    id: 'models/gemma-3n-e2b-it',
@@ -547,7 +547,7 @@ const _knownGeminiModels: ({
    isPreview: true,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0],
    chatPrice: geminiExpFree, // Pricing page indicates free tier only
-    benchmark: { cbaElo: 1341 },
+    benchmark: { cbaElo: 1365 }, // gemma-3-27b-it
    // hidden: true, // Keep visible if it's a distinct offering
  },
  {
@@ -556,7 +556,7 @@ const _knownGeminiModels: ({
    isPreview: true,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0],
    chatPrice: geminiExpFree,
-    benchmark: { cbaElo: 1321 },
+    benchmark: { cbaElo: 1342 }, // gemma-3-12b-it
  },
  {
    hidden: true, // keep larger model
@@ -564,7 +564,7 @@ const _knownGeminiModels: ({
    isPreview: true,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0],
    chatPrice: geminiExpFree,
-    benchmark: { cbaElo: 1275 },
+    benchmark: { cbaElo: 1303 }, // gemma-3-4b-it
  },
  {
    hidden: true, // keep larger model
@@ -19,7 +19,7 @@ const _knownDeepseekChatModels: ManualMappings = [
    interfaces: [...IF_3, LLM_IF_OAI_Reasoning],
    maxCompletionTokens: 32768, // default, max: 65536
    chatPrice: { input: 0.28, output: 0.42, cache: { cType: 'oai-ac', read: 0.028 } },
-    benchmark: { cbaElo: 1418 }, // deepseek-r1-0528
+    benchmark: { cbaElo: 1412 }, // deepseek-v3.2-exp-thinking
  },
  {
    idPrefix: 'deepseek-chat',
@@ -29,7 +29,7 @@ const _knownDeepseekChatModels: ManualMappings = [
    interfaces: IF_3,
    maxCompletionTokens: 8192, // default is 4096, max is 8192
    chatPrice: { input: 0.28, output: 0.42, cache: { cType: 'oai-ac', read: 0.028 } },
-    benchmark: { cbaElo: 1419 }, // deepseek-v3.1-thinking
+    benchmark: { cbaElo: 1420 }, // deepseek-v3.2
  },
 ];

@@ -25,16 +25,16 @@ const _knownMistralModelDetails: Record<string, {
 }> = {

  // Premier models - Mistral 3 (Dec 2025)
-  'mistral-large-2512': { chatPrice: { input: 0.5, output: 1.5 } }, // Mistral Large 3 - MoE 41B active / 675B total
+  'mistral-large-2512': { chatPrice: { input: 0.5, output: 1.5 }, benchmark: { cbaElo: 1414 } }, // Mistral Large 3 - MoE 41B active / 675B total
  'mistral-large-2411': { chatPrice: { input: 2, output: 6 }, benchmark: { cbaElo: 1305 }, hidden: true }, // older version
  'mistral-large-latest': { chatPrice: { input: 0.5, output: 1.5 }, hidden: true }, // → 2512

-  'mistral-medium-2508': { chatPrice: { input: 0.4, output: 2 } }, // Mistral Medium 3
-  'mistral-medium-2505': { chatPrice: { input: 0.4, output: 2 }, benchmark: { cbaElo: 1383 }, hidden: true }, // older version
+  'mistral-medium-2508': { chatPrice: { input: 0.4, output: 2 }, benchmark: { cbaElo: 1412 } }, // Mistral Medium 3
+  'mistral-medium-2505': { chatPrice: { input: 0.4, output: 2 }, benchmark: { cbaElo: 1384 }, hidden: true }, // older version
  'mistral-medium-latest': { chatPrice: { input: 0.4, output: 2 }, hidden: true }, // → 2508
  'mistral-medium': { chatPrice: { input: 0.4, output: 2 }, hidden: true }, // symlink

-  'magistral-medium-2509': { chatPrice: { input: 2, output: 5 } }, // reasoning
+  'magistral-medium-2509': { chatPrice: { input: 2, output: 5 }, benchmark: { cbaElo: 1305 } }, // reasoning
  'magistral-medium-latest': { chatPrice: { input: 2, output: 5 }, hidden: true }, // symlink

  'devstral-2512': { label: 'Devstral 2 (2512)', chatPrice: { input: 0.4, output: 2 } }, // Devstral 2 - 123B coding agents (API returns "Mistral Vibe Cli")
@@ -61,7 +61,7 @@ const _knownMistralModelDetails: Record<string, {
  'ministral-14b-latest': { chatPrice: { input: 0.2, output: 0.2 }, hidden: true }, // symlink

  'ministral-8b-2512': { chatPrice: { input: 0.15, output: 0.15 } }, // Ministral 3 8B
-  'ministral-8b-2410': { chatPrice: { input: 0.1, output: 0.1 }, benchmark: { cbaElo: 1240 }, hidden: true }, // older version
+  'ministral-8b-2410': { chatPrice: { input: 0.1, output: 0.1 }, benchmark: { cbaElo: 1237 }, hidden: true }, // older version
  'ministral-8b-latest': { chatPrice: { input: 0.15, output: 0.15 }, hidden: true }, // symlink

  'ministral-3b-2512': { chatPrice: { input: 0.1, output: 0.1 } }, // Ministral 3 3B
@@ -69,7 +69,7 @@ const _knownMistralModelDetails: Record<string, {
  'ministral-3b-latest': { chatPrice: { input: 0.1, output: 0.1 }, hidden: true }, // symlink

  // Open models
-  'mistral-small-2506': { chatPrice: { input: 0.1, output: 0.3 } }, // Mistral Small 3.2
+  'mistral-small-2506': { chatPrice: { input: 0.1, output: 0.3 }, benchmark: { cbaElo: 1356 } }, // Mistral Small 3.2
  'mistral-small-latest': { chatPrice: { input: 0.1, output: 0.3 }, hidden: true }, // symlink

  'labs-mistral-small-creative': { label: 'Mistral Small Creative', chatPrice: { input: 0.1, output: 0.3 } }, // creative writing, roleplay (Labs)
@@ -38,7 +38,7 @@ const _knownMoonshotModels: ManualMappings = [
    interfaces: IF_K2_5,
    chatPrice: { input: 0.60, output: 3.00, cache: { cType: 'oai-ac', read: 0.10 } },
    parameterSpecs: [{ paramId: 'llmVndMoonReasoningEffort' }],
-    benchmark: { cbaElo: 1417 + 2 }, // to be at the top
+    benchmark: { cbaElo: 1450 }, // kimi-k2.5-thinking
  },

  // Kimi K2 Series - Latest Models
@@ -52,7 +52,7 @@ const _knownMoonshotModels: ManualMappings = [
    maxCompletionTokens: 65536,
    interfaces: IF_K2_REASON,
    chatPrice: { input: 1.15, output: 8.00, cache: { cType: 'oai-ac', read: 0.15 } },
-    benchmark: { cbaElo: 1417 + 1 }, // UNKNOWN +1 over 0905, but don't want to be above the non-turbo
+    benchmark: { cbaElo: 1429 }, // kimi-k2-thinking-turbo
    // parameterSpecs: [{ paramId: 'llmVndMoonshotWebSearch' }], // NOT WORKING YET
  },
  // Thinking
@@ -78,7 +78,7 @@ const _knownMoonshotModels: ManualMappings = [
    interfaces: IF_K2,
    chatPrice: { input: 0.60, output: 2.50, cache: { cType: 'oai-ac', read: 0.15 } },
    isPreview: true,
-    benchmark: { cbaElo: 1417 },
+    benchmark: { cbaElo: 1418 }, // kimi-k2-0905-preview
    // parameterSpecs: [{ paramId: 'llmVndMoonshotWebSearch' }],
  },
  {
@@ -91,7 +91,7 @@ const _knownMoonshotModels: ManualMappings = [
    interfaces: IF_K2,
    chatPrice: { input: 0.60, output: 2.50, cache: { cType: 'oai-ac', read: 0.15 } },
    isPreview: true,
-    benchmark: { cbaElo: 1415 },
+    benchmark: { cbaElo: 1417 }, // kimi-k2-0711-preview
    // parameterSpecs: [{ paramId: 'llmVndMoonshotWebSearch' }],
  },
  {
@@ -90,7 +90,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
      { paramId: 'llmForceNoStream' },
    ],
    chatPrice: { input: 1.75, cache: { cType: 'oai-ac', read: 0.175 }, output: 14 },
-    // benchmark: TBD
+    benchmark: { cbaElo: 1440 }, // gpt-5.2-high
  },
  {
    idPrefix: 'gpt-5.2',
@@ -174,7 +174,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
      { paramId: 'llmForceNoStream' },
    ],
    chatPrice: { input: 1.25, cache: { cType: 'oai-ac', read: 0.125 }, output: 10 },
-    // benchmark: TBD
+    benchmark: { cbaElo: 1459 }, // gpt-5.1-high
  },
  {
    idPrefix: 'gpt-5.1',
@@ -267,7 +267,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
      { paramId: 'llmForceNoStream' }, // non-streaming option for unverified organizations
    ],
    chatPrice: { input: 1.25, cache: { cType: 'oai-ac', read: 0.125 }, output: 10 },
-    benchmark: { cbaElo: 1442 }, // gpt-5-high
+    benchmark: { cbaElo: 1435 }, // gpt-5-high
  },
  {
    idPrefix: 'gpt-5',
@@ -302,7 +302,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 16384,
    interfaces: [LLM_IF_OAI_Responses, LLM_IF_OAI_Chat, LLM_IF_OAI_Vision, LLM_IF_OAI_PromptCaching], // no function calling or reasoning
    chatPrice: { input: 1.25, cache: { cType: 'oai-ac', read: 0.125 }, output: 10 },
-    benchmark: { cbaElo: 1430 }, // gpt-5-chat
+    benchmark: { cbaElo: 1426 }, // gpt-5-chat
  },

  // GPT-5 Codex
@@ -351,7 +351,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    interfaces: [LLM_IF_OAI_Responses, ...IFS_CHAT_CACHE_REASON, LLM_IF_Tools_WebSearch, LLM_IF_HOTFIX_NoTemperature],
    parameterSpecs: [{ paramId: 'llmVndOaiReasoningEffort4' }, { paramId: 'llmVndOaiWebSearchContext' }, { paramId: 'llmVndOaiVerbosity' }, { paramId: 'llmVndOaiImageGeneration' }, { paramId: 'llmForceNoStream' }],
    chatPrice: { input: 0.25, cache: { cType: 'oai-ac', read: 0.025 }, output: 2 },
-    benchmark: { cbaElo: 1388 }, // gpt-5-mini-high
+    benchmark: { cbaElo: 1390 }, // gpt-5-mini-high
  },
  {
    idPrefix: 'gpt-5-mini',
@@ -369,7 +369,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    interfaces: [LLM_IF_OAI_Responses, ...IFS_CHAT_CACHE_REASON, LLM_IF_HOTFIX_NoTemperature],
    parameterSpecs: [{ paramId: 'llmVndOaiReasoningEffort4' }, { paramId: 'llmVndOaiVerbosity' }, { paramId: 'llmVndOaiImageGeneration' }],
    chatPrice: { input: 0.05, cache: { cType: 'oai-ac', read: 0.005 }, output: 0.4 },
-    benchmark: { cbaElo: 1344 }, // gpt-5-nano-high
+    benchmark: { cbaElo: 1338 }, // gpt-5-nano-high
  },
  {
    idPrefix: 'gpt-5-nano',
@@ -390,6 +390,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
      { paramId: 'llmForceNoStream' },
    ],
    // chatPrice: TBD - unknown pricing
+    benchmark: { cbaElo: 1354 }, // gpt-oss-120b
  },


@@ -455,7 +456,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    interfaces: IFS_CHAT_CACHE_REASON,
    parameterSpecs: [{ paramId: 'llmVndOaiReasoningEffort' }],
    chatPrice: { input: 1.1, cache: { cType: 'oai-ac', read: 0.275 }, output: 4.4 },
-    benchmark: { cbaElo: 1393 }, // o4-mini-2025-04-16
+    benchmark: { cbaElo: 1391 }, // o4-mini-2025-04-16
  },
  {
    idPrefix: 'o4-mini',
@@ -508,7 +509,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    interfaces: IFS_CHAT_CACHE_REASON,
    parameterSpecs: [{ paramId: 'llmVndOaiReasoningEffort' }, { paramId: 'llmForceNoStream' }],
    chatPrice: { input: 2, cache: { cType: 'oai-ac', read: 0.5 }, output: 8 },
-    benchmark: { cbaElo: 1444 }, // o3-2025-04-16
+    benchmark: { cbaElo: 1433 }, // o3-2025-04-16
  },
  {
    idPrefix: 'o3',
@@ -526,7 +527,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn, LLM_IF_OAI_Json, LLM_IF_OAI_PromptCaching, LLM_IF_OAI_Reasoning, LLM_IF_HOTFIX_StripImages],
    parameterSpecs: [{ paramId: 'llmVndOaiReasoningEffort' }],
    chatPrice: { input: 1.1, cache: { cType: 'oai-ac', read: 0.55 }, output: 4.4 },
-    benchmark: { cbaElo: 1347 }, // o3-mini (not using o3-mini-high here, as it seems too inflated)
+    benchmark: { cbaElo: 1348 }, // o3-mini
  },
  {
    idPrefix: 'o3-mini',
@@ -563,7 +564,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    interfaces: IFS_CHAT_CACHE_REASON,
    parameterSpecs: [{ paramId: 'llmVndOaiReasoningEffort' }, { paramId: 'llmVndOaiRestoreMarkdown' }],
    chatPrice: { input: 15, cache: { cType: 'oai-ac', read: 7.5 }, output: 60 },
-    benchmark: { cbaElo: 1399 }, // o1-2024-12-17
+    benchmark: { cbaElo: 1402 }, // o1-2024-12-17
  },
  {
    idPrefix: 'o1',
@@ -583,7 +584,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 32768,
    interfaces: IFS_CHAT_CACHE,
    chatPrice: { input: 2, cache: { cType: 'oai-ac', read: 0.5 }, output: 8 },
-    benchmark: { cbaElo: 1409 }, // gpt-4.1-2025-04-14
+    benchmark: { cbaElo: 1413 }, // gpt-4.1-2025-04-14
  },
  {
    idPrefix: 'gpt-4.1',
@@ -600,7 +601,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 32768,
    interfaces: IFS_CHAT_CACHE,
    chatPrice: { input: 0.4, cache: { cType: 'oai-ac', read: 0.1 }, output: 1.6 },
-    benchmark: { cbaElo: 1377 }, // gpt-4.1-mini-2025-04-14
+    benchmark: { cbaElo: 1382 }, // gpt-4.1-mini-2025-04-14
  },
  {
    idPrefix: 'gpt-4.1-mini',
@@ -617,7 +618,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 32768,
    interfaces: IFS_CHAT_CACHE,
    chatPrice: { input: 0.1, cache: { cType: 'oai-ac', read: 0.025 }, output: 0.4 },
-    benchmark: { cbaElo: 1320 }, // gpt-4.1-nano-2025-04-14
+    benchmark: { cbaElo: 1322 }, // gpt-4.1-nano-2025-04-14
  },
  {
    idPrefix: 'gpt-4.1-nano',
@@ -694,7 +695,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 16384,
    interfaces: IFS_CHAT_CACHE,
    chatPrice: { input: 2.5, cache: { cType: 'oai-ac', read: 1.25 }, output: 10 },
-    benchmark: { cbaElo: 1333 }, // GPT-4o (08/06)
+    benchmark: { cbaElo: 1335 }, // gpt-4o-2024-08-06
  },
  {
    idPrefix: 'gpt-4o-2024-05-13',
@@ -705,7 +706,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 4096,
    interfaces: IFS_CHAT_MIN,
    chatPrice: { input: 5, output: 15 },
-    benchmark: { cbaElo: 1344 }, // gpt-4o-2024-05-13
+    benchmark: { cbaElo: 1346 }, // gpt-4o-2024-05-13
  },
  {
    idPrefix: 'gpt-4o',
@@ -720,7 +721,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 16384,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Vision, LLM_IF_OAI_Json], // does not support Tools
    chatPrice: { input: 5, output: 15 },
-    benchmark: { cbaElo: 1441 }, // chatgpt-4o-latest-20250326
+    benchmark: { cbaElo: 1443 }, // chatgpt-4o-latest-20250326
    isLegacy: true, // Deprecated February 17, 2026.
  },

@@ -784,7 +785,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 16384,
    interfaces: IFS_CHAT_CACHE,
    chatPrice: { input: 0.15, cache: { cType: 'oai-ac', read: 0.075 }, output: 0.6 },
-    benchmark: { cbaElo: 1316 }, // GPT-4o-mini (07/18)
+    benchmark: { cbaElo: 1318 }, // gpt-4o-mini-2024-07-18
  },
  {
    idPrefix: 'gpt-4o-mini',
@@ -838,7 +839,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 4096,
    interfaces: IFS_CHAT_MIN,
    chatPrice: { input: 10, output: 30 },
-    benchmark: { cbaElo: 1324 }, // gpt-4-turbo-2024-04-09
+    benchmark: { cbaElo: 1325 }, // gpt-4-turbo-2024-04-09
  },
  {
    idPrefix: 'gpt-4-turbo',
@@ -854,7 +855,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 4096,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn, LLM_IF_OAI_Json],
    chatPrice: { input: 10, output: 30 },
-    benchmark: { cbaElo: 1315 }, // gpt-4-0125-preview
+    benchmark: { cbaElo: 1314 }, // gpt-4-0125-preview
  },
  {
    idPrefix: 'gpt-4-1106-preview', // GPT-4 Turbo preview model
@@ -865,7 +866,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 4096,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn, LLM_IF_OAI_Json],
    chatPrice: { input: 10, output: 30 },
-    benchmark: { cbaElo: 1315 }, // gpt-4-1106-preview
+    benchmark: { cbaElo: 1314 }, // gpt-4-1106-preview
  },
  {
    idPrefix: 'gpt-4-turbo-preview',
@@ -883,7 +884,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    contextWindow: 8192,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
    chatPrice: { input: 30, output: 60 },
-    benchmark: { cbaElo: 1163 },
+    benchmark: { cbaElo: 1276 }, // gpt-4-0613
    isLegacy: true,
  },
  {
@@ -894,7 +895,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    contextWindow: 8192,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
    chatPrice: { input: 30, output: 60 },
-    benchmark: { cbaElo: 1186 },
+    benchmark: { cbaElo: 1288 }, // gpt-4-0314
    isLegacy: true,
  },
  {
@@ -917,7 +918,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 4096,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
    chatPrice: { input: 0.5, output: 1.5 },
-    benchmark: { cbaElo: 1106 },
+    benchmark: { cbaElo: 1225 }, // gpt-3.5-turbo-0125
  },
  {
    idPrefix: 'gpt-3.5-turbo-1106',
@@ -928,7 +929,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
    maxCompletionTokens: 4096,
    interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
    chatPrice: { input: 1, output: 2 },
-    benchmark: { cbaElo: 1068 },
+    benchmark: { cbaElo: 1204 }, // gpt-3.5-turbo-1106
  },
  {
    idPrefix: 'gpt-3.5-turbo',
@@ -83,7 +83,7 @@ const _knownXAIChatModels: ManualMappings = [
    interfaces: [...XAI_IF_Vision, LLM_IF_OAI_Reasoning],
    parameterSpecs: XAI_PAR_Reasoning,
    chatPrice: PRICE_41,
-    benchmark: { cbaElo: 1483 }, // grok-4-1-fast-reasoning
+    benchmark: { cbaElo: 1430 }, // grok-4-1-fast-reasoning
  },
  {
    idPrefix: 'grok-4-1-fast-non-reasoning',
@@ -94,7 +94,7 @@ const _knownXAIChatModels: ManualMappings = [
    interfaces: XAI_IF_Vision,
    parameterSpecs: XAI_PAR,
    chatPrice: PRICE_41,
-    benchmark: { cbaElo: 1465 }, // grok-4-1-fast-non-reasoning
+    benchmark: { cbaElo: 1466 }, // grok-4.1
  },

  // Grok 4
@@ -108,7 +108,7 @@ const _knownXAIChatModels: ManualMappings = [
    interfaces: [...XAI_IF_Vision, LLM_IF_OAI_Reasoning],
    parameterSpecs: XAI_PAR_Reasoning,
    chatPrice: PRICE_40,
-    benchmark: { cbaElo: 1420 },
+    benchmark: { cbaElo: 1404 }, // grok-4-fast-reasoning
  },
  {
    hidden: true, // yield to 4.1
@@ -120,7 +120,6 @@ const _knownXAIChatModels: ManualMappings = [
    interfaces: XAI_IF_Vision,
    parameterSpecs: XAI_PAR,
    chatPrice: PRICE_40,
-    benchmark: { cbaElo: 1409 },
  },
  {
    idPrefix: 'grok-4-0709',
@@ -131,7 +130,7 @@ const _knownXAIChatModels: ManualMappings = [
    interfaces: [...XAI_IF_Vision, LLM_IF_OAI_Reasoning],
    parameterSpecs: XAI_PAR_Reasoning,
    chatPrice: { input: 3, output: 15, cache: { cType: 'oai-ac', read: 0.75 } },
-    benchmark: { cbaElo: 1415 + 6 }, // grok-4-0709 (+6 to stay on top of the fast)
+    benchmark: { cbaElo: 1410 }, // grok-4-0709
  },

  // Grok 3 (Pre-Grok 4: no server-side tools)
@@ -144,7 +143,7 @@ const _knownXAIChatModels: ManualMappings = [
    interfaces: XAI_IF_Pre4,
    parameterSpecs: XAI_PAR_Pre4,
    chatPrice: { input: 3, output: 15, cache: { cType: 'oai-ac', read: 0.75 } },
-    benchmark: { cbaElo: 1409 }, // grok-3-preview-02-24
+    benchmark: { cbaElo: 1411 }, // grok-3-preview-02-24
  },
  {
    idPrefix: 'grok-3-mini',
@@ -155,7 +154,7 @@ const _knownXAIChatModels: ManualMappings = [
    interfaces: [...XAI_IF_Pre4, LLM_IF_OAI_Reasoning],
    parameterSpecs: XAI_PAR_Pre4,
    chatPrice: { input: 0.3, output: 0.5, cache: { cType: 'oai-ac', read: 0.075 } },
-    benchmark: { cbaElo: 1358 }, // grok-3-mini-beta (updated from CSV)
+    benchmark: { cbaElo: 1357 }, // grok-3-mini-beta
  },

  // Grok Code (Pre-Grok 4: no server-side tools)
@@ -181,8 +180,7 @@ const _knownXAIChatModels: ManualMappings = [
    interfaces: XAI_IF_Pre4_Vision,
    parameterSpecs: XAI_PAR_Pre4,
    chatPrice: { input: 2, output: 10 },
-    // Fuzzy matched with "grok-2-2024-08-13" (1288) => wrong, but still we need a fallback
-    benchmark: { cbaElo: 1288 },
+    // no benchmark: the only leaderboard match was a fuzzy (wrong) one to "grok-2-2024-08-13" (1288), so keep this out
  },

 ] as const;