LLMs: sync scores

This commit is contained in:
Enrico Ros
2026-04-20 23:13:56 -07:00
parent c8e7315de3
commit 205fb1bb5b
7 changed files with 71 additions and 66 deletions
@@ -77,7 +77,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
{ paramId: 'llmVndAntEffort', enumValues: ['low', 'medium', 'high', 'xhigh', 'max'] },
...ANT_TOOLS_DYNAMIC,
],
// benchmark: { cbaElo: ... }, // TBD
benchmark: { cbaElo: 1504 }, // claude-opus-4-7-thinking
},
// Claude 4.6 models with thinking variants
@@ -92,7 +92,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
{ paramId: 'llmVndAntInfSpeed' },
...ANT_TOOLS_DYNAMIC,
],
// benchmark: { cbaElo: ... }, // TBD
benchmark: { cbaElo: 1502 }, // claude-opus-4-6-thinking
},
'claude-sonnet-4-6': {
@@ -105,7 +105,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
{ paramId: 'llmVndAntEffort', enumValues: ['low', 'medium', 'high', 'max'] },
...ANT_TOOLS_DYNAMIC,
],
// benchmark: { cbaElo: ... }, // TBD
benchmark: { cbaElo: 1463 + 1 }, // 1 (thinking) + claude-sonnet-4-6
},
// Claude 4.5 models with thinking variants
@@ -119,7 +119,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
{ paramId: 'llmVndAntEffort', enumValues: ['low', 'medium', 'high'] },
...ANT_TOOLS,
],
benchmark: { cbaElo: 1468 }, // claude-opus-4-5-20251101-thinking-32k
benchmark: { cbaElo: 1473 }, // claude-opus-4-5-20251101-thinking-32k
maxCompletionTokens: 32000,
},
@@ -134,7 +134,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
{ paramId: 'llmVndAnt1MContext' },
...ANT_TOOLS,
],
benchmark: { cbaElo: 1450 }, // claude-sonnet-4-5-20250929-thinking-32k
benchmark: { cbaElo: 1452 }, // claude-sonnet-4-5-20250929-thinking-32k
},
'claude-haiku-4-5-20251001': {
@@ -147,6 +147,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
{ paramId: 'llmVndAntThinkingBudget' },
...ANT_TOOLS,
],
benchmark: { cbaElo: 1408 + 1 }, // 1 (thinking) + claude-haiku-4-5-20251001
},
// Claude 4.1 models with thinking variants
@@ -160,7 +161,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
{ paramId: 'llmVndAntThinkingBudget' },
...ANT_TOOLS,
],
benchmark: { cbaElo: 1448 }, // claude-opus-4-1-20250805-thinking-16k
benchmark: { cbaElo: 1449 }, // claude-opus-4-1-20250805-thinking-16k
},
// Claude 4 models with thinking variants
@@ -189,7 +190,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
{ paramId: 'llmVndAnt1MContext' },
...ANT_TOOLS,
],
benchmark: { cbaElo: 1400 }, // claude-sonnet-4-20250514-thinking-32k
benchmark: { cbaElo: 1399 }, // claude-sonnet-4-20250514-thinking-32k
},
// Changes to the thinking variant (same model ID) for the Claude Sonnet 3.7 model
@@ -203,7 +204,7 @@ const _hardcodedAnthropicThinkingVariants: ModelVariantMap & { [id: string]: { i
{ paramId: 'llmVndAntThinkingBudget' },
...ANT_TOOLS,
],
benchmark: { cbaElo: 1389 }, // claude-3-7-sonnet-20250219-thinking-32k
benchmark: { cbaElo: 1387 }, // claude-3-7-sonnet-20250219-thinking-32k
},
} as const;
@@ -231,7 +232,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
// Breaking changes vs 4.6: extended thinking budgets removed (adaptive-only), temperature/top_p/top_k rejected,
// thinking content omitted by default, new tokenizer (~1x to 1.35x tokens for same text), no prefill.
chatPrice: { input: 5, output: 25, cache: { cType: 'ant-bp', read: 0.50, write: 6.25, duration: 300 } },
// benchmark: { cbaElo: ... }, // TBD
benchmark: { cbaElo: 1497 }, // claude-opus-4-7
},
// Claude 4.6 models
@@ -249,7 +250,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
],
// Opus 4.6: flat $5/$25 pricing (1M context GA at standard pricing since 2026-03-13, no opt-in required)
chatPrice: { input: 5, output: 25, cache: { cType: 'ant-bp', read: 0.50, write: 6.25, duration: 300 } },
// benchmark: { cbaElo: ... }, // TBD
benchmark: { cbaElo: 1496 }, // claude-opus-4-6
},
{
id: 'claude-sonnet-4-6', // Active
@@ -264,7 +265,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
],
// Sonnet 4.6: flat $3/$15 pricing (1M context GA at standard pricing since 2026-03-13, no opt-in required)
chatPrice: { input: 3, output: 15, cache: { cType: 'ant-bp', read: 0.30, write: 3.75, duration: 300 } },
// benchmark: { cbaElo: ... }, // TBD
benchmark: { cbaElo: 1463 }, // claude-sonnet-4-6
},
// Claude 4.5 models
@@ -280,7 +281,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
...ANT_TOOLS,
],
chatPrice: { input: 5, output: 25, cache: { cType: 'ant-bp', read: 0.50, write: 6.25, duration: 300 } },
benchmark: { cbaElo: 1466 }, // claude-opus-4-5-20251101
benchmark: { cbaElo: 1469 }, // claude-opus-4-5-20251101
},
{
id: 'claude-sonnet-4-5-20250929', // Active
@@ -305,7 +306,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
duration: 300,
},
},
benchmark: { cbaElo: 1450 }, // claude-sonnet-4-5-20250929
benchmark: { cbaElo: 1452 }, // claude-sonnet-4-5-20250929
},
{
id: 'claude-haiku-4-5-20251001', // Active
@@ -316,7 +317,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
interfaces: IF_4,
parameterSpecs: ANT_TOOLS,
chatPrice: { input: 1, output: 5, cache: { cType: 'ant-bp', read: 0.10, write: 1.25, duration: 300 } },
benchmark: { cbaElo: 1403 }, // claude-haiku-4-5-20251001
benchmark: { cbaElo: 1408 }, // claude-haiku-4-5-20251001
},
// Claude 4.1 models
@@ -329,7 +330,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
interfaces: IF_4,
parameterSpecs: ANT_TOOLS,
chatPrice: { input: 15, output: 75, cache: { cType: 'ant-bp', read: 1.50, write: 18.75, duration: 300 } },
benchmark: { cbaElo: 1445 }, // claude-opus-4-1-20250805
benchmark: { cbaElo: 1447 }, // claude-opus-4-1-20250805
},
// Claude 4 models
@@ -343,7 +344,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
interfaces: IF_4,
parameterSpecs: ANT_TOOLS,
chatPrice: { input: 15, output: 75, cache: { cType: 'ant-bp', read: 1.50, write: 18.75, duration: 300 } },
benchmark: { cbaElo: 1414 }, // claude-opus-4-20250514
benchmark: { cbaElo: 1412 }, // claude-opus-4-20250514
isLegacy: true,
},
{
@@ -370,7 +371,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
duration: 300,
},
},
benchmark: { cbaElo: 1390 }, // claude-sonnet-4-20250514
benchmark: { cbaElo: 1389 }, // claude-sonnet-4-20250514
isLegacy: true,
},
@@ -384,7 +385,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
interfaces: IF_4,
parameterSpecs: ANT_TOOLS,
chatPrice: { input: 3, output: 15, cache: { cType: 'ant-bp', read: 0.30, write: 3.75, duration: 300 } },
benchmark: { cbaElo: 1372 }, // claude-3-7-sonnet-20250219
benchmark: { cbaElo: 1371 }, // claude-3-7-sonnet-20250219
hidden: true, // retired
isLegacy: true,
},
@@ -401,7 +402,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
interfaces: IF_4,
parameterSpecs: ANT_TOOLS,
chatPrice: { input: 0.80, output: 4.00, cache: { cType: 'ant-bp', read: 0.08, write: 1.00, duration: 300 } },
benchmark: { cbaElo: 1324 }, // claude-3-5-haiku-20241022
benchmark: { cbaElo: 1323 }, // claude-3-5-haiku-20241022
hidden: true, // retired
isLegacy: true,
},
@@ -417,7 +418,7 @@ export const hardcodedAnthropicModels: (ModelDescriptionSchema & { isLegacy?: bo
maxCompletionTokens: 4096,
interfaces: IF_4,
chatPrice: { input: 0.25, output: 1.25, cache: { cType: 'ant-bp', read: 0.03, write: 0.30, duration: 300 } },
benchmark: { cbaElo: 1262 }, // claude-3-haiku-20240307
benchmark: { cbaElo: 1260 }, // claude-3-haiku-20240307
isLegacy: true,
},
+13 -11
View File
@@ -208,7 +208,7 @@ const _knownGeminiModels: ({
{ paramId: 'llmVndGeminiGoogleSearch' },
// { paramId: 'llmVndGeminiComputerUse' }, // we don't have the logic to handle this yet
],
benchmark: undefined, // too new for CBA ELO (released Feb 19, 2026)
benchmark: { cbaElo: 1493 }, // gemini-3.1-pro-preview
},
// 3.1 Pro (Preview) - Custom Tools variant - Released February 19, 2026
// Better at prioritizing custom tools for users building with a mix of bash and tools
@@ -225,7 +225,7 @@ const _knownGeminiModels: ({
{ paramId: 'llmVndGeminiCodeExecution' },
{ paramId: 'llmVndGeminiGoogleSearch' },
],
benchmark: undefined,
benchmark: { cbaElo: 1493 - 1 }, // -1 (deprio this variant) + gemini-3.1-pro-preview
},
// 3.1 Flash Image Preview - Released February 26, 2026
@@ -259,7 +259,7 @@ const _knownGeminiModels: ({
{ paramId: 'llmVndGeminiCodeExecution' },
{ paramId: 'llmVndGeminiGoogleSearch' },
],
benchmark: undefined, // too new (released March 3, 2026)
benchmark: { cbaElo: 1438 }, // gemini-3.1-flash-lite-preview
},
@@ -280,7 +280,7 @@ const _knownGeminiModels: ({
{ paramId: 'llmVndGeminiGoogleSearch' },
// { paramId: 'llmVndGeminiComputerUse' }, // we don't have the logic to handle this yet
],
benchmark: { cbaElo: 1487 }, // gemini-3-pro
benchmark: { cbaElo: 1486 }, // gemini-3-pro
},
// 3.0 Pro Image Preview - Released November 20, 2025
@@ -331,7 +331,7 @@ const _knownGeminiModels: ({
{ paramId: 'llmVndGeminiGoogleSearch' },
// { paramId: 'llmVndGeminiComputerUse' }, // we don't have the logic to handle this yet
],
benchmark: { cbaElo: 1471 }, // gemini-3-flash
benchmark: { cbaElo: 1474 }, // gemini-3-flash
},
/// Generation 2.5
@@ -350,7 +350,7 @@ const _knownGeminiModels: ({
},
{ paramId: 'llmVndGeminiGoogleSearch' },
],
benchmark: { cbaElo: 1450 }, // gemini-2.5-pro
benchmark: { cbaElo: 1448 }, // gemini-2.5-pro
},
// REMOVED MODELS (no longer returned by API as of Jan 8, 2026):
@@ -403,7 +403,7 @@ const _knownGeminiModels: ({
{ paramId: 'llmVndGeminiThinkingBudget' },
{ paramId: 'llmVndGeminiGoogleSearch' },
],
benchmark: { cbaElo: 1409 }, // gemini-2.5-flash
benchmark: { cbaElo: 1411 }, // gemini-2.5-flash
},
// REMOVED MODELS (no longer returned by API as of Nov 20, 2025):
@@ -559,7 +559,7 @@ const _knownGeminiModels: ({
deprecated: '2026-06-01',
chatPrice: gemini20FlashPricing,
interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Vision, LLM_IF_OAI_Fn, LLM_IF_GEM_CodeExecution],
benchmark: { cbaElo: 1361 }, // gemini-2.0-flash-001
benchmark: { cbaElo: 1360 }, // gemini-2.0-flash-001
},
{
id: 'models/gemini-2.0-flash',
@@ -568,7 +568,7 @@ const _knownGeminiModels: ({
// copied from symlink
chatPrice: gemini20FlashPricing,
interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Vision, LLM_IF_OAI_Fn, LLM_IF_GEM_CodeExecution],
benchmark: { cbaElo: 1361 }, // gemini-2.0-flash
benchmark: { cbaElo: 1360 }, // gemini-2.0-flash
},
// 2.0 Flash Lite - DEPRECATED: shutdown June 1, 2026 (announced Feb 18, 2026)
@@ -625,6 +625,7 @@ const _knownGeminiModels: ({
interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0],
parameterSpecs: [{ paramId: 'llmVndGemEffort', enumValues: ['minimal', 'high'] }],
chatPrice: geminiExpFree, // Free tier only according to pricing page
benchmark: { cbaElo: 1451 }, // gemma-4-31b
},
{
hidden: true, // smaller MoE variant
@@ -633,6 +634,7 @@ const _knownGeminiModels: ({
interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0],
parameterSpecs: [{ paramId: 'llmVndGemEffort', enumValues: ['minimal', 'high'] }],
chatPrice: geminiExpFree, // Free tier only according to pricing page
benchmark: { cbaElo: 1439 }, // gemma-4-26b-a4b
},
// Gemma 3n Model (newer than 3, first seen on the May 2025 update)
@@ -641,7 +643,7 @@ const _knownGeminiModels: ({
isPreview: true,
interfaces: [LLM_IF_OAI_Chat, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0],
chatPrice: geminiExpFree, // Free tier only according to pricing page
benchmark: { cbaElo: 1319 }, // gemma-3n-e4b-it
benchmark: { cbaElo: 1318 }, // gemma-3n-e4b-it
},
{
id: 'models/gemma-3n-e2b-it',
@@ -659,7 +661,7 @@ const _knownGeminiModels: ({
isPreview: true,
interfaces: [LLM_IF_OAI_Chat, LLM_IF_HOTFIX_StripImages, LLM_IF_HOTFIX_Sys0ToUsr0],
chatPrice: geminiExpFree, // Pricing page indicates free tier only
benchmark: { cbaElo: 1365 }, // gemma-3-27b-it
benchmark: { cbaElo: 1366 }, // gemma-3-27b-it
// hidden: true, // Keep visible if it's a distinct offering
},
{
@@ -22,7 +22,7 @@ const _knownDeepseekChatModels: ManualMappings = [
// ],
maxCompletionTokens: 32768, // default, max: 65536
chatPrice: { input: 0.28, output: 0.42, cache: { cType: 'oai-ac', read: 0.028 } },
benchmark: { cbaElo: 1412 }, //deepseek-v3.2-exp-thinking
benchmark: { cbaElo: 1425 }, // deepseek-v3.2-exp-thinking
},
{
idPrefix: 'deepseek-chat',
@@ -32,7 +32,7 @@ const _knownDeepseekChatModels: ManualMappings = [
interfaces: IF_3,
maxCompletionTokens: 8192, // default is 4096, max is 8192
chatPrice: { input: 0.28, output: 0.42, cache: { cType: 'oai-ac', read: 0.028 } },
benchmark: { cbaElo: 1420 }, // deepseek-v3.2
benchmark: { cbaElo: 1424 }, // deepseek-v3.2
},
];
@@ -25,16 +25,16 @@ const _knownMistralModelDetails: Record<string, {
}> = {
// Premier models - Mistral 3 (Dec 2025)
'mistral-large-2512': { chatPrice: { input: 0.5, output: 1.5 }, benchmark: { cbaElo: 1414 } }, // Mistral Large 3 - MoE 41B active / 675B total
'mistral-large-2512': { chatPrice: { input: 0.5, output: 1.5 }, benchmark: { cbaElo: 1415 } }, // Mistral Large 3 - MoE 41B active / 675B total
'mistral-large-2411': { chatPrice: { input: 2, output: 6 }, benchmark: { cbaElo: 1305 }, hidden: true }, // older version
'mistral-large-latest': { chatPrice: { input: 0.5, output: 1.5 }, hidden: true }, // → 2512
'mistral-medium-2508': { chatPrice: { input: 0.4, output: 2 }, benchmark: { cbaElo: 1412 } }, // Mistral Medium 3
'mistral-medium-2505': { chatPrice: { input: 0.4, output: 2 }, benchmark: { cbaElo: 1384 }, hidden: true }, // older version
'mistral-medium-2508': { chatPrice: { input: 0.4, output: 2 }, benchmark: { cbaElo: 1410 } }, // Mistral Medium 3
'mistral-medium-2505': { chatPrice: { input: 0.4, output: 2 }, benchmark: { cbaElo: 1387 }, hidden: true }, // older version
'mistral-medium-latest': { chatPrice: { input: 0.4, output: 2 }, hidden: true }, // → 2508
'mistral-medium': { chatPrice: { input: 0.4, output: 2 }, hidden: true }, // symlink
'magistral-medium-2509': { chatPrice: { input: 2, output: 5 }, benchmark: { cbaElo: 1305 } }, // reasoning
'magistral-medium-2509': { chatPrice: { input: 2, output: 5 }, benchmark: { cbaElo: 1304 } }, // reasoning (leaderboard: magistral-medium-2506 = 1304)
'magistral-medium-latest': { chatPrice: { input: 2, output: 5 }, hidden: true }, // symlink
'devstral-2512': { label: 'Devstral 2 (2512)', chatPrice: { input: 0.4, output: 2 } }, // Devstral 2 - 123B coding agents (API returns "Mistral Vibe Cli")
@@ -70,7 +70,7 @@ const _knownMistralModelDetails: Record<string, {
// Open models
'mistral-small-2603': { chatPrice: { input: 0.15, output: 0.6 } }, // Mistral Small 4 - 119B hybrid (instruct+reasoning+coding), 256k ctx
'mistral-small-2506': { chatPrice: { input: 0.1, output: 0.3 }, benchmark: { cbaElo: 1356 }, hidden: true }, // Mistral Small 3.2
'mistral-small-2506': { chatPrice: { input: 0.1, output: 0.3 }, benchmark: { cbaElo: 1357 }, hidden: true }, // Mistral Small 3.2
'mistral-small-latest': { chatPrice: { input: 0.15, output: 0.6 }, hidden: true }, // → 2603
'labs-mistral-small-creative': { label: 'Mistral Small Creative', chatPrice: { input: 0.1, output: 0.3 } }, // creative writing, roleplay (Labs)
@@ -42,7 +42,7 @@ const _knownMoonshotModels: ManualMappings = [
interfaces: IF_K2_5,
parameterSpecs: _PS_Reasoning,
chatPrice: { input: 0.60, output: 3.00, cache: { cType: 'oai-ac', read: 0.10 } },
benchmark: { cbaElo: 1450 }, // kimi-k2.5-thinking
benchmark: { cbaElo: 1451 }, // kimi-k2.5-thinking
},
// Kimi K2 Series - Latest Models
@@ -57,7 +57,7 @@ const _knownMoonshotModels: ManualMappings = [
interfaces: IF_K2_REASON,
// parameterSpecs: [{ paramId: 'llmVndMoonshotWebSearch' }], // NOT WORKING YET
chatPrice: { input: 1.15, output: 8.00, cache: { cType: 'oai-ac', read: 0.15 } },
benchmark: { cbaElo: 1429 }, // kimi-k2-thinking-turbo
benchmark: { cbaElo: 1430 }, // kimi-k2-thinking-turbo
},
// Thinking
{
@@ -107,7 +107,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
{ paramId: 'llmForceNoStream' },
],
chatPrice: { input: 2.5, cache: { cType: 'oai-ac', read: 0.25 }, output: 15 },
benchmark: { cbaElo: 1481 }, // gpt-5.4-high (preliminary)
benchmark: { cbaElo: 1482 }, // gpt-5.4-high
},
{
idPrefix: 'gpt-5.4',
@@ -156,7 +156,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
{ paramId: 'llmForceNoStream' },
],
chatPrice: { input: 0.75, cache: { cType: 'oai-ac', read: 0.075 }, output: 4.5 },
// benchmark: TBD
benchmark: { cbaElo: 1458 }, // gpt-5.4-mini-high
},
{
idPrefix: 'gpt-5.4-mini',
@@ -181,7 +181,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
{ paramId: 'llmForceNoStream' },
],
chatPrice: { input: 0.2, cache: { cType: 'oai-ac', read: 0.02 }, output: 1.25 },
// benchmark: TBD
benchmark: { cbaElo: 1404 }, // gpt-5.4-nano-high
},
{
idPrefix: 'gpt-5.4-nano',
@@ -242,7 +242,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
{ paramId: 'llmVndOaiCodeInterpreter' },
],
chatPrice: { input: 1.75, cache: { cType: 'oai-ac', read: 0.175 }, output: 14 },
// benchmark: TBD
benchmark: { cbaElo: 1451 }, // gpt-5.3-chat-latest
},
@@ -265,7 +265,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
{ paramId: 'llmForceNoStream' },
],
chatPrice: { input: 1.75, cache: { cType: 'oai-ac', read: 0.175 }, output: 14 },
benchmark: { cbaElo: 1440 }, // gpt-5.2-high
benchmark: { cbaElo: 1441 }, // gpt-5.2-high
},
{
idPrefix: 'gpt-5.2',
@@ -306,7 +306,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
{ paramId: 'llmVndOaiCodeInterpreter' },
],
chatPrice: { input: 1.75, cache: { cType: 'oai-ac', read: 0.175 }, output: 14 },
// benchmark: TBD
benchmark: { cbaElo: 1477 }, // gpt-5.2-chat-latest-20260210
},
// GPT-5.2 Pro
@@ -352,7 +352,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
{ paramId: 'llmForceNoStream' },
],
chatPrice: { input: 1.25, cache: { cType: 'oai-ac', read: 0.125 }, output: 10 },
benchmark: { cbaElo: 1459 }, // gpt-5.1-high
benchmark: { cbaElo: 1455 }, // gpt-5.1-high
},
{
idPrefix: 'gpt-5.1',
@@ -450,7 +450,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
{ paramId: 'llmForceNoStream' }, // non-streaming option for unverified organizations
],
chatPrice: { input: 1.25, cache: { cType: 'oai-ac', read: 0.125 }, output: 10 },
benchmark: { cbaElo: 1435 }, // gpt-5-high
benchmark: { cbaElo: 1433 }, // gpt-5-high
},
{
idPrefix: 'gpt-5',
@@ -553,7 +553,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
interfaces: [LLM_IF_OAI_Responses, ...IFS_CHAT_CACHE_REASON, LLM_IF_HOTFIX_NoTemperature],
parameterSpecs: [{ paramId: 'llmVndOaiEffort', enumValues: ['minimal', 'low', 'medium', 'high'] }, { paramId: 'llmVndOaiWebSearchContext' }, { paramId: 'llmVndOaiVerbosity' }, { paramId: 'llmVndOaiImageGeneration' }],
chatPrice: { input: 0.05, cache: { cType: 'oai-ac', read: 0.005 }, output: 0.4 },
benchmark: { cbaElo: 1338 }, // gpt-5-nano-high
benchmark: { cbaElo: 1337 }, // gpt-5-nano-high
},
{
idPrefix: 'gpt-5-nano',
@@ -575,7 +575,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
{ paramId: 'llmForceNoStream' },
],
// chatPrice: TBD - unknown pricing
benchmark: { cbaElo: 1354 }, // gpt-oss-120b
benchmark: { cbaElo: 1353 }, // gpt-oss-120b
},
@@ -630,7 +630,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
interfaces: IFS_CHAT_CACHE_REASON,
parameterSpecs: [{ paramId: 'llmVndOaiEffort', enumValues: ['low', 'medium', 'high', 'xhigh'] }],
chatPrice: { input: 1.1, cache: { cType: 'oai-ac', read: 0.275 }, output: 4.4 },
benchmark: { cbaElo: 1391 }, // o4-mini-2025-04-16
benchmark: { cbaElo: 1390 }, // o4-mini-2025-04-16
},
{
idPrefix: 'o4-mini',
@@ -683,7 +683,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
interfaces: IFS_CHAT_CACHE_REASON,
parameterSpecs: [{ paramId: 'llmVndOaiEffort', enumValues: ['low', 'medium', 'high', 'xhigh'] }, { paramId: 'llmForceNoStream' }],
chatPrice: { input: 2, cache: { cType: 'oai-ac', read: 0.5 }, output: 8 },
benchmark: { cbaElo: 1433 }, // o3-2025-04-16
benchmark: { cbaElo: 1431 }, // o3-2025-04-16
},
{
idPrefix: 'o3',
@@ -892,7 +892,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
maxCompletionTokens: 4096,
interfaces: IFS_CHAT_MIN,
chatPrice: { input: 5, output: 15 },
benchmark: { cbaElo: 1346 }, // gpt-4o-2024-05-13
benchmark: { cbaElo: 1345 }, // gpt-4o-2024-05-13
},
{
idPrefix: 'gpt-4o',
@@ -961,7 +961,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
maxCompletionTokens: 16384,
interfaces: IFS_CHAT_CACHE,
chatPrice: { input: 0.15, cache: { cType: 'oai-ac', read: 0.075 }, output: 0.6 },
benchmark: { cbaElo: 1318 }, // gpt-4o-mini-2024-07-18
benchmark: { cbaElo: 1317 }, // gpt-4o-mini-2024-07-18
},
{
idPrefix: 'gpt-4o-mini',
@@ -1015,7 +1015,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
maxCompletionTokens: 4096,
interfaces: IFS_CHAT_MIN,
chatPrice: { input: 10, output: 30 },
benchmark: { cbaElo: 1325 }, // gpt-4-turbo-2024-04-09
benchmark: { cbaElo: 1324 }, // gpt-4-turbo-2024-04-09
},
{
idPrefix: 'gpt-4-turbo',
@@ -1031,7 +1031,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
maxCompletionTokens: 4096,
interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
chatPrice: { input: 10, output: 30 },
benchmark: { cbaElo: 1314 }, // gpt-4-0125-preview
benchmark: { cbaElo: 1313 }, // gpt-4-0125-preview
},
{
idPrefix: 'gpt-4-1106-preview', // GPT-4 Turbo preview model
@@ -1042,7 +1042,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
maxCompletionTokens: 4096,
interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
chatPrice: { input: 10, output: 30 },
benchmark: { cbaElo: 1314 }, // gpt-4-1106-preview
benchmark: { cbaElo: 1312 }, // gpt-4-1106-preview
},
{
idPrefix: 'gpt-4-turbo-preview',
@@ -1060,7 +1060,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
contextWindow: 8192,
interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
chatPrice: { input: 30, output: 60 },
benchmark: { cbaElo: 1276 }, // gpt-4-0613
benchmark: { cbaElo: 1274 }, // gpt-4-0613
isLegacy: true,
},
{
@@ -1071,7 +1071,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
contextWindow: 8192,
interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
chatPrice: { input: 30, output: 60 },
benchmark: { cbaElo: 1288 }, // gpt-4-0314
benchmark: { cbaElo: 1286 }, // gpt-4-0314
isLegacy: true,
},
{
@@ -1094,7 +1094,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
maxCompletionTokens: 4096,
interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
chatPrice: { input: 0.5, output: 1.5 },
benchmark: { cbaElo: 1225 }, // gpt-3.5-turbo-0125
benchmark: { cbaElo: 1223 }, // gpt-3.5-turbo-0125
},
{
idPrefix: 'gpt-3.5-turbo-1106',
@@ -1105,7 +1105,7 @@ export const _knownOpenAIChatModels: ManualMappings = [
maxCompletionTokens: 4096,
interfaces: [LLM_IF_OAI_Chat, LLM_IF_OAI_Fn],
chatPrice: { input: 1, output: 2 },
benchmark: { cbaElo: 1204 }, // gpt-3.5-turbo-1106
benchmark: { cbaElo: 1202 }, // gpt-3.5-turbo-1106
},
{
idPrefix: 'gpt-3.5-turbo',
@@ -92,7 +92,7 @@ const _knownXAIChatModels: ManualMappings = [
interfaces: [...XAI_IF_Vision, LLM_IF_OAI_Reasoning],
parameterSpecs: XAI_PAR_Reasoning,
chatPrice: PRICE_420,
benchmark: { cbaElo: 1481 }, // grok-4.20-beta-0309-reasoning (CBA name)
benchmark: { cbaElo: 1480 }, // grok-4.20-beta-0309-reasoning (CBA name)
},
{
idPrefix: 'grok-4.20-0309-non-reasoning',
@@ -103,7 +103,7 @@ const _knownXAIChatModels: ManualMappings = [
interfaces: XAI_IF_Vision,
parameterSpecs: XAI_PAR,
chatPrice: PRICE_420,
benchmark: { cbaElo: 1492 }, // grok-4.20-beta1 (CBA name, preliminary)
benchmark: { cbaElo: 1482 }, // grok-4.20-beta1 (CBA name)
},
{
idPrefix: 'grok-4.20-multi-agent-0309',
@@ -118,6 +118,7 @@ const _knownXAIChatModels: ManualMappings = [
...XAI_PAR_Reasoning,
],
chatPrice: PRICE_420,
benchmark: { cbaElo: 1474 }, // grok-4.20-multi-agent-beta-0309
},
// Grok 4.1
@@ -130,7 +131,7 @@ const _knownXAIChatModels: ManualMappings = [
interfaces: [...XAI_IF_Vision, LLM_IF_OAI_Reasoning],
parameterSpecs: XAI_PAR_Reasoning,
chatPrice: PRICE_41,
benchmark: { cbaElo: 1430 }, // grok-4-1-fast-reasoning
benchmark: { cbaElo: 1432 }, // grok-4-1-fast-reasoning
},
{
idPrefix: 'grok-4-1-fast-non-reasoning',
@@ -141,7 +142,7 @@ const _knownXAIChatModels: ManualMappings = [
interfaces: XAI_IF_Vision,
parameterSpecs: XAI_PAR,
chatPrice: PRICE_41,
benchmark: { cbaElo: 1466 }, // grok-4.1
benchmark: { cbaElo: 1461 }, // grok-4.1
},
// Grok 4
@@ -167,6 +168,7 @@ const _knownXAIChatModels: ManualMappings = [
interfaces: XAI_IF_Vision,
parameterSpecs: XAI_PAR,
chatPrice: PRICE_40,
benchmark: { cbaElo: 1421 }, // grok-4-fast-chat
},
{
hidden: true, // yield to 4.20
@@ -191,7 +193,7 @@ const _knownXAIChatModels: ManualMappings = [
interfaces: XAI_IF_Pre4,
parameterSpecs: XAI_PAR_Pre4,
chatPrice: { input: 3, output: 15, cache: { cType: 'oai-ac', read: 0.75 } },
benchmark: { cbaElo: 1411 }, // grok-3-preview-02-24
benchmark: { cbaElo: 1412 }, // grok-3-preview-02-24
},
{
idPrefix: 'grok-3-mini',