From 51d58223b434ce0ed41383ebccdfeaabdf14b0b9 Mon Sep 17 00:00:00 2001 From: Enrico Ros Date: Wed, 4 Feb 2026 19:12:50 -0800 Subject: [PATCH] Sweep: more succinct output --- .../llm-gemini-parameters-sweep.json | 3 +- .../llm-openai-parameters-sweep.json | 99 ++--- .../llm-xai-parameters-sweep.json | 16 +- .../sweep-config.template.json | 1 - tools/develop/llm-parameter-sweep/sweep.ts | 381 +++++++++--------- 5 files changed, 253 insertions(+), 247 deletions(-) diff --git a/tools/develop/llm-parameter-sweep/llm-gemini-parameters-sweep.json b/tools/develop/llm-parameter-sweep/llm-gemini-parameters-sweep.json index 1f01d9f43..d560d47a6 100644 --- a/tools/develop/llm-parameter-sweep/llm-gemini-parameters-sweep.json +++ b/tools/develop/llm-parameter-sweep/llm-gemini-parameters-sweep.json @@ -1,6 +1,7 @@ { "_comment": "API-validated parameter values. null=undefined/missing. Values are tested and working. Note: temperature is continuous, not discrete.", "_evaluated": "Evaluated: gemini-thinking-budget, gemini-thinking-level, temperature. If missing, the parameter is not supported by that model.", + "_modelFilter": "models/gemini-3, models/gemini-2, models/gemma", "models/gemini-2.0-flash-001": { "gemini-thinking-budget": [0], "temperature-range": [0,2] @@ -32,7 +33,7 @@ "temperature-range": [0,2] }, "models/gemini-2.5-flash-preview-tts": { - "temperature": [0,0.5,1.5,2] + "temperature-range": [0,2] }, "models/gemini-2.5-pro": { "gemini-thinking-budget": [1024,16384,24576,32768], diff --git a/tools/develop/llm-parameter-sweep/llm-openai-parameters-sweep.json b/tools/develop/llm-parameter-sweep/llm-openai-parameters-sweep.json index 16c1cec4d..3142baf2f 100644 --- a/tools/develop/llm-parameter-sweep/llm-openai-parameters-sweep.json +++ b/tools/develop/llm-parameter-sweep/llm-openai-parameters-sweep.json @@ -1,107 +1,98 @@ { "_comment": "API-validated parameter values. null=undefined/missing. Values are tested and working. Note: temperature is continuous, not discrete.", "_evaluated": "Evaluated: oai-image-generation, oai-reasoning-effort, oai-verbosity, oai-web-search, temperature. If missing, the parameter is not supported by that model.", + "_modelFilter": "gpt-5, o", "gpt-5-2025-08-07": { - "oai-image-generation": ["hq"], "oai-reasoning-effort": ["minimal","low","medium","high"], "oai-verbosity": ["low","medium","high"], - "oai-web-search": ["medium"], - "temperature-range": [0,2] + "temperature-range": [0,2], + "tools": ["oai-image-generation","oai-web-search"] }, "gpt-5-chat-latest": { - "oai-image-generation": ["hq"], "oai-verbosity": ["medium"], - "oai-web-search": ["medium"], - "temperature-range": [0,2] + "temperature-range": [0,2], + "tools": ["oai-image-generation","oai-web-search"] }, "gpt-5-codex": { "oai-reasoning-effort": ["low","medium","high"], "oai-verbosity": ["medium"], - "oai-web-search": ["medium"], - "temperature-range": [0,2] + "temperature-range": [0,2], + "tools": ["oai-web-search"] }, "gpt-5-mini-2025-08-07": { - "oai-image-generation": ["hq"], "oai-reasoning-effort": ["minimal","low","medium","high"], "oai-verbosity": ["low","medium","high"], - "oai-web-search": ["medium"], - "temperature-range": [0,2] + "temperature-range": [0,2], + "tools": ["oai-image-generation","oai-web-search"] }, "gpt-5-nano-2025-08-07": { - "oai-image-generation": ["hq"], "oai-reasoning-effort": ["minimal","low","medium","high"], "oai-verbosity": ["low","medium","high"], - "oai-web-search": ["medium"], - "temperature-range": [0,2] + "temperature-range": [0,2], + "tools": ["oai-image-generation","oai-web-search"] }, "gpt-5-pro-2025-10-06": { - "oai-image-generation": ["hq"], "oai-reasoning-effort": ["high"], "oai-verbosity": ["low","medium","high"], - "oai-web-search": ["medium"], - "temperature-range": [0,2] + "temperature-range": [0,2], + "tools": ["oai-image-generation","oai-web-search"] }, "gpt-5-search-api-2025-10-14": { "temperature-range": [0,2] }, "gpt-5.1-2025-11-13": { - "oai-image-generation": ["hq"], "oai-reasoning-effort": ["none","low","medium","high"], "oai-verbosity": ["low","medium","high"], - "oai-web-search": ["medium"], - "temperature-range": [0,2] + "temperature-range": [0,2], + "tools": ["oai-image-generation","oai-web-search"] }, "gpt-5.1-chat-latest": { - "oai-image-generation": ["hq"], "oai-reasoning-effort": ["medium"], "oai-verbosity": ["medium"], - "oai-web-search": ["medium"], - "temperature": [1] + "temperature": [1], + "tools": ["oai-image-generation","oai-web-search"] }, "gpt-5.1-codex": { "oai-reasoning-effort": ["low","medium","high"], "oai-verbosity": ["medium"], - "oai-web-search": ["medium"], - "temperature": [1] + "temperature": [1], + "tools": ["oai-web-search"] }, "gpt-5.1-codex-max": { "oai-reasoning-effort": ["low","medium","high","xhigh"], "oai-verbosity": ["medium"], - "oai-web-search": ["medium"], - "temperature": [1] + "temperature": [1], + "tools": ["oai-web-search"] }, "gpt-5.1-codex-mini": { "oai-reasoning-effort": ["low","medium","high"], "oai-verbosity": ["medium"], - "oai-web-search": ["medium"], - "temperature": [1] + "temperature": [1], + "tools": ["oai-web-search"] }, "gpt-5.2-2025-12-11": { - "oai-image-generation": ["hq"], "oai-reasoning-effort": ["none","low","medium","high","xhigh"], "oai-verbosity": ["low","medium","high"], - "oai-web-search": ["medium"], - "temperature-range": [0,2] + "temperature-range": [0,2], + "tools": ["oai-image-generation","oai-web-search"] }, "gpt-5.2-chat-latest": { - "oai-image-generation": ["hq"], "oai-reasoning-effort": ["medium"], "oai-verbosity": ["medium"], - "oai-web-search": ["medium"], - "temperature": [1] + "temperature": [1], + "tools": ["oai-image-generation","oai-web-search"] }, "gpt-5.2-codex": { - "oai-reasoning-effort": ["low", "medium","high","xhigh"], + "oai-reasoning-effort": ["low","medium","high","xhigh"], "oai-verbosity": ["medium"], - "oai-web-search": ["medium"], - "temperature": [1] + "temperature": [1], + "tools": ["oai-web-search"] }, "gpt-5.2-pro-2025-12-11": { - "oai-image-generation": ["hq"], "oai-reasoning-effort": ["medium","high","xhigh"], "oai-verbosity": ["low","medium","high"], - "oai-web-search": ["medium"], - "temperature": [1] + "temperature": [1], + "tools": ["oai-image-generation","oai-web-search"] }, "o1-2024-12-17": { "oai-reasoning-effort": ["low","medium","high","xhigh"], @@ -114,41 +105,39 @@ "temperature-range": [0,2] }, "o3-2025-04-16": { - "oai-image-generation": ["hq"], "oai-reasoning-effort": ["low","medium","high","xhigh"], "oai-verbosity": ["medium"], - "oai-web-search": ["medium"], - "temperature-range": [0,2] + "temperature-range": [0,2], + "tools": ["oai-image-generation","oai-web-search"] }, "o3-deep-research-2025-06-26": { "oai-reasoning-effort": ["medium"], "oai-verbosity": ["medium"], - "oai-web-search": ["medium"], - "temperature-range": [0,2] + "temperature-range": [0,2], + "tools": ["oai-web-search"] }, "o3-mini-2025-01-31": { - "oai-image-generation": ["hq"], "oai-reasoning-effort": ["low","medium","high","xhigh"], "oai-verbosity": ["medium"], - "temperature-range": [0,2] + "temperature-range": [0,2], + "tools": ["oai-image-generation"] }, "o3-pro-2025-06-10": { - "oai-image-generation": ["hq"], "oai-reasoning-effort": ["low","medium","high"], "oai-verbosity": ["medium"], - "oai-web-search": ["medium"], - "temperature-range": [0,2] + "temperature-range": [0,2], + "tools": ["oai-image-generation","oai-web-search"] }, "o4-mini-2025-04-16": { "oai-reasoning-effort": ["low","medium","high","xhigh"], "oai-verbosity": ["medium"], - "oai-web-search": ["medium"], - "temperature-range": [0,2] + "temperature-range": [0,2], + "tools": ["oai-web-search"] }, "o4-mini-deep-research-2025-06-26": { "oai-reasoning-effort": ["medium"], "oai-verbosity": ["medium"], - "oai-web-search": ["medium"], - "temperature-range": [0,2] + "temperature-range": [0,2], + "tools": ["oai-web-search"] } } diff --git a/tools/develop/llm-parameter-sweep/llm-xai-parameters-sweep.json b/tools/develop/llm-parameter-sweep/llm-xai-parameters-sweep.json index 4644940a1..498b84c3b 100644 --- a/tools/develop/llm-parameter-sweep/llm-xai-parameters-sweep.json +++ b/tools/develop/llm-parameter-sweep/llm-xai-parameters-sweep.json @@ -2,7 +2,7 @@ "_comment": "API-validated parameter values. null=undefined/missing. Values are tested and working. Note: temperature is continuous, not discrete.", "_evaluated": "Evaluated: temperature, xai-reasoning-effort, xai-web-search. If missing, the parameter is not supported by that model.", "grok-2-vision-1212": { - "temperature-range": [0,1.5], + "temperature-range": [0,2], "xai-reasoning-effort": ["low","medium","high"] }, "grok-3": { @@ -16,27 +16,27 @@ "grok-4-0709": { "temperature-range": [0,2], "xai-reasoning-effort": ["low","medium","high"], - "xai-web-search": ["auto"] + "xai-tools": ["xai-web-search"] }, "grok-4-1-fast-non-reasoning": { "temperature-range": [0,2], "xai-reasoning-effort": ["low","medium","high"], - "xai-web-search": ["auto"] + "xai-tools": ["xai-web-search"] }, "grok-4-1-fast-reasoning": { - "temperature-range": [0,2], + "temperature-range": [0,1.5], "xai-reasoning-effort": ["low","medium","high"], - "xai-web-search": ["auto"] + "xai-tools": ["xai-web-search"] }, "grok-4-fast-non-reasoning": { "temperature-range": [0,2], "xai-reasoning-effort": ["low","medium","high"], - "xai-web-search": ["auto"] + "xai-tools": ["xai-web-search"] }, "grok-4-fast-reasoning": { - "temperature-range": [0,2], + "temperature-range": [0,1.5], "xai-reasoning-effort": ["low","medium","high"], - "xai-web-search": ["auto"] + "xai-tools": ["xai-web-search"] }, "grok-code-fast-1": { "temperature-range": [0,1], diff --git a/tools/develop/llm-parameter-sweep/sweep-config.template.json b/tools/develop/llm-parameter-sweep/sweep-config.template.json index 0385dd505..aaf25dca0 100644 --- a/tools/develop/llm-parameter-sweep/sweep-config.template.json +++ b/tools/develop/llm-parameter-sweep/sweep-config.template.json @@ -21,7 +21,6 @@ }, "xai": { "access": { "dialect": "xai", "oaiKey": "xai-...", "oaiOrg": "", "oaiHost": "", "heliKey": "" }, - "modelFilter": "grok", "sweeps": ["temperature", "xai-reasoning-effort"], "baseModelOverrides": { "maxTokens": 512 } }, diff --git a/tools/develop/llm-parameter-sweep/sweep.ts b/tools/develop/llm-parameter-sweep/sweep.ts index e9ab6bef5..0f53e0413 100644 --- a/tools/develop/llm-parameter-sweep/sweep.ts +++ b/tools/develop/llm-parameter-sweep/sweep.ts @@ -23,172 +23,7 @@ import { createChatGenerateDispatch } from '~/modules/aix/server/dispatch/chatGe import { fetchResponseOrTRPCThrow, TRPCFetcherError } from '~/server/trpc/trpc.router.fetchers'; -// ============================================================================ -// Terminal Colors -// ============================================================================ - -const COLORS = { - reset: '\x1b[0m', - bright: '\x1b[1m', - dim: '\x1b[2m', - red: '\x1b[31m', - green: '\x1b[32m', - yellow: '\x1b[33m', - cyan: '\x1b[36m', - magenta: '\x1b[35m', -} as const; - - -// ============================================================================ -// Types -// ============================================================================ - -interface CliOptions { - config?: string; - dialect?: string; - key?: string; - host?: string; - modelFilter?: string; - sweepFilter?: string; - delay: number; - maxModels: number; - verbose: boolean; - debug: boolean; - includeSymlinks: boolean; - dryRun: boolean; - sequential: boolean; -} - -type SweepValue = string | number | boolean | null; - -interface SweepDefinition { - name: string; - description: string; - applicability: - | { type: 'all' } - | { type: 'dialects'; dialects: AixAPI_Access['dialect'][] }; - values: TValue[]; - applyToModel: (value: TValue) => Partial; - mode: 'enumerate' | 'bisect'; - /** For bisect mode: precision to stop binary search */ - bisectPrecision?: number; -} - -function defineSweep(definition: SweepDefinition) { - return definition; -} - -interface VendorSweepConfig { - access: AixAPI_Access; - sweeps?: string[]; // names of built-in sweeps from SWEEP_DEFINITIONS - modelFilter?: string | string[]; // prefix(es) to match model IDs - baseModelOverrides?: Partial; -} - -interface SweepConfig { - delayMs?: number; - maxTokens?: number; - vendors: Record; -} - -// Results file format: dialect -> model -> sweep -> passing values -type SweepResultsFile = Record; -type DialectResults = Record; -type ModelResults = Record; - -type ErrorCategory = - | 'exception' // exception testing the parameter - | 'dialect' // parsing fails - | 'abort' | 'connection' | 'http' | 'parse'; // tRPC errors - -type TestOutcome = 'pass' | 'fail' | 'truncated' | 'error'; - -interface TestResult { - sweepName: string; - paramValue: SweepValue; - outcome: TestOutcome; - errorMessage: string | null; // source of truth for non-pass outcomes (always set when outcome !== 'pass') - errorCategory?: ErrorCategory; // secondary, used for symbol display - httpStatus?: number; - responseText?: string; - verboseLogs: string[]; // --verbose: response/error details - debugRequestAixModel?: string; // --debug: AixAPI_Model JSON - debugRequestBody?: string; // --debug: request body JSON - durationMs: number; -} - -interface ModelSweepResult { - modelId: string; - modelLabel: string; - results: TestResult[]; -} - -interface VendorSweepResult { - vendorName: string; - dialect: AixAPI_Access['dialect']; - modelsAvailable: number; - modelsTested: number; - models: ModelSweepResult[]; -} - - -// ============================================================================ -// SweepCollectorTransmitter - Lightweight IParticleTransmitter for probing -// ============================================================================ - -class SweepCollectorTransmitter implements IParticleTransmitter { - text: string = ''; - dialectIssue: string | null = null; - tokenStopReason: AixWire_Particles.GCTokenStopReason | null = null; - endReason: string | null = null; - - get hasText(): boolean { return this.text.length > 0; } - get hasError(): boolean { return this.dialectIssue !== null; } - - // Parser-initiated Control - setEnded(reason: 'done-dialect' | 'issue-dialect'): void { - this.endReason = reason; - } - - setDialectTerminatingIssue(dialectText: string, _symbol: string | null, _serverLog: ParticleServerLogLevel): void { - this.dialectIssue = dialectText; - } - - // Parts data - only collect text, everything else is a no-op - endMessagePart(): void { /* no-op */ } - - appendText(textChunk: string): void { - this.text += textChunk; - } - - appendReasoningText(_textChunk: string, _options?: { weak?: 'tag'; restart?: boolean }): void { /* no-op */ } - setReasoningSignature(_signature: string): void { /* no-op */ } - addReasoningRedactedData(_data: string): void { /* no-op */ } - appendAutoText_weak(textChunk: string): void { this.text += textChunk; } - appendAudioInline(_mimeType: string, _base64Data: string, _label: string, _generator: string, _durationMs: number): void { /* no-op */ } - appendImageInline(_mimeType: string, _base64Data: string, _label: string, _generator: string, _prompt: string): void { /* no-op */ } - startFunctionCallInvocation(_id: string | null, _functionName: string, _expectedArgsFmt: 'incr_str' | 'json_object', _args: string | object | null): void { /* no-op */ } - appendFunctionCallInvocationArgs(_id: string | null, _argsJsonChunk: string): void { /* no-op */ } - addCodeExecutionInvocation(_id: string | null, _language: string, _code: string, _author: 'gemini_auto_inline' | 'code_interpreter'): void { /* no-op */ } - addCodeExecutionResponse(_id: string | null, _error: boolean | string, _result: string, _executor: 'gemini_auto_inline' | 'code_interpreter', _environment: 'upstream'): void { /* no-op */ } - appendUrlCitation(_title: string, _url: string, _citationNumber?: number, _startIndex?: number, _endIndex?: number, _textSnippet?: string, _pubTs?: number): void { /* no-op */ } - - // Special - sendControl(_cgCOp: AixWire_Particles.ChatControlOp, _flushQueue?: boolean): void { /* no-op */ } - sendVoidPlaceholder(_mot: 'search-web' | 'gen-image' | 'code-exec', _text: string): void { /* no-op */ } - sendSetVendorState(_vendor: string, _state: unknown): void { /* no-op */ } - - // Non-parts data - setModelName(_modelName: string): void { /* no-op */ } - setUpstreamHandle(_handle: string, _type: 'oai-responses'): void { /* no-op */ } - setTokenStopReason(reason: AixWire_Particles.GCTokenStopReason): void { this.tokenStopReason = reason; } - updateMetrics(_update: Partial): void { /* no-op */ } -} - - -// ============================================================================ -// Built-in Sweep Definitions -// ============================================================================ +// --- SWEEP DEFINITIONS --- const SWEEP_DEFINITIONS = [ @@ -324,6 +159,165 @@ const SWEEP_DEFINITIONS = [ ] as const satisfies SweepDefinition[]; +interface SweepDefinition { + name: string; + description: string; + applicability: + | { type: 'all' } + | { type: 'dialects'; dialects: AixAPI_Access['dialect'][] }; + values: TValue[]; + applyToModel: (value: TValue) => Partial; + mode: 'enumerate' | 'bisect'; + /** For bisect mode: precision to stop binary search */ + bisectPrecision?: number; +} + +type SweepValue = string | number | boolean | null; + +function defineSweep(definition: SweepDefinition) { + return definition; +} + +// ============================================================================ +// Types +// ============================================================================ + +const COLORS = { + reset: '\x1b[0m', + bright: '\x1b[1m', + dim: '\x1b[2m', + red: '\x1b[31m', + green: '\x1b[32m', + yellow: '\x1b[33m', + cyan: '\x1b[36m', + magenta: '\x1b[35m', +} as const; + +interface CliOptions { + config?: string; + dialect?: string; + key?: string; + host?: string; + modelFilter?: string; + sweepFilter?: string; + delay: number; + maxModels: number; + verbose: boolean; + debug: boolean; + includeSymlinks: boolean; + dryRun: boolean; + sequential: boolean; +} + +// Types: Config File + +interface SweepConfigFile { + delayMs?: number; + maxTokens?: number; + vendors: Record; +} + +interface SweepConfigFile_Vendor { + access: AixAPI_Access; + sweeps?: string[]; // names of built-in sweeps from SWEEP_DEFINITIONS + modelFilter?: string | string[]; // prefix(es) to match model IDs + baseModelOverrides?: Partial; +} + +// Types: Sweep Results + +interface VendorSweepResult { + vendorName: string; + dialect: AixAPI_Access['dialect']; + modelsAvailable: number; + modelsTested: number; + models: ModelSweepResult[]; + modelFilter?: string; // effective filter used (from config and/or CLI) +} + +interface ModelSweepResult { + modelId: string; + modelLabel: string; + results: TestResult[]; +} + +interface TestResult { + sweepName: string; + paramValue: SweepValue; + outcome: TestOutcome; + errorMessage: string | null; // source of truth for non-pass outcomes (always set when outcome !== 'pass') + errorCategory?: ErrorCategory; // secondary, used for symbol display + httpStatus?: number; + responseText?: string; + verboseLogs: string[]; // --verbose: response/error details + debugRequestAixModel?: string; // --debug: AixAPI_Model JSON + debugRequestBody?: string; // --debug: request body JSON + durationMs: number; +} + +type TestOutcome = 'pass' | 'fail' | 'truncated' | 'error'; + + +type ErrorCategory = + | 'exception' // exception testing the parameter + | 'dialect' // parsing fails + | 'abort' | 'connection' | 'http' | 'parse'; // tRPC errors + + +// ============================================================================ +// SweepCollectorTransmitter - Lightweight IParticleTransmitter for probing +// ============================================================================ + +class SweepCollectorTransmitter implements IParticleTransmitter { + text: string = ''; + dialectIssue: string | null = null; + tokenStopReason: AixWire_Particles.GCTokenStopReason | null = null; + endReason: string | null = null; + + get hasText(): boolean { return this.text.length > 0; } + get hasError(): boolean { return this.dialectIssue !== null; } + + // Parser-initiated Control + setEnded(reason: 'done-dialect' | 'issue-dialect'): void { + this.endReason = reason; + } + + setDialectTerminatingIssue(dialectText: string, _symbol: string | null, _serverLog: ParticleServerLogLevel): void { + this.dialectIssue = dialectText; + } + + // Parts data - only collect text, everything else is a no-op + endMessagePart(): void { /* no-op */ } + + appendText(textChunk: string): void { + this.text += textChunk; + } + + appendReasoningText(_textChunk: string, _options?: { weak?: 'tag'; restart?: boolean }): void { /* no-op */ } + setReasoningSignature(_signature: string): void { /* no-op */ } + addReasoningRedactedData(_data: string): void { /* no-op */ } + appendAutoText_weak(textChunk: string): void { this.text += textChunk; } + appendAudioInline(_mimeType: string, _base64Data: string, _label: string, _generator: string, _durationMs: number): void { /* no-op */ } + appendImageInline(_mimeType: string, _base64Data: string, _label: string, _generator: string, _prompt: string): void { /* no-op */ } + startFunctionCallInvocation(_id: string | null, _functionName: string, _expectedArgsFmt: 'incr_str' | 'json_object', _args: string | object | null): void { /* no-op */ } + appendFunctionCallInvocationArgs(_id: string | null, _argsJsonChunk: string): void { /* no-op */ } + addCodeExecutionInvocation(_id: string | null, _language: string, _code: string, _author: 'gemini_auto_inline' | 'code_interpreter'): void { /* no-op */ } + addCodeExecutionResponse(_id: string | null, _error: boolean | string, _result: string, _executor: 'gemini_auto_inline' | 'code_interpreter', _environment: 'upstream'): void { /* no-op */ } + appendUrlCitation(_title: string, _url: string, _citationNumber?: number, _startIndex?: number, _endIndex?: number, _textSnippet?: string, _pubTs?: number): void { /* no-op */ } + + // Special + sendControl(_cgCOp: AixWire_Particles.ChatControlOp, _flushQueue?: boolean): void { /* no-op */ } + sendVoidPlaceholder(_mot: 'search-web' | 'gen-image' | 'code-exec', _text: string): void { /* no-op */ } + sendSetVendorState(_vendor: string, _state: unknown): void { /* no-op */ } + + // Non-parts data + setModelName(_modelName: string): void { /* no-op */ } + setUpstreamHandle(_handle: string, _type: 'oai-responses'): void { /* no-op */ } + setTokenStopReason(reason: AixWire_Particles.GCTokenStopReason): void { this.tokenStopReason = reason; } + updateMetrics(_update: Partial): void { /* no-op */ } +} + + // ============================================================================ // Minimal Request Construction // ============================================================================ @@ -383,7 +377,6 @@ async function testParameterValue( value: SweepValue, maxTokens: number, baseModelOverrides: Partial | undefined, - debug: boolean = false, ): Promise { const startTime = Date.now(); const baseModel = createBaseModel(modelId, maxTokens); @@ -656,15 +649,19 @@ function printSweepSummary(results: VendorSweepResult[]): void { // Results File (per-dialect: llm-{dialect}-parameters-sweep.json) // ============================================================================ +// Results file format: dialect -> model -> sweep -> passing values +type DialectReultsByModel = Record; +type ModelResultsBySweep = Record; + function getResultsFilePath(dialect: string): string { const scriptDir = path.dirname(new URL(import.meta.url).pathname.replace(/^\/([A-Z]:)/, '$1')); return path.join(scriptDir, `llm-${dialect}-parameters-sweep.json`); } -function saveDialectResults(dialect: string, dialectResults: DialectResults, evaluatedSweeps: string[]): void { +function saveDialectResults(dialect: string, dialectResults: DialectReultsByModel, evaluatedSweeps: string[], modelFilter?: string): void { const filePath = getResultsFilePath(dialect); // Sort keys for stable output - const sorted: DialectResults = {}; + const sorted: DialectReultsByModel = {}; for (const model of Object.keys(dialectResults).sort()) { sorted[model] = {}; for (const sweep of Object.keys(dialectResults[model]).sort()) { @@ -686,16 +683,18 @@ function saveDialectResults(dialect: string, dialectResults: DialectResults, eva // Insert header comments after opening brace const comment1 = '"_comment": "API-validated parameter values. null=undefined/missing. Values are tested and working. Note: temperature is continuous, not discrete.",'; const comment2 = `"_evaluated": "Evaluated: ${evaluatedSweeps.sort().join(', ')}. If missing, the parameter is not supported by that model.",`; - const json = jsonBody.replace(/^\{\n {2}/, '{\n ' + comment1 + '\n ' + comment2 + '\n '); + const comment3 = modelFilter ? `"_modelFilter": "${modelFilter}",` : ''; + const comments = [comment1, comment2, comment3].filter(Boolean).join('\n '); + const json = jsonBody.replace(/^\{\n {2}/, '{\n ' + comments + '\n '); fs.writeFileSync(filePath, json + '\n', 'utf-8'); console.log(`${COLORS.dim}Results saved to: ${filePath}${COLORS.reset}`); } -function vendorResultToDialectResults(vendorResult: VendorSweepResult): DialectResults { - const dialectResults: DialectResults = {}; +function vendorResultToDialectResults(vendorResult: VendorSweepResult): DialectReultsByModel { + const dialectResults: DialectReultsByModel = {}; for (const model of vendorResult.models) { - const modelResults: ModelResults = {}; + const modelResults: ModelResultsBySweep = {}; // Group results by sweep name const bySweep = new Map(); @@ -705,8 +704,10 @@ function vendorResultToDialectResults(vendorResult: VendorSweepResult): DialectR } // Sweeps that become "tools" when fully supported - const toolSweeps = ['oai-image-generation', 'oai-web-search', 'xai-web-search']; + const toolSweeps = ['oai-image-generation', 'oai-web-search']; const tools: string[] = []; + const xaiToolSweeps = ['xai-web-search']; + const xaiTools: string[] = []; // Extract passing values for each sweep (skip if none passed) for (const [sweepName, sweepResults] of bySweep) { @@ -721,6 +722,10 @@ function vendorResultToDialectResults(vendorResult: VendorSweepResult): DialectR tools.push(sweepName); continue; } + if (xaiToolSweeps.includes(sweepName) && passingValues.length === sweepResults.length) { + xaiTools.push(sweepName); + continue; + } // Special case: temperature with contiguous range from 0 -> use range [min, max] if (sweepName === 'temperature') { @@ -741,6 +746,8 @@ function vendorResultToDialectResults(vendorResult: VendorSweepResult): DialectR // Add tools array if non-empty if (tools.length > 0) modelResults['tools'] = tools.sort(); + if (xaiTools.length > 0) + modelResults['xai-tools'] = xaiTools.sort(); dialectResults[model.modelId] = modelResults; } @@ -759,7 +766,7 @@ function saveAllResults(allResults: VendorSweepResult[]): void { } } const dialectResults = vendorResultToDialectResults(vendorResult); - saveDialectResults(vendorResult.dialect, dialectResults, [...evaluatedSweeps]); + saveDialectResults(vendorResult.dialect, dialectResults, [...evaluatedSweeps], vendorResult.modelFilter); } } @@ -768,7 +775,7 @@ function saveAllResults(allResults: VendorSweepResult[]): void { // Config Loading // ============================================================================ -function loadSweepConfig(configPath: string): SweepConfig { +function loadSweepConfig(configPath: string): SweepConfigFile { const fullPath = path.resolve(configPath); if (!fs.existsSync(fullPath)) throw new Error(`Configuration file not found: ${fullPath}`); @@ -779,11 +786,11 @@ function loadSweepConfig(configPath: string): SweepConfig { // Support both old format (flat Record) and new format (SweepConfig) if (parsed.vendors) { - return parsed as SweepConfig; + return parsed as SweepConfigFile; } // Legacy: flat Record - const vendors: Record = {}; + const vendors: Record = {}; for (const [key, value] of Object.entries(parsed)) { if (key.startsWith('_')) continue; if (typeof value === 'object' && value !== null && 'dialect' in value) @@ -795,7 +802,7 @@ function loadSweepConfig(configPath: string): SweepConfig { } } -function createSingleVendorConfig(dialect: string, key: string, host?: string): SweepConfig { +function createSingleVendorConfig(dialect: string, key: string, host?: string): SweepConfigFile { let access: AixAPI_Access; switch (dialect) { @@ -1025,7 +1032,7 @@ function sleep(ms: number): Promise { // ============================================================================ async function runSweep( - sweepConfig: SweepConfig, + sweepConfig: SweepConfigFile, options: CliOptions, ): Promise { const allResults: VendorSweepResult[] = []; @@ -1067,7 +1074,7 @@ async function runSweep( // 2b. Filter out duplicate models with idVariant (keep base model, or first variant if no base) { - const beforeCount = models.length; + // const beforeCount = models.length; // Sort so base models (no idVariant) come before variants models.sort((a, b) => (a.idVariant ? 1 : 0) - (b.idVariant ? 1 : 0)); const seenIds = new Set(); @@ -1136,12 +1143,22 @@ async function runSweep( } console.log(` Applicable sweeps: ${applicableSweeps.map(s => COLORS.magenta + s.name + COLORS.reset).join(', ')}`); + // Build effective model filter string for JSON output + const effectiveFilters: string[] = []; + if (vendorModelFilter) { + const prefixes = Array.isArray(vendorModelFilter) ? vendorModelFilter : [vendorModelFilter]; + effectiveFilters.push(...prefixes); + } + if (options.modelFilter) + effectiveFilters.push(options.modelFilter); + const vendorResult: VendorSweepResult = { vendorName, dialect: access.dialect, modelsAvailable: models.length, modelsTested: models.length, models: [], + modelFilter: effectiveFilters.length > 0 ? effectiveFilters.join(', ') : undefined, }; // 5. For each model @@ -1257,7 +1274,7 @@ async function main(): Promise { const options = parseArgs(); // Load vendor config - let sweepConfig: SweepConfig; + let sweepConfig: SweepConfigFile; if (options.config) { sweepConfig = loadSweepConfig(options.config); } else {