diff --git a/src/modules/aix/server/dispatch/chatGenerate/adapters/openai.responsesCreate.ts b/src/modules/aix/server/dispatch/chatGenerate/adapters/openai.responsesCreate.ts
index 036e6eb17..454391920 100644
--- a/src/modules/aix/server/dispatch/chatGenerate/adapters/openai.responsesCreate.ts
+++ b/src/modules/aix/server/dispatch/chatGenerate/adapters/openai.responsesCreate.ts
@@ -11,6 +11,7 @@ import { aixSpillShallFlush, aixSpillSystemToUser, approxDocPart_To_String } fro
 
 // configuration
 
 const OPENAI_RESPONSES_DEFAULT_TRUNCATION: TRequest['truncation'] = undefined;
+export const AIX_OAI_DEFAULT_IMAGE_GEN_MODEL: Exclude<Extract<OpenAIWire_Responses_Tools.Tool, { type: 'image_generation' }>['model'], undefined> = 'gpt-image-2';
 
 type TRequest = OpenAIWire_API_Responses.Request;
@@ -207,6 +208,7 @@ export function aixToOpenAIResponses(
     const imageMode = model.vndOaiImageGeneration;
     const imageGenerationTool: Extract<OpenAIWire_Responses_Tools.Tool, { type: 'image_generation' }> = {
       type: 'image_generation',
+      ...(AIX_OAI_DEFAULT_IMAGE_GEN_MODEL && { model: AIX_OAI_DEFAULT_IMAGE_GEN_MODEL }),
       ...(imageMode === 'mq' ? { quality: 'medium' } : { /* quality: 'high' -- auto */ }),
       // ...(imageMode === 'hq' ? ... auto ... ),
       ...(imageMode === 'hq_edit' && { input_fidelity: 'high' }),
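
Reviewer note on the hunk above: the image_generation tool object is assembled with conditional spreads, so a falsy guard contributes no key at all and the API then applies its own defaults. A standalone sketch of the pattern, with simplified stand-in types (buildImageGenTool and ImageGenToolSketch are illustrative, not part of the codebase):

// Sketch: conditional-spread assembly of the image_generation tool (hypothetical, simplified types).
type ImageGenToolSketch = {
  type: 'image_generation';
  model?: string;
  quality?: 'low' | 'medium' | 'high';
  input_fidelity?: 'low' | 'high';
};

function buildImageGenTool(defaultModel: string | undefined, imageMode: 'mq' | 'hq' | 'hq_edit'): ImageGenToolSketch {
  return {
    type: 'image_generation',
    ...(defaultModel && { model: defaultModel }),               // no 'model' key at all when unset
    ...(imageMode === 'mq' && { quality: 'medium' as const }),  // 'hq' intentionally omits quality ('auto')
    ...(imageMode === 'hq_edit' && { input_fidelity: 'high' as const }),
  };
}
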
diff --git a/src/modules/aix/server/dispatch/chatGenerate/parsers/openai.responses.parser.ts b/src/modules/aix/server/dispatch/chatGenerate/parsers/openai.responses.parser.ts
index e581dc05b..7853beb9d 100644
--- a/src/modules/aix/server/dispatch/chatGenerate/parsers/openai.responses.parser.ts
+++ b/src/modules/aix/server/dispatch/chatGenerate/parsers/openai.responses.parser.ts
@@ -5,10 +5,11 @@ import { hasKeys } from '~/common/util/objectUtils';
 import type { AixWire_Particles } from '../../../api/aix.wiretypes';
 import type { ChatGenerateParseFunction } from '../chatGenerate.dispatch';
 import type { IParticleTransmitter } from './IParticleTransmitter';
+import { AIX_OAI_DEFAULT_IMAGE_GEN_MODEL } from '../adapters/openai.responsesCreate';
 import { IssueSymbols } from '../ChatGenerateTransmitter';
 import { aixResilientUnknownValue } from '../../../api/aix.resilience';
 
-import { OpenAIWire_API_Responses } from '../../wiretypes/openai.wiretypes';
+import { OpenAIWire_API_Responses, OpenAIWire_Responses_Tools } from '../../wiretypes/openai.wiretypes';
 
 
 // configuration
@@ -42,6 +43,9 @@ type TResponse = OpenAIWire_API_Responses.Response;
 type TOutputItem = OpenAIWire_API_Responses.Response['output'][number];
 type TEventType = OpenAIWire_API_Responses.StreamingEvent['type'];
 
+// cached config for the image_generation hosted tool, captured at response.created
+type TImageGenToolCfg = Extract<OpenAIWire_Responses_Tools.Tool, { type: 'image_generation' }>;
+
 
 /**
  * We need this just to ensure events are not out of order, as our streaming is progressive
@@ -76,6 +80,9 @@ class ResponseParserStateMachine {
 
   // streaming state tracking
   #hasFunctionCalls: boolean = false; // tracks if we've seen function_call output items
 
+  // hosted tool configuration echo (captured at response.created)
+  #imageGenToolCfg: TImageGenToolCfg | undefined;
+
 
   // Validations
@@ -237,6 +244,18 @@ class ResponseParserStateMachine {
     return this.#hasFunctionCalls;
   }
 
+
+  // Hosted tool config capture
+
+  captureHostedToolConfigs(tools: TResponse['tools']) {
+    if (!tools?.length) return;
+    this.#imageGenToolCfg = tools.find((t): t is TImageGenToolCfg => t.type === 'image_generation');
+  }
+
+  get imageGenToolCfg() {
+    return this.#imageGenToolCfg;
+  }
+
 
 }
@@ -299,6 +318,9 @@ export function createOpenAIResponsesEventParser(): ChatGenerateParseFunction {
         if (event.response.store && event.response.id)
           pt.setUpstreamHandle(event.response.id, 'oai-responses' /*, event.sequence_number - commented, unused for now */);
 
+        // -> Hosted tool configs: cache for per-event enrichment (e.g. image generation progress)
+        R.captureHostedToolConfigs(event.response.tools);
+
         // TODO: [FUTURE] Accumulate in DMessage.sessionMetadata:
         //  pt.setSessionMetadata('openai.response.id', response.id)
         //  pt.setSessionMetadata('openai.response.expiresAt', response.expires_at)
@@ -403,14 +425,17 @@ export function createOpenAIResponsesEventParser(): ChatGenerateParseFunction {
 
          case 'image_generation_call':
            // -> IGC: process completed image generation using 'ii' particle for inline images
-           const { result: igResult, revised_prompt: igRevisedPrompt } = doneItem;
+           const { id: igId, result: igResult, revised_prompt: igRevisedPrompt } = doneItem;
+           const igDoneText = !igRevisedPrompt?.length ? 'Image generated'
+             : `Generated: "${igRevisedPrompt.length > 100 ? igRevisedPrompt.slice(0, 100) + '...' : igRevisedPrompt}"`;
+           pt.sendOperationState('gen-image', igDoneText, { opId: igId, state: 'done' });
 
            // Create inline image with base64 data
            if (igResult)
              pt.appendImageInline(
                _imageGenerationMimeType(doneItem), // infer from output_format echoed in the item
                igResult,
                igRevisedPrompt || 'Generated image',
-               'gpt-image-1', // generator
+               R.imageGenToolCfg?.model || AIX_OAI_DEFAULT_IMAGE_GEN_MODEL, // generator: prefer the cached tool config, fallback to current default
                igRevisedPrompt || '', // prompt used
              );
            else
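
Reviewer note: the additions above implement a capture-then-enrich flow; the tool configuration echoed at response.created is cached in the state machine, then read back when decorating later image events. Reduced to a minimal standalone sketch (ParticleSink and the sketch types are hypothetical stand-ins; the real interface is IParticleTransmitter):

// Sketch: cache the echoed hosted-tool config at response.created, read it at later events.
interface ImageGenCfgSketch { type: 'image_generation'; model?: string; quality?: string; size?: string; }
interface ParticleSink { sendOperationState(op: 'gen-image', text: string, extra: { opId?: string; state?: 'done' }): void; }

class HostedToolEchoSketch {
  private imageGenCfg: ImageGenCfgSketch | undefined;

  // at response.created: the API echoes the requested tools, with resolved values
  capture(tools?: { type: string }[]): void {
    this.imageGenCfg = tools?.find((t): t is ImageGenCfgSketch => t.type === 'image_generation');
  }

  // at image_generation_call events: enrich the operation status text from the cached config
  onInProgress(pt: ParticleSink, itemId: string): void {
    const q = this.imageGenCfg?.quality;
    const suffix = q && q !== 'auto' ? ` (${q})` : '';
    pt.sendOperationState('gen-image', `Generating image${suffix}...`, { opId: itemId });
  }
}
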
@@ -569,7 +594,7 @@ export function createOpenAIResponsesEventParser(): ChatGenerateParseFunction {
 
      case 'response.image_generation_call.in_progress':
        R.outputItemVisit(eventType, event.output_index, 'image_generation_call');
-       pt.sendOperationState('gen-image', 'Generating image...', { opId: event.item_id });
+       pt.sendOperationState('gen-image', `Generating image${_prettyImageGenConfigSuffix(R.imageGenToolCfg)}...`, { opId: event.item_id });
        break;
 
      case 'response.image_generation_call.generating':
@@ -579,15 +604,19 @@ export function createOpenAIResponsesEventParser(): ChatGenerateParseFunction {
 
      case 'response.image_generation_call.partial_image':
        R.outputItemVisit(eventType, event.output_index, 'image_generation_call');
-       // SKIP partial images to avoid duplicates - only use final result
-       // const { partial_image_index: piIndex } = event;
-       // console.log('[DEV] AIX: OpenAI Responses: skipping partial_image event to avoid duplicates:', { piIndex });
-       // The final image will be handled in response.output_item.done
+       // NOTE: We don't surface the partial image data yet (it would require a new particle type).
+       // NOTE: Also for now we do not use the partial_image_index as a progress counter in the status text.
+       // const piIndex = event.partial_image_index;
+       // const piMax = R.imageGenToolCfg?.partial_images;
+       // const piProgress = piMax ? `${piIndex + 1}/${piMax}` : `step ${piIndex + 1}`;
+       // pt.sendOperationState('gen-image', `Generating image${_prettyImageGenConfigSuffix(R.imageGenToolCfg)}... (${piProgress})`, { opId: event.item_id });
        break;
 
      case 'response.image_generation_call.completed':
        R.outputItemVisit(eventType, event.output_index, 'image_generation_call');
-       pt.sendOperationState('gen-image', 'Image generated', { opId: event.item_id, state: 'done' });
+       // NOTE: we defer the 'done' status to response.output_item.done, where we also have the revised_prompt for decoration;
+       // closing the in_progress here would have cleaner semantics, but we assume the done item follows immediately
+       // pt.sendOperationState('gen-image', 'Image generated', { opId: event.item_id, state: 'done' }); // -> Final image result is handled in response.output_item.done
        break;
@@ -892,7 +921,7 @@ export function createOpenAIResponseParserNS(): ChatGenerateParseFunction {
            _imageGenerationMimeType(oItem), // infer from output_format echoed in the item
            igResult,
            igRevisedPrompt || 'Generated image',
-           'gpt-image-1', // generator
+           AIX_OAI_DEFAULT_IMAGE_GEN_MODEL, // generator: non-streaming path has no captured tool config, use current default
            igRevisedPrompt || '', // prompt used
          );
        else
@@ -1036,6 +1065,22 @@ function _forwardTextAnnotation(pt: IParticleTransmitter, annotation: Exclude<
      break;
  }
 }
 
+
+// Status decoration
+
+/**
+ * Suffix to decorate the image generation status text (e.g. ' (high, 1536x1024)'),
+ * built from the image_generation tool configuration echoed by the API at response.created.
+ */
+function _prettyImageGenConfigSuffix(cfg: TImageGenToolCfg | undefined): string {
+  if (!cfg) return '';
+  const parts: string[] = [];
+  if (cfg.quality && cfg.quality !== 'auto') parts.push(cfg.quality);
+  if (cfg.size && cfg.size !== 'auto') parts.push(cfg.size);
+  if (cfg.input_fidelity === 'high') parts.push('high fidelity');
+  return parts.length ? ` (${parts.join(', ')})` : '';
+}
+
 
diff --git a/src/modules/aix/server/dispatch/wiretypes/openai.wiretypes.ts b/src/modules/aix/server/dispatch/wiretypes/openai.wiretypes.ts
--- a/src/modules/aix/server/dispatch/wiretypes/openai.wiretypes.ts
+++ b/src/modules/aix/server/dispatch/wiretypes/openai.wiretypes.ts
@@ -826,15 +826,23 @@ export namespace OpenAIWire_API_Images_Generations {
 
+  export const GptImageModels_schema = z.enum([
+    'gpt-image-2',
+    'gpt-image-1.5',
+    'gpt-image-1',
+    'gpt-image-1-mini',
+  ]);
+  export type GptImageModels = z.infer<typeof GptImageModels_schema>;
+
   const Request_schema = z.object({
 
-    // 32,000 for gpt-image-1.5/gpt-image-1/gpt-image-1-mini, 4,000 for dall-e-3, 1,000 for dall-e-2
+    // 32,000 for gpt-image family, 4,000 for dall-e-3, 1,000 for dall-e-2
     prompt: z.string().max(32000),
 
-    model: z.enum([
-      'gpt-image-1.5',
-      'gpt-image-1',
-      'gpt-image-1-mini',
-      'dall-e-3',
-      'dall-e-2', // default
+    model: z.union([
+      GptImageModels_schema,
+      z.enum([
+        'dall-e-3',
+        'dall-e-2', // default
+      ]),
     ]).optional(),
 
     // The number of images to generate. Must be between 1 and 10. For dall-e-3, only n=1 is supported.
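
Aside on the schema relaxation introduced here (and reused in the Responses tool schema further down): keeping a strict z.enum for the known gpt-image models preserves literal types and autocomplete, while .or(z.string()) at forward-compat call sites accepts models the enum does not know yet. A quick behavior check, assuming the model list as reconstructed above:

import { z } from 'zod';

const GptImageModels_schema = z.enum(['gpt-image-2', 'gpt-image-1.5', 'gpt-image-1', 'gpt-image-1-mini']);

const strict = GptImageModels_schema;                  // rejects unknown models
const relaxed = GptImageModels_schema.or(z.string());  // union widens the type to string

console.log(strict.safeParse('gpt-image-3').success);  // false - not in the enum
console.log(relaxed.safeParse('gpt-image-3').success); // true  - falls through to z.string()
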
@@ -850,7 +858,7 @@ export namespace OpenAIWire_API_Images_Generations {
     // Image quality
     quality: z.enum([
       'auto', // default
-      'high', 'medium', 'low', // gpt-image-1.5, gpt-image-1, gpt-image-1-mini
+      'high', 'medium', 'low', // gpt-image
       'hd', 'standard', // dall-e-3: hd | standard, dall-e-2: only standard
     ]).optional(),
@@ -876,7 +884,7 @@ export namespace OpenAIWire_API_Images_Generations {
 
     user: z.string().optional(),
 
-    // -- GPT Image Family Specific Parameters (gpt-image-1.5, gpt-image-1, gpt-image-1-mini) --
+    // -- GPT Image specific parameters --
 
     // Allows to set transparency (in that case, format = png or webp)
     background: z.enum(['transparent', 'opaque', 'auto' /* default */]).optional(),
@@ -907,7 +915,7 @@ export namespace OpenAIWire_API_Images_Generations {
       url: z.url().optional(), // if the response_format is 'url' - DEPRECATED
     })),
 
-    // GPT Image models only (gpt-image-1.5, gpt-image-1, gpt-image-1-mini)
+    // gpt-image only
     usage: z.object({
       total_tokens: z.number(),
       input_tokens: z.number(), // images + text tokens in the input prompt
@@ -935,14 +943,17 @@ export namespace OpenAIWire_API_Images_Edits {
    */
 
   export const Request_schema = z.object({
 
-    // 32,000 for gpt-image-1.5/gpt-image-1/gpt-image-1-mini, 1,000 for dall-e-2
+    // 32,000 for gpt-image, 1,000 for dall-e-2
     prompt: z.string().max(32000),
 
     // image: file | file[] - REQUIRED - Handled as file uploads in FormData ('image' field)
     // mask: file - OPTIONAL - Handled as file upload in FormData ('mask' field)
 
-    model: z.enum(['gpt-image-1.5', 'gpt-image-1', 'gpt-image-1-mini', 'dall-e-2']).optional(),
+    model: z.union([
+      OpenAIWire_API_Images_Generations.GptImageModels_schema,
+      z.enum(['dall-e-2' /* dall-e-3 does not do image edits */]),
+    ]).optional(),
 
     // Number of images to generate, between 1 and 10
     n: z.number().min(1).max(10).nullable().optional(),
@@ -950,7 +961,7 @@ export namespace OpenAIWire_API_Images_Edits {
     // Image quality
     quality: z.enum([
       'auto', // default
-      'high', 'medium', 'low', // gpt-image-1.5, gpt-image-1, gpt-image-1-mini
+      'high', 'medium', 'low', // gpt-image
       'standard', // dall-e-2: only standard
     ]).optional(),
@@ -1198,17 +1209,20 @@ export namespace OpenAIWire_Responses_Items {
     id: z.string(), // unique ID of the image generation call (output item ID)
     result: z.string().optional(), // base64 image data when completed
     revised_prompt: z.string().optional(), // the revised prompt used for generation
-    // BREAKING CHANGE from OpenAI - 2025-09-30
-    // redefining the following because we need 'generating' too here
+    // Docs: "in_progress" | "completed" | "generating" | "failed".
+    // 'incomplete' kept as defensive carryover from OutputItemBase (not doc'd for this item).
     status: z.enum([
-      'generating', // 2025-09-30: seen on OpenAI for `image_generation_call` items
-      'in_progress', 'completed', 'incomplete',
+      'in_progress', 'completed', 'generating', 'failed',
+      'incomplete', // defensive: not in docs for image_generation_call, but harmless to accept
     ]).optional(),
-    // Echoed configuration from the tool request - used to infer mime type for the result
+    // Echoed configuration from the tool request - the API returns these on the done item with
+    // RESOLVED values (e.g. size:"auto" becomes "1536x1024"). Confirmed live on 2026-04-21 log.
     output_format: z.enum(['png' /* default */, 'jpeg', 'webp']).optional(),
-    // NOTE: we also see the following echoed in the image_generation_call item
+    // NOTE: we also see the following echoed in the image_generation_call item, but don't make use of them for now
     // background: z.enum(['transparent', 'opaque', 'auto' /* default */]).optional(),
     // quality: z.enum(['auto', 'high', 'medium', 'low']).optional(),
+    // size: z.enum(['1024x1024', '1024x1536', '1536x1024', 'auto']).or(z.string()).optional(),
+    // action: z.enum(['generate', 'edit', 'auto']).or(z.string()).optional(),
   });
 
 // const OutputMCPCallItem_schema = _OutputItemBase_schema.extend({
@@ -1434,19 +1448,21 @@ export namespace OpenAIWire_Responses_Tools {
 
   const ImageGenerationTool_schema = z.object({
     type: z.literal('image_generation'),
+    /** Whether to generate a new image or edit an existing image. Default: auto. */
+    action: z.enum(['generate', 'edit', 'auto']).or(z.string()).optional(),
     background: z.enum(['transparent', 'opaque', 'auto']).optional(), // defaults to 'auto'
     /**
      * Control how much effort the model will exert to match the style and features, especially facial features, of input images.
-     * Defaults to 'low'.
+     * Supported for gpt-image-1 / gpt-image-1.5+ (not gpt-image-1-mini). Defaults to 'low'.
      */
     input_fidelity: z.enum(['high', 'low']).optional(),
     input_image_mask: z.object({
       file_id: z.string().optional(), // File ID for the mask image
       image_url: z.string().optional(), // Base64-encoded mask image
     }).optional(),
-    /** 'gpt-image-1' (default), leaks suggest also 'gpt-image-0721-mini-alpha' */
-    model: z.string().optional(),
-    /** Note: 'low' is unconfirmed here. Defaults to 'auto' */
+    /** gpt-image family, relaxed with .or(z.string()) for forward-compat with new models. */
+    model: OpenAIWire_API_Images_Generations.GptImageModels_schema.or(z.string()).optional(),
+    /** Defaults to 'auto' */
     moderation: z.enum(['low', 'auto']).optional(),
     output_compression: z.number().min(0).max(100).int().optional(), // defaults to 100
     /** One of [png, webp, or jpeg]. Default: png. */
@@ -1455,8 +1471,10 @@ export namespace OpenAIWire_Responses_Tools {
     partial_images: z.number().int().min(0).max(3).optional(),
     /** Quality of the generated image. Defaults to 'auto' */
     quality: z.enum(['low', 'medium', 'high', 'auto']).optional(),
-    /** The size of the generated image. One of 1024x1024, 1024x1536, 1536x1024, or auto. Default: auto. */
-    size: z.enum(['1024x1024', '1024x1536', '1536x1024', 'auto']).optional(),
+    /** Default: auto */
+    size: z.enum(['1024x1024', '1024x1536', '1536x1024', 'auto']).or(z.string()).optional(),
+    // Not supported in the request, echoed by the API, but always 1
+    // n: z.number().int().optional(),
   });
 
 // Code Interpreter tool - runs Python code in a sandboxed container
@@ -1617,6 +1635,13 @@ export namespace OpenAIWire_API_Responses {
 
     model: z.string(), // model used for the response
 
+    // echo of the requested tools, with all property values resolved - used by the parser to enrich hosted-tool messages
+    // NOTE: .catch() gracefully degrades to undefined since this is a non-critical enrichment path
+    tools: z.array(OpenAIWire_Responses_Tools.Tool_schema).optional().catch((ctx) => {
+      console.warn('[DEV] AIX: OpenAI Responses: unable to parse echoed tools, ignoring:', { tools: ctx.input });
+      return undefined;
+    }),
+
     output: z.array(OpenAIWire_Responses_Items.OutputItem_schema),
 
     usage: z.object({
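
Reviewer note on the .catch() above: it confines a parse failure to the one non-critical field, so an unrecognized echoed tool shape degrades tools to undefined instead of failing the whole Response parse. Minimal repro of that behavior (illustrative Demo_schema, not the shipped one; assumes the ctx callback shape of recent zod versions, which exposes the raw value as ctx.input):

import { z } from 'zod';

const Demo_schema = z.object({
  model: z.string(),
  tools: z.array(z.object({ type: z.string() })).optional().catch((ctx) => {
    console.warn('unable to parse echoed tools, ignoring:', { tools: ctx.input });
    return undefined; // only this field degrades; the rest of the object still parses
  }),
});

const parsed = Demo_schema.parse({ model: 'gpt-image-2', tools: 'not-an-array' });
// -> { model: 'gpt-image-2', tools: undefined } - no throw
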
     // temperature: ... // 1
     // text: ... // { .. }
     // tool_choice: ... // 'auto'
-    // tools: ... // e.g. [{ type: 'web_search_preview', search_context_size: 'medium', ... }]
     // top_logprobs: ... // 0
     // top_p: ... // 1
     // truncation: ... // 'disabled'
@@ -1851,6 +1875,12 @@ export namespace OpenAIWire_API_Responses {
     type: z.literal('response.image_generation_call.partial_image'),
     partial_image_b64: z.string(), // base64 partial image
     partial_image_index: z.number(), // 0-based index
+    // NOTE: observed on the wire (not in the OpenAI docs) and consistently echoed, but not used yet.
+    // These carry the RESOLVED values even when the request used 'auto' (e.g. size:"auto" -> "1536x1024").
+    // output_format: z.enum(['png', 'jpeg', 'webp']).or(z.string()).optional(),
+    // background: z.enum(['transparent', 'opaque', 'auto']).or(z.string()).optional(),
+    // quality: z.enum(['low', 'medium', 'high', 'auto']).or(z.string()).optional(),
+    // size: z.enum(['1024x1024', '1024x1536', '1536x1024', 'auto']).or(z.string()).optional(),
   });
 
   const OutputImageGenerationCallCompletedEvent_schema = _OutputIndexedEvent_schema.extend({
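
Closing aside: both parser paths infer the result's mime type from the output_format echoed on the done item via _imageGenerationMimeType, which is outside this diff. Under the assumption that it simply keys off output_format with the documented png default, it would reduce to something like:

// Illustrative only - the real _imageGenerationMimeType is not part of this diff.
function imageGenerationMimeTypeSketch(item: { output_format?: 'png' | 'jpeg' | 'webp' }): string {
  switch (item.output_format) {
    case 'jpeg': return 'image/jpeg';
    case 'webp': return 'image/webp';
    case 'png':
    default:
      return 'image/png'; // png is the documented default output_format
  }
}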