AIX: OpenAI Responses: image generation: add details

This commit is contained in:
Enrico Ros
2026-04-21 13:27:17 -07:00
parent 1e04efe748
commit ec76e1c5cf
3 changed files with 113 additions and 36 deletions
@@ -11,6 +11,7 @@ import { aixSpillShallFlush, aixSpillSystemToUser, approxDocPart_To_String } fro
// configuration
const OPENAI_RESPONSES_DEFAULT_TRUNCATION: TRequest['truncation'] = undefined;
export const AIX_OAI_DEFAULT_IMAGE_GEN_MODEL: Exclude<Extract<TRequestTool, { type: 'image_generation' }>['model'], undefined> = 'gpt-image-2';
type TRequest = OpenAIWire_API_Responses.Request;
@@ -207,6 +208,7 @@ export function aixToOpenAIResponses(
const imageMode = model.vndOaiImageGeneration;
const imageGenerationTool: Extract<TRequestTool, { type: 'image_generation' }> = {
type: 'image_generation',
...(AIX_OAI_DEFAULT_IMAGE_GEN_MODEL && { model: AIX_OAI_DEFAULT_IMAGE_GEN_MODEL }),
...(imageMode === 'mq' ? { quality: 'medium' } : { /* quality: 'high' -- auto */ }),
// ...(imageMode === 'hq' ? ... auto ... ),
...(imageMode === 'hq_edit' && { input_fidelity: 'high' }),
@@ -5,10 +5,11 @@ import { hasKeys } from '~/common/util/objectUtils';
import type { AixWire_Particles } from '../../../api/aix.wiretypes';
import type { ChatGenerateParseFunction } from '../chatGenerate.dispatch';
import type { IParticleTransmitter } from './IParticleTransmitter';
import { AIX_OAI_DEFAULT_IMAGE_GEN_MODEL } from '../adapters/openai.responsesCreate';
import { IssueSymbols } from '../ChatGenerateTransmitter';
import { aixResilientUnknownValue } from '../../../api/aix.resilience';
import { OpenAIWire_API_Responses } from '../../wiretypes/openai.wiretypes';
import { OpenAIWire_API_Responses, OpenAIWire_Responses_Tools } from '../../wiretypes/openai.wiretypes';
// configuration
@@ -42,6 +43,9 @@ type TResponse = OpenAIWire_API_Responses.Response;
type TOutputItem = OpenAIWire_API_Responses.Response['output'][number];
type TEventType = OpenAIWire_API_Responses.StreamingEvent['type'];
// cached config for the image_generation hosted tool, captured at response.created
type TImageGenToolCfg = Extract<OpenAIWire_Responses_Tools.Tool, { type: 'image_generation' }>;
/**
 * We need this just to ensure events are not out of order, as our streaming is progressive
@@ -76,6 +80,9 @@ class ResponseParserStateMachine {
// streaming state tracking
#hasFunctionCalls: boolean = false; // tracks if we've seen function_call output items
// hosted tool configuration echo (captured at response.created)
#imageGenToolCfg: TImageGenToolCfg | undefined;
// Validations
@@ -237,6 +244,18 @@ class ResponseParserStateMachine {
return this.#hasFunctionCalls;
}
// Hosted tool config capture
/**
 * Caches the echoed hosted-tool configurations from the `response.created` event.
 * Currently only the `image_generation` tool config is retained, so later streaming
 * events can enrich status messages with the real model/size/quality values.
 * Note: when tools are present but contain no image_generation entry, the cached
 * config is reset to undefined (fresh capture per response).
 */
captureHostedToolConfigs(tools: TResponse['tools']) {
  if (!tools || !tools.length) return;
  let imageGenCfg: TImageGenToolCfg | undefined = undefined;
  for (const tool of tools) {
    if (tool.type === 'image_generation') {
      imageGenCfg = tool;
      break;
    }
  }
  this.#imageGenToolCfg = imageGenCfg;
}
/** Echo of the image_generation hosted-tool config captured at response.created, or undefined if the tool was not requested (or not captured yet). */
get imageGenToolCfg() {
return this.#imageGenToolCfg;
}
}
@@ -299,6 +318,9 @@ export function createOpenAIResponsesEventParser(): ChatGenerateParseFunction {
if (event.response.store && event.response.id)
pt.setUpstreamHandle(event.response.id, 'oai-responses' /*, event.sequence_number - commented, unused for now */);
// -> Hosted tool configs: cache for per-event enrichment (e.g. image generation progress)
R.captureHostedToolConfigs(event.response.tools);
// TODO: [FUTURE] Accumulate in DMessage.sessionMetadata:
// pt.setSessionMetadata('openai.response.id', response.id)
// pt.setSessionMetadata('openai.response.expiresAt', response.expires_at)
@@ -403,14 +425,17 @@ export function createOpenAIResponsesEventParser(): ChatGenerateParseFunction {
case 'image_generation_call':
// -> IGC: process completed image generation using 'ii' particle for inline images
const { result: igResult, revised_prompt: igRevisedPrompt } = doneItem;
const { id: igId, result: igResult, revised_prompt: igRevisedPrompt } = doneItem;
const igDoneText = !igRevisedPrompt?.length ? 'Image generated'
: `Generated: "${igRevisedPrompt.length > 100 ? igRevisedPrompt.slice(0, 100) + '...' : igRevisedPrompt}"`;
pt.sendOperationState('gen-image', igDoneText, { opId: igId, state: 'done' });
// Create inline image with base64 data
if (igResult)
pt.appendImageInline(
_imageGenerationMimeType(doneItem), // infer from output_format echoed in the item
igResult,
igRevisedPrompt || 'Generated image',
'gpt-image-1', // generator
R.imageGenToolCfg?.model || AIX_OAI_DEFAULT_IMAGE_GEN_MODEL, // generator: prefer the cached tool config, fallback to current default
igRevisedPrompt || '', // prompt used
);
else
@@ -569,7 +594,7 @@ export function createOpenAIResponsesEventParser(): ChatGenerateParseFunction {
case 'response.image_generation_call.in_progress':
R.outputItemVisit(eventType, event.output_index, 'image_generation_call');
pt.sendOperationState('gen-image', 'Generating image...', { opId: event.item_id });
pt.sendOperationState('gen-image', `Generating image${_prettyImageGenConfigSuffix(R.imageGenToolCfg)}...`, { opId: event.item_id });
break;
case 'response.image_generation_call.generating':
@@ -579,15 +604,19 @@ export function createOpenAIResponsesEventParser(): ChatGenerateParseFunction {
case 'response.image_generation_call.partial_image':
R.outputItemVisit(eventType, event.output_index, 'image_generation_call');
// SKIP partial images to avoid duplicates - only use final result
// const { partial_image_index: piIndex } = event;
// console.log('[DEV] AIX: OpenAI Responses: skipping partial_image event to avoid duplicates:', { piIndex });
// The final image will be handled in response.output_item.done
// NOTE: We don't surface the partial image data yet (it would require a new particle type)
// NOTE: Also for now we do not use the partial_image_index as a progress counter in the status text.
// const piIndex = event.partial_image_index;
// const piMax = R.imageGenToolCfg?.partial_images;
// const piProgress = piMax ? `${piIndex + 1}/${piMax}` : `step ${piIndex + 1}`;
// pt.sendOperationState('gen-image', `Generating image${_prettyImageGenConfigSuffix(R.imageGenToolCfg)}... (${piProgress})`, { opId: event.item_id });
break;
case 'response.image_generation_call.completed':
R.outputItemVisit(eventType, event.output_index, 'image_generation_call');
pt.sendOperationState('gen-image', 'Image generated', { opId: event.item_id, state: 'done' });
// NOTE: we defer the 'done' status to response.output_item.done, where we also have the revised_prompt for decoration;
// the semantics of the below would be better (to close the in_progress), but we assume that the response will come immediately after
// pt.sendOperationState('gen-image', 'Image generated', { opId: event.item_id, state: 'done' });
// -> Final image result is handled in response.output_item.done
break;
@@ -892,7 +921,7 @@ export function createOpenAIResponseParserNS(): ChatGenerateParseFunction {
_imageGenerationMimeType(oItem), // infer from output_format echoed in the item
igResult,
igRevisedPrompt || 'Generated image',
'gpt-image-1', // generator
AIX_OAI_DEFAULT_IMAGE_GEN_MODEL, // generator: non-streaming path has no captured tool config, use current default
igRevisedPrompt || '', // prompt used
);
else
@@ -1036,6 +1065,22 @@ function _forwardTextAnnotation(pt: IParticleTransmitter, annotation: Exclude<Ex
}
}
/**
* Builds a concise " (model · size · quality · format)" suffix from the image-gen tool
* config captured at response.created, so in-flight status messages carry real info
* instead of a generic "Generating image...".
*/
function _prettyImageGenConfigSuffix(cfg: TImageGenToolCfg | undefined): string {
  // no tool config captured -> no suffix
  if (!cfg) return '';
  // candidate fragments in display order; 'auto' values carry no information and are dropped
  const candidates: (string | undefined)[] = [
    cfg.model,
    cfg.size !== 'auto' ? cfg.size : undefined,
    cfg.quality !== 'auto' ? cfg.quality : undefined,
    cfg.output_format,
    cfg.background !== 'auto' ? cfg.background : undefined,
  ];
  const fragments = candidates.filter((f): f is string => !!f);
  // e.g. " (gpt-image-2 · 1536x1024 · medium · webp)"
  return !fragments.length ? '' : ` (${fragments.join(' · ')})`;
}
/**
* Infers the mime type from the image_generation_call output item's output_format field.
* The API echoes the output_format in the done item (e.g. 'png', 'webp', 'jpeg').
@@ -830,18 +830,26 @@ export namespace OpenAIWire_API_Chat_Completions {
//
export namespace OpenAIWire_API_Images_Generations {
/** GPT Image family models - shared between this namespace, Images_Edits, and the Responses image_generation tool. */
export const GptImageModels_schema = z.enum([
'gpt-image-2',
'gpt-image-1.5',
'gpt-image-1',
'gpt-image-1-mini',
]);
export type Request = z.infer<typeof Request_schema>;
const Request_schema = z.object({
// 32,000 for gpt-image-1.5/gpt-image-1/gpt-image-1-mini, 4,000 for dall-e-3, 1,000 for dall-e-2
// 32,000 for gpt-image family, 4,000 for dall-e-3, 1,000 for dall-e-2
prompt: z.string().max(32000),
model: z.enum([
'gpt-image-1.5',
'gpt-image-1',
'gpt-image-1-mini',
'dall-e-3',
'dall-e-2', // default
model: z.union([
GptImageModels_schema,
z.enum([
'dall-e-3',
'dall-e-2', // default
]),
]).optional(),
// The number of images to generate. Must be between 1 and 10. For dall-e-3, only n=1 is supported.
@@ -850,7 +858,7 @@ export namespace OpenAIWire_API_Images_Generations {
// Image quality
quality: z.enum([
'auto', // default
'high', 'medium', 'low', // gpt-image-1.5, gpt-image-1, gpt-image-1-mini
'high', 'medium', 'low', // gpt-image
'hd', 'standard', // dall-e-3: hd | standard, dall-e-2: only standard
]).optional(),
@@ -876,7 +884,7 @@ export namespace OpenAIWire_API_Images_Generations {
user: z.string().optional(),
// -- GPT Image Family Specific Parameters (gpt-image-1.5, gpt-image-1, gpt-image-1-mini) --
// -- GPT Image specific parameters --
// Allows to set transparency (in that case, format = png or webp)
background: z.enum(['transparent', 'opaque', 'auto' /* default */]).optional(),
@@ -907,7 +915,7 @@ export namespace OpenAIWire_API_Images_Generations {
url: z.url().optional(), // if the response_format is 'url' - DEPRECATED
})),
// GPT Image models only (gpt-image-1.5, gpt-image-1, gpt-image-1-mini)
// gpt-image only
usage: z.object({
total_tokens: z.number(),
input_tokens: z.number() // images + text tokens in the input prompt
@@ -935,14 +943,17 @@ export namespace OpenAIWire_API_Images_Edits {
*/
export const Request_schema = z.object({
// 32,000 for gpt-image-1.5/gpt-image-1/gpt-image-1-mini, 1,000 for dall-e-2
// 32,000 for gpt-image, 1,000 for dall-e-2
prompt: z.string().max(32000),
// image: file | file[] - REQUIRED - Handled as file uploads in FormData ('image' field)
// mask: file - OPTIONAL - Handled as file upload in FormData ('mask' field)
model: z.enum(['gpt-image-1.5', 'gpt-image-1', 'gpt-image-1-mini', 'dall-e-2']).optional(),
model: z.union([
OpenAIWire_API_Images_Generations.GptImageModels_schema,
z.enum(['dall-e-2' /* dall-e-3 does not do image edits */]),
]).optional(),
// Number of images to generate, between 1 and 10
n: z.number().min(1).max(10).nullable().optional(),
@@ -950,7 +961,7 @@ export namespace OpenAIWire_API_Images_Edits {
// Image quality
quality: z.enum([
'auto', // default
'high', 'medium', 'low', // gpt-image-1.5, gpt-image-1, gpt-image-1-mini
'high', 'medium', 'low', // gpt-image
'standard', // dall-e-2: only standard
]).optional(),
@@ -1198,17 +1209,20 @@ export namespace OpenAIWire_Responses_Items {
id: z.string(), // unique ID of the image generation call (output item ID)
result: z.string().optional(), // base64 image data when completed
revised_prompt: z.string().optional(), // the revised prompt used for generation
// BREAKING CHANGE from OpenAI - 2025-09-30
// redefining the following because we need 'generating' too here
// Docs: "in_progress" | "completed" | "generating" | "failed".
// 'incomplete' kept as defensive carryover from OutputItemBase (not doc'd for this item).
status: z.enum([
'generating', // 2025-09-30: seen on OpenAI for `image_generation_call` items
'in_progress', 'completed', 'incomplete',
'in_progress', 'completed', 'generating', 'failed',
'incomplete', // defensive: not in docs for image_generation_call, but harmless to accept
]).optional(),
// Echoed configuration from the tool request - used to infer mime type for the result
// Echoed configuration from the tool request - the API returns these on the done item with
// RESOLVED values (e.g. size:"auto" becomes "1536x1024"). Confirmed live on 2026-04-21 log.
output_format: z.enum(['png' /* default */, 'jpeg', 'webp']).optional(),
// NOTE: we also see the following echoed in the image_generation_call item
// NOTE: we also see the following echoed in the image_generation_call item, but don't make use of them for now
// background: z.enum(['transparent', 'opaque', 'auto' /* default */]).optional(),
// quality: z.enum(['auto', 'high', 'medium', 'low']).optional(),
// size: z.enum(['1024x1024', '1024x1536', '1536x1024', 'auto']).or(z.string()).optional(),
// action: z.enum(['generate', 'edit', 'auto']).or(z.string()).optional(),
});
// const OutputMCPCallItem_schema = _OutputItemBase_schema.extend({
@@ -1434,19 +1448,21 @@ export namespace OpenAIWire_Responses_Tools {
const ImageGenerationTool_schema = z.object({
type: z.literal('image_generation'),
/** Whether to generate a new image or edit an existing image. Default: auto. */
action: z.enum(['generate', 'edit', 'auto']).or(z.string()).optional(),
background: z.enum(['transparent', 'opaque', 'auto']).optional(), // defaults to 'auto'
/**
* Control how much effort the model will exert to match the style and features, especially facial features, of input images.
* Defaults to 'low'.
* Supported for gpt-image-1 / gpt-image-1.5+ (not gpt-image-1-mini). Defaults to 'low'.
*/
input_fidelity: z.enum(['high', 'low']).optional(),
input_image_mask: z.object({
file_id: z.string().optional(), // File ID for the mask image
image_url: z.string().optional(), // Base64-encoded mask image
}).optional(),
/** 'gpt-image-1' (default), leaks suggest also 'gpt-image-0721-mini-alpha' */
model: z.string().optional(),
/** Note: 'low' is unconfirmed here. Defaults to 'auto' */
/** gpt-image family, relaxed with .or(z.string()) for forward-compat with new models. */
model: OpenAIWire_API_Images_Generations.GptImageModels_schema.or(z.string()).optional(),
/** Defaults to 'auto' */
moderation: z.enum(['low', 'auto']).optional(),
output_compression: z.number().min(0).max(100).int().optional(), // defaults to 100
/** One of [png, webp, or jpeg]. Default: png. */
@@ -1455,8 +1471,10 @@ export namespace OpenAIWire_Responses_Tools {
partial_images: z.number().int().min(0).max(3).optional(),
/** Quality of the generated image. Defaults to 'auto' */
quality: z.enum(['low', 'medium', 'high', 'auto']).optional(),
/** The size of the generated image. One of 1024x1024, 1024x1536, 1536x1024, or auto. Default: auto. */
size: z.enum(['1024x1024', '1024x1536', '1536x1024', 'auto']).optional(),
/** Default: auto */
size: z.enum(['1024x1024', '1024x1536', '1536x1024', 'auto']).or(z.string()).optional(),
// Not supported in the request; echoed back by the API, but always 1
// n: z.number().int().optional(),
});
// Code Interpreter tool - runs Python code in a sandboxed container
@@ -1617,6 +1635,13 @@ export namespace OpenAIWire_API_Responses {
model: z.string(), // model used for the response
// echo of requested tools but with all properties values - can be used by the parser to enrich hosted-tool messages
// NOTE: .catch() gracefully degrades to undefined since this is a non-critical enrichment path
tools: z.array(OpenAIWire_Responses_Tools.Tool_schema).optional().catch((ctx) => {
console.warn('[DEV] AIX: OpenAI Responses: unable to parse echoed tools, ignoring:', { tools: ctx.value });
return;
}),
output: z.array(OpenAIWire_Responses_Items.OutputItem_schema),
usage: z.object({
@@ -1649,7 +1674,6 @@ export namespace OpenAIWire_API_Responses {
// temperature: ... // 1
// text: ... // { .. }
// tool_choice: ... // 'auto'
// tools: ... // e.g. [{ type: 'web_search_preview', search_context_size: 'medium', ... }]
// top_logprobs: ... // 0
// top_p: ... // 1
// truncation: ... // 'disabled'
@@ -1851,6 +1875,12 @@ export namespace OpenAIWire_API_Responses {
type: z.literal('response.image_generation_call.partial_image'),
partial_image_b64: z.string(), // base64 partial image
partial_image_index: z.number(), // 0-based index
// NOTE: observed on the wire (not in the OpenAI docs, but consistently echoed), but not used/useful yet.
// these carry the RESOLVED values even when the request used 'auto' (e.g. size:"auto" -> "1536x1024").
// output_format: z.enum(['png', 'jpeg', 'webp']).or(z.string()).optional(),
// background: z.enum(['transparent', 'opaque', 'auto']).or(z.string()).optional(),
// quality: z.enum(['low', 'medium', 'high', 'auto']).or(z.string()).optional(),
// size: z.enum(['1024x1024', '1024x1536', '1536x1024', 'auto']).or(z.string()).optional(),
});
const OutputImageGenerationCallCompletedEvent_schema = _OutputIndexedEvent_schema.extend({