From 996998a5cc1e790fa84bc36cc7326617e68c9fac Mon Sep 17 00:00:00 2001 From: Enrico Ros Date: Wed, 22 Apr 2026 17:16:31 -0700 Subject: [PATCH] AIX: Gemini Interactions: In/Out of images, Out of Audio --- .../adapters/gemini.interactionsCreate.ts | 93 ++++++++++++++----- .../parsers/gemini.interactions.parser.ts | 66 +++++++++++-- .../gemini.interactions.wiretypes.ts | 44 ++++++++- 3 files changed, 166 insertions(+), 37 deletions(-) diff --git a/src/modules/aix/server/dispatch/chatGenerate/adapters/gemini.interactionsCreate.ts b/src/modules/aix/server/dispatch/chatGenerate/adapters/gemini.interactionsCreate.ts index e1fe89ef0..a5bcc0046 100644 --- a/src/modules/aix/server/dispatch/chatGenerate/adapters/gemini.interactionsCreate.ts +++ b/src/modules/aix/server/dispatch/chatGenerate/adapters/gemini.interactionsCreate.ts @@ -8,18 +8,22 @@ import { approxDocPart_To_String, approxInReferenceTo_To_XMLString, aixSpillSyst type TRequestBody = z.infer; type TTurn = z.infer; +type TTurnContent = TTurn['content']; // string | InputContentPart[] +type TInputPart = z.infer; /** - * MINIMAL - Build the POST /v1beta/interactions body for Deep Research agents. + * Build the POST /v1beta/interactions body for Deep Research (and future agents). * * Scope: - * - Stateless multi-turn: the full `chatSequence` is flattened to role-tagged turns and sent as `input`. - * - `systemMessage` text (if any) is prepended to the first user turn, since the Interactions API for - * background agents does not accept a dedicated `system_instruction`. - * - Text-only content: doc parts are rendered via `approxDocPart_To_String`; in-reference-to XML is prepended to the user turn. - * - Model messages containing only tool invocations/responses/aux (no text) are dropped. - * - Non-text user parts (images, audio, cache-control) are silently dropped. + * - Stateless multi-turn: `chatSequence` is flattened to role-tagged turns and sent as `input`. + * - `systemMessage` text (if any) is prepended to the first user turn; background agents do not + * accept a dedicated `system_instruction`. + * - Multimodal: user and model turns carry images as content-part arrays when any image is present, + * otherwise stay as plain strings (preserves the API's convenience shape). + * - Doc parts render as text via `approxDocPart_To_String`; in-reference-to XML is prepended to the user turn. + * - Model messages containing only tool invocations/responses/aux (no text or images) are dropped. + * - Audio and cache-control parts are silently dropped (unsupported on this path). */ export function aixToGeminiInteractionsCreate(model: AixAPI_Model, chatGenerateRaw: AixAPIChatGenerate_Request): TRequestBody { @@ -33,11 +37,11 @@ export function aixToGeminiInteractionsCreate(model: AixAPI_Model, chatGenerateR const turns: TTurn[] = []; for (const msg of chatGenerate.chatSequence) { if (msg.role === 'user') { - const content = _flattenUserParts(msg.parts); - if (content) turns.push({ role: 'user', content }); + const content = _buildUserContent(msg.parts); + if (_hasTurnContent(content)) turns.push({ role: 'user', content }); } else if (msg.role === 'model') { - const content = _flattenModelParts(msg.parts); - if (content) turns.push({ role: 'model', content }); + const content = _buildModelContent(msg.parts); + if (_hasTurnContent(content)) turns.push({ role: 'model', content }); } } @@ -48,14 +52,14 @@ export function aixToGeminiInteractionsCreate(model: AixAPI_Model, chatGenerateR if (systemPrefix) { const firstUserIdx = turns.findIndex(t => t.role === 'user'); if (firstUserIdx >= 0) - turns[firstUserIdx] = { role: 'user', content: `${systemPrefix}\n\n${turns[firstUserIdx].content}` }; + turns[firstUserIdx] = { role: 'user', content: _prependSystemText(turns[firstUserIdx].content, systemPrefix) }; } // Sanity: the API expects the last turn to be 'user' (we're asking the model to respond) if (turns[turns.length - 1].role !== 'user') throw new Error('Gemini Interactions: last turn must be from user (chat sequence ended with a model message)'); - // Simplify single-turn to string form (matches the Python/JS SDK convenience shape) + // Simplify single-turn to bare content form (matches the Python/JS SDK convenience shape) const input: TRequestBody['input'] = (turns.length === 1 && turns[0].role === 'user') ? turns[0].content : turns; @@ -95,23 +99,26 @@ function _collectSystemText(systemMessage: AixAPIChatGenerate_Request['systemMes return chunks.join('\n').trim(); } -function _flattenUserParts(parts: Extract['parts']): string { - const chunks: string[] = []; +function _buildUserContent(parts: Extract['parts']): TTurnContent { + const textChunks: string[] = []; const prefixChunks: string[] = []; // in-reference-to goes before body + const images: TInputPart[] = []; for (const part of parts) { switch (part.pt) { case 'text': - chunks.push(part.text); + textChunks.push(part.text); break; case 'doc': - chunks.push(approxDocPart_To_String(part)); + textChunks.push(approxDocPart_To_String(part)); break; case 'meta_in_reference_to': const irt = approxInReferenceTo_To_XMLString(part); if (irt) prefixChunks.push(irt); break; case 'inline_image': + images.push({ type: 'image', data: part.base64, mime_type: part.mimeType }); + break; case 'meta_cache_control': break; // unsupported here; dropped default: @@ -119,26 +126,64 @@ function _flattenUserParts(parts: Extract['parts']): string { - const chunks: string[] = []; +function _buildModelContent(parts: Extract['parts']): TTurnContent { + const textChunks: string[] = []; + const images: TInputPart[] = []; + for (const part of parts) { switch (part.pt) { case 'text': - chunks.push(part.text); + textChunks.push(part.text); + break; + case 'inline_image': + // model-authored images (e.g. from a prior generation) - replay as context + images.push({ type: 'image', data: part.base64, mime_type: part.mimeType }); break; case 'inline_audio': - case 'inline_image': case 'tool_invocation': case 'tool_response': case 'ma': // model aux (reasoning, etc.) case 'meta_cache_control': - break; // drop non-text model output for Deep Research replays + break; // drop non-text/image model output for Deep Research replays default: const _exhaustive: never = part; } } - return chunks.join('\n\n').trim(); + + const text = textChunks.join('\n\n').trim(); + + if (!images.length) return text; + + const contentParts: TInputPart[] = []; + if (text) contentParts.push({ type: 'text', text }); + contentParts.push(...images); + return contentParts; +} + + +// -- helpers -- + +function _hasTurnContent(content: TTurnContent): boolean { + return typeof content === 'string' ? content.length > 0 : content.length > 0; +} + +function _prependSystemText(content: TTurnContent, systemPrefix: string): TTurnContent { + if (typeof content === 'string') + return `${systemPrefix}\n\n${content}`; + // multimodal: inject a text part at the front, or fold into the leading text part if present + if (content.length > 0 && content[0].type === 'text') + return [{ type: 'text', text: `${systemPrefix}\n\n${content[0].text}` }, ...content.slice(1)]; + return [{ type: 'text', text: systemPrefix }, ...content]; } diff --git a/src/modules/aix/server/dispatch/chatGenerate/parsers/gemini.interactions.parser.ts b/src/modules/aix/server/dispatch/chatGenerate/parsers/gemini.interactions.parser.ts index f792f0f38..4ff18d7cc 100644 --- a/src/modules/aix/server/dispatch/chatGenerate/parsers/gemini.interactions.parser.ts +++ b/src/modules/aix/server/dispatch/chatGenerate/parsers/gemini.interactions.parser.ts @@ -5,6 +5,7 @@ import type { ChatGenerateParseFunction } from '../chatGenerate.dispatch'; import type { IParticleTransmitter } from './IParticleTransmitter'; import { GeminiInteractionsWire_API_Interactions } from '../../wiretypes/gemini.interactions.wiretypes'; +import { geminiConvertPCM2WAV } from './gemini.audioutils'; // Kill-switch: drop url_citation annotations - Deep Research ships opaque grounding-redirect URLs with no titles, and the text already contains a numbered source list. @@ -16,7 +17,7 @@ type TUsage = NonNullable; /** - * Gemini Interactions API parser (for Deep Research agents). + * Gemini Interactions API parser (for Deep Research and future multimodal agents). * * Each SSE frame carries a *full* Interaction snapshot (from POST or from a GET poll). * The parser diffs against prior state and emits only new content. @@ -24,7 +25,11 @@ type TUsage = NonNullable; * Emission rules per output type: * - `text` -> `pt.appendText(newSuffix)`. New url_citation annotations are emitted once. * - `thought` -> `pt.appendReasoningText(newSuffix)`; signatures recorded via `setReasoningSignature`. - * - any other type -> ignored (Deep Research primarily emits text + thought). + * - `image` -> `pt.appendImageInline(...)` once per index (images are whole, not incremental). + * URI-only variants emit a visible note + `console.warn` (not yet wired as fetches). + * - `audio` -> PCM -> WAV via `geminiConvertPCM2WAV`, then `pt.appendAudioInline(...)` once per index. + * - unknown types -> `console.warn` + inline `_Unsupported content block: _` note, once per index. + * Non-terminating: Deep Research streams are long-lived and must not blow up on new blocks. * * Part boundaries: when the output type at a given index changes kind (e.g. thought -> text), * we call `endMessagePart()` so the transmitter flushes the previous part cleanly. @@ -42,10 +47,12 @@ export function createGeminiInteractionsParser(requestedModelName: string | null // per-index emission state (array index in `outputs[]`) type EmittedState = { - kind: 'text' | 'thought' | 'other'; + kind: 'text' | 'thought' | 'image' | 'audio' | 'other'; emittedTextLen: number; emittedCitationKeys: Set; // `${url}@${start}-${end}` to de-dupe signatureSent: boolean; + mediaEmitted: boolean; // image/audio: emit only once (whole, not incremental) + otherWarned: boolean; // unknown type: warn only once per index }; const emitted: EmittedState[] = []; let lastOpenIdx = -1; // index of the most recently opened part; -1 = none @@ -84,17 +91,25 @@ export function createGeminiInteractionsParser(requestedModelName: string | null timeToFirstEvent = Date.now() - parserCreationTimestamp; // process outputs (may be absent on early in_progress frames). - // Each raw output is classified via Zod safeParse against a discriminated union; unknown - // shapes fall through to `kind: 'other'` and are silently ignored. + // Each raw output is classified via Zod safeParse against a discriminated union. + // - Untyped/empty placeholders (`{}`, no `type` field) are skipped silently without creating + // state, so a later snapshot that populates them can classify cleanly. + // - Typed-but-unknown shapes warn once per index with a visible note (non-terminating). const outputs = interaction.outputs ?? []; for (let i = 0; i < outputs.length; i++) { - const classified = GeminiInteractionsWire_API_Interactions.KnownOutput_schema.safeParse(outputs[i]); + const raw = outputs[i] as { type?: unknown }; + const rawType = typeof raw?.type === 'string' ? raw.type : null; + + // skip not-yet-populated placeholder blocks silently (Deep Research pre-allocates slots) + if (rawType === null) continue; + + const classified = GeminiInteractionsWire_API_Interactions.KnownOutput_schema.safeParse(raw); const kind: EmittedState['kind'] = !classified.success ? 'other' : classified.data.type; // first time we see this index: initialize + flush previous part if switching kinds let state = emitted[i]; if (!state) { - state = { kind, emittedTextLen: 0, emittedCitationKeys: new Set(), signatureSent: false }; + state = { kind, emittedTextLen: 0, emittedCitationKeys: new Set(), signatureSent: false, mediaEmitted: false, otherWarned: false }; emitted[i] = state; // close previous part if we're opening a new index (natural part boundary) @@ -103,7 +118,15 @@ export function createGeminiInteractionsParser(requestedModelName: string | null lastOpenIdx = i; } - if (!classified.success) continue; // 'other': ignored for now + // 'other': warn once per index with visible note, then continue + if (!classified.success) { + if (!state.otherWarned) { + console.warn(`[GeminiInteractions] unsupported output type: ${rawType}`, raw); + pt.appendText(`\n_Unsupported content block: ${rawType}_\n`); + state.otherWarned = true; + } + continue; + } const out = classified.data; if (out.type === 'text') { @@ -123,7 +146,7 @@ export function createGeminiInteractionsParser(requestedModelName: string | null pt.appendUrlCitation(ann.title || ann.url, ann.url, undefined, ann.start_index, ann.end_index, undefined, undefined); } } - } else /* out.type === 'thought' */ { + } else if (out.type === 'thought') { const summary = out.summary ?? ''; if (summary.length > state.emittedTextLen) { pt.appendReasoningText(summary.slice(state.emittedTextLen)); @@ -133,6 +156,31 @@ export function createGeminiInteractionsParser(requestedModelName: string | null pt.setReasoningSignature(out.signature); state.signatureSent = true; } + } else if (out.type === 'image') { + if (!state.mediaEmitted) { + if (out.data) { + pt.appendImageInline(out.mime_type, out.data, 'Gemini Generated Image', 'Gemini', ''); + } else if (out.uri) { + // URI-hosted images aren't fetched here (yet); surface the link inline + console.warn('[GeminiInteractions] image output via URI is not yet fetched inline:', out.uri); + pt.appendText(`\n[Image: ${out.uri}]\n`); + } else { + console.warn('[GeminiInteractions] image output with neither data nor uri:', out); + pt.appendText(`\n_Image block without payload_\n`); + } + state.mediaEmitted = true; + } + } else /* out.type === 'audio' */ { + if (!state.mediaEmitted) { + try { + const wav = geminiConvertPCM2WAV(out.mime_type, out.data); + pt.appendAudioInline(wav.mimeType, wav.base64Data, 'Gemini Generated Audio', 'Gemini', wav.durationMs); + } catch (error) { + console.warn('[GeminiInteractions] audio convert failed:', error); + pt.appendText(`\n_Audio conversion failed: ${String(error)}_\n`); + } + state.mediaEmitted = true; + } } } diff --git a/src/modules/aix/server/dispatch/wiretypes/gemini.interactions.wiretypes.ts b/src/modules/aix/server/dispatch/wiretypes/gemini.interactions.wiretypes.ts index 6149b6b04..fd4dea041 100644 --- a/src/modules/aix/server/dispatch/wiretypes/gemini.interactions.wiretypes.ts +++ b/src/modules/aix/server/dispatch/wiretypes/gemini.interactions.wiretypes.ts @@ -26,17 +26,36 @@ export namespace GeminiInteractionsWire_API_Interactions { // -- Request Body (POST /v1beta/interactions) -- + // Multimodal content parts (used when a turn carries images/audio in addition to text). + // Single-modal text turns stay as a plain string to match the API's convenience shape. + const InputTextPart_schema = z.object({ + type: z.literal('text'), + text: z.string(), + }); + const InputImagePart_schema = z.object({ + type: z.literal('image'), + data: z.string(), // base64-encoded bytes + mime_type: z.string(), // e.g. 'image/png', 'image/jpeg', 'image/webp' + }); + export const InputContentPart_schema = z.discriminatedUnion('type', [ + InputTextPart_schema, + InputImagePart_schema, + ]); + // A turn in a stateless multi-turn conversation (when `input` is an array). - // Content is kept as a plain string for now; the API also accepts a list of content objects for multimodal. export const Turn_schema = z.object({ role: z.enum(['user', 'model']), - content: z.string(), + content: z.union([ + z.string(), // text-only turn (API convenience shape) + z.array(InputContentPart_schema), // multimodal turn + ]), }); export const RequestBody_schema = z.object({ agent: z.string(), // e.g. 'deep-research-pro-preview-12-2025' (note: we send bare id, without 'models/' prefix) input: z.union([ - z.string(), // single-turn convenience + z.string(), // single-turn text convenience + z.array(InputContentPart_schema), // single-turn multimodal z.array(Turn_schema), // stateless multi-turn history ]), background: z.literal(true), // required for agents @@ -79,10 +98,27 @@ export namespace GeminiInteractionsWire_API_Interactions { signature: z.string().optional(), }); - /** Discriminated union of output shapes we act on. Anything else: safeParse fails -> parser skips. */ + const ImageOutput_schema = z.object({ + type: z.literal('image'), + // API may return inline bytes (`data` + `mime_type`) or a URI. We accept both shapes; + // the parser prefers inline and falls back to a URI note when only `uri` is present. + data: z.string().optional(), // base64-encoded bytes + uri: z.string().optional(), + mime_type: z.string(), + }); + + const AudioOutput_schema = z.object({ + type: z.literal('audio'), + data: z.string(), // base64-encoded bytes (Gemini serves PCM; parser converts to WAV) + mime_type: z.string(), // e.g. 'audio/L16;codec=pcm;rate=24000' + }); + + /** Discriminated union of output shapes we act on. Anything else: safeParse fails -> parser warns once per index. */ export const KnownOutput_schema = z.discriminatedUnion('type', [ TextOutput_schema, ThoughtOutput_schema, + ImageOutput_schema, + AudioOutput_schema, ]);