AIX: Gemini Interactions: In/Out of images, Out of Audio

Enrico Ros
2026-04-22 17:16:31 -07:00
parent 98474b2721
commit 996998a5cc
3 changed files with 166 additions and 37 deletions
@@ -8,18 +8,22 @@ import { approxDocPart_To_String, approxInReferenceTo_To_XMLString, aixSpillSyst
type TRequestBody = z.infer<typeof GeminiInteractionsWire_API_Interactions.RequestBody_schema>;
type TTurn = z.infer<typeof GeminiInteractionsWire_API_Interactions.Turn_schema>;
type TTurnContent = TTurn['content']; // string | InputContentPart[]
type TInputPart = z.infer<typeof GeminiInteractionsWire_API_Interactions.InputContentPart_schema>;
/**
* MINIMAL - Build the POST /v1beta/interactions body for Deep Research agents.
* Build the POST /v1beta/interactions body for Deep Research (and future agents).
*
* Scope:
* - Stateless multi-turn: the full `chatSequence` is flattened to role-tagged turns and sent as `input`.
* - `systemMessage` text (if any) is prepended to the first user turn, since the Interactions API for
* background agents does not accept a dedicated `system_instruction`.
* - Text-only content: doc parts are rendered via `approxDocPart_To_String`; in-reference-to XML is prepended to the user turn.
* - Model messages containing only tool invocations/responses/aux (no text) are dropped.
* - Non-text user parts (images, audio, cache-control) are silently dropped.
* - Stateless multi-turn: `chatSequence` is flattened to role-tagged turns and sent as `input`.
* - `systemMessage` text (if any) is prepended to the first user turn; background agents do not
* accept a dedicated `system_instruction`.
* - Multimodal: user and model turns carry images as content-part arrays when any image is present,
* otherwise stay as plain strings (preserves the API's convenience shape).
* - Doc parts render as text via `approxDocPart_To_String`; in-reference-to XML is prepended to the user turn.
* - Model messages containing only tool invocations/responses/aux (no text or images) are dropped.
* - Audio and cache-control parts are silently dropped (unsupported on this path).
*/
export function aixToGeminiInteractionsCreate(model: AixAPI_Model, chatGenerateRaw: AixAPIChatGenerate_Request): TRequestBody {
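For orientation, a minimal sketch of the two content shapes a turn can carry under the new wire types (values illustrative):

```ts
// Illustrative only: TTurnContent is string | InputContentPart[].
const textOnlyTurn: TTurn = { role: 'user', content: 'Summarize our findings.' };
const multimodalTurn: TTurn = {
  role: 'user',
  content: [
    { type: 'text', text: 'What landmark is this?' },
    { type: 'image', data: '<base64 bytes>', mime_type: 'image/jpeg' },
  ],
};
```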
@@ -33,11 +37,11 @@ export function aixToGeminiInteractionsCreate(model: AixAPI_Model, chatGenerateR
const turns: TTurn[] = [];
for (const msg of chatGenerate.chatSequence) {
if (msg.role === 'user') {
const content = _flattenUserParts(msg.parts);
if (content) turns.push({ role: 'user', content });
const content = _buildUserContent(msg.parts);
if (_hasTurnContent(content)) turns.push({ role: 'user', content });
} else if (msg.role === 'model') {
const content = _flattenModelParts(msg.parts);
if (content) turns.push({ role: 'model', content });
const content = _buildModelContent(msg.parts);
if (_hasTurnContent(content)) turns.push({ role: 'model', content });
}
}
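As a worked sketch, a hypothetical three-message sequence flattens like this (the multimodal turn becomes a parts array, the rest stay strings):

```ts
// Hypothetical chatSequence: user text -> model text -> user text + image.
const turns: TTurn[] = [
  { role: 'user', content: 'What is in this image?' },
  { role: 'model', content: 'Please attach the image.' },
  {
    role: 'user',
    content: [
      { type: 'text', text: 'Here it is.' },
      { type: 'image', data: '<base64>', mime_type: 'image/jpeg' },
    ],
  },
];
```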
@@ -48,14 +52,14 @@ export function aixToGeminiInteractionsCreate(model: AixAPI_Model, chatGenerateR
if (systemPrefix) {
const firstUserIdx = turns.findIndex(t => t.role === 'user');
if (firstUserIdx >= 0)
turns[firstUserIdx] = { role: 'user', content: `${systemPrefix}\n\n${turns[firstUserIdx].content}` };
turns[firstUserIdx] = { role: 'user', content: _prependSystemText(turns[firstUserIdx].content, systemPrefix) };
}
// Sanity: the API expects the last turn to be 'user' (we're asking the model to respond)
if (!turns.length || turns[turns.length - 1].role !== 'user')
throw new Error('Gemini Interactions: last turn must be from user (chat sequence empty or ended with a model message)');
// Simplify single-turn to string form (matches the Python/JS SDK convenience shape)
// Simplify single-turn to bare content form (matches the Python/JS SDK convenience shape)
const input: TRequestBody['input'] = (turns.length === 1 && turns[0].role === 'user')
? turns[0].content
: turns;
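Concretely, the collapse only fires for a lone user turn (a sketch):

```ts
// Single-turn history -> bare content (string or parts array), not a one-element array.
const single: TTurn[] = [{ role: 'user', content: 'Research topic X.' }];
const input = single.length === 1 && single[0].role === 'user' ? single[0].content : single;
// input === 'Research topic X.'
```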
@@ -95,23 +99,26 @@ function _collectSystemText(systemMessage: AixAPIChatGenerate_Request['systemMes
return chunks.join('\n').trim();
}
function _flattenUserParts(parts: Extract<AixAPIChatGenerate_Request['chatSequence'][number], { role: 'user' }>['parts']): string {
const chunks: string[] = [];
function _buildUserContent(parts: Extract<AixAPIChatGenerate_Request['chatSequence'][number], { role: 'user' }>['parts']): TTurnContent {
const textChunks: string[] = [];
const prefixChunks: string[] = []; // in-reference-to goes before body
const images: TInputPart[] = [];
for (const part of parts) {
switch (part.pt) {
case 'text':
chunks.push(part.text);
textChunks.push(part.text);
break;
case 'doc':
chunks.push(approxDocPart_To_String(part));
textChunks.push(approxDocPart_To_String(part));
break;
case 'meta_in_reference_to':
const irt = approxInReferenceTo_To_XMLString(part);
if (irt) prefixChunks.push(irt);
break;
case 'inline_image':
images.push({ type: 'image', data: part.base64, mime_type: part.mimeType });
break;
case 'meta_cache_control':
break; // unsupported here; dropped
default:
@@ -119,26 +126,64 @@ function _flattenUserParts(parts: Extract<AixAPIChatGenerate_Request['chatSequen
}
}
return [...prefixChunks, ...chunks].join('\n\n').trim();
const text = [...prefixChunks, ...textChunks].join('\n\n').trim();
// text-only turn: return string (API convenience shape)
if (!images.length) return text;
// multimodal turn: emit as content-parts array; text first, then images (matches generateContent convention)
const contentParts: TInputPart[] = [];
if (text) contentParts.push({ type: 'text', text });
contentParts.push(...images);
return contentParts;
}
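Usage sketch for `_buildUserContent` (hypothetical parts; `pt`, `base64`, and `mimeType` are the AIX field names the switch reads, and the in-reference-to XML shown is illustrative):

```ts
_buildUserContent([
  { pt: 'meta_in_reference_to' /* ... */ }, // -> XML prefix chunk
  { pt: 'text', text: 'Compare these.' },
  { pt: 'inline_image', base64: '<b64>', mimeType: 'image/png' },
] as any); // sketch: exact AIX part types elided
// -> [ { type: 'text', text: '<in-reference-to>...</in-reference-to>\n\nCompare these.' },
//      { type: 'image', data: '<b64>', mime_type: 'image/png' } ]
```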
function _flattenModelParts(parts: Extract<AixAPIChatGenerate_Request['chatSequence'][number], { role: 'model' }>['parts']): string {
const chunks: string[] = [];
function _buildModelContent(parts: Extract<AixAPIChatGenerate_Request['chatSequence'][number], { role: 'model' }>['parts']): TTurnContent {
const textChunks: string[] = [];
const images: TInputPart[] = [];
for (const part of parts) {
switch (part.pt) {
case 'text':
chunks.push(part.text);
textChunks.push(part.text);
break;
case 'inline_image':
// model-authored images (e.g. from a prior generation) - replay as context
images.push({ type: 'image', data: part.base64, mime_type: part.mimeType });
break;
case 'inline_audio':
case 'inline_image':
case 'tool_invocation':
case 'tool_response':
case 'ma': // model aux (reasoning, etc.)
case 'meta_cache_control':
break; // drop non-text model output for Deep Research replays
break; // drop non-text/image model output for Deep Research replays
default:
const _exhaustive: never = part;
}
}
return chunks.join('\n\n').trim();
const text = textChunks.join('\n\n').trim();
if (!images.length) return text;
const contentParts: TInputPart[] = [];
if (text) contentParts.push({ type: 'text', text });
contentParts.push(...images);
return contentParts;
}
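And the model-side counterpart, replaying a prior generation (sketch):

```ts
_buildModelContent([
  { pt: 'text', text: 'Here is the generated logo.' },
  { pt: 'inline_image', base64: '<base64>', mimeType: 'image/png' },
] as any); // sketch: exact AIX part types elided
// -> [ { type: 'text', text: 'Here is the generated logo.' },
//      { type: 'image', data: '<base64>', mime_type: 'image/png' } ]
```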
// -- helpers --
function _hasTurnContent(content: TTurnContent): boolean {
return content.length > 0; // same check for both shapes: non-empty string or non-empty parts array
}
function _prependSystemText(content: TTurnContent, systemPrefix: string): TTurnContent {
if (typeof content === 'string')
return `${systemPrefix}\n\n${content}`;
// multimodal: inject a text part at the front, or fold into the leading text part if present
if (content.length > 0 && content[0].type === 'text')
return [{ type: 'text', text: `${systemPrefix}\n\n${content[0].text}` }, ...content.slice(1)];
return [{ type: 'text', text: systemPrefix }, ...content];
}
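The three `_prependSystemText` branches, traced with illustrative values:

```ts
const img = { type: 'image', data: '<b64>', mime_type: 'image/png' } as const;
_prependSystemText('Question?', 'SYS');                        // -> 'SYS\n\nQuestion?'
_prependSystemText([{ type: 'text', text: 'Q' }, img], 'SYS'); // -> [{ type: 'text', text: 'SYS\n\nQ' }, img]
_prependSystemText([img], 'SYS');                              // -> [{ type: 'text', text: 'SYS' }, img]
```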
@@ -5,6 +5,7 @@ import type { ChatGenerateParseFunction } from '../chatGenerate.dispatch';
import type { IParticleTransmitter } from './IParticleTransmitter';
import { GeminiInteractionsWire_API_Interactions } from '../../wiretypes/gemini.interactions.wiretypes';
import { geminiConvertPCM2WAV } from './gemini.audioutils';
// Kill-switch: drop url_citation annotations - Deep Research ships opaque grounding-redirect URLs with no titles, and the text already contains a numbered source list.
@@ -16,7 +17,7 @@ type TUsage = NonNullable<TInteraction['usage']>;
/**
* Gemini Interactions API parser (for Deep Research agents).
* Gemini Interactions API parser (for Deep Research and future multimodal agents).
*
* Each SSE frame carries a *full* Interaction snapshot (from POST or from a GET poll).
* The parser diffs against prior state and emits only new content.
@@ -24,7 +25,11 @@ type TUsage = NonNullable<TInteraction['usage']>;
* Emission rules per output type:
* - `text` -> `pt.appendText(newSuffix)`. New url_citation annotations are emitted once.
* - `thought` -> `pt.appendReasoningText(newSuffix)`; signatures recorded via `setReasoningSignature`.
* - any other type -> ignored (Deep Research primarily emits text + thought).
* - `image` -> `pt.appendImageInline(...)` once per index (images are whole, not incremental).
* URI-only variants emit a visible note + `console.warn` (not yet wired as fetches).
* - `audio` -> PCM -> WAV via `geminiConvertPCM2WAV`, then `pt.appendAudioInline(...)` once per index.
* - unknown types -> `console.warn` + inline `_Unsupported content block: <type>_` note, once per index.
* Non-terminating: Deep Research streams are long-lived and must not blow up on new blocks.
*
* Part boundaries: when the output type at a given index changes kind (e.g. thought -> text),
* we call `endMessagePart()` so the transmitter flushes the previous part cleanly.
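The snapshot-diff core, reduced to a standalone sketch (hypothetical helper; the real parser keeps this counter per output index):

```ts
// Each SSE frame repeats the FULL text so far; emit only the unseen suffix.
function emitNewSuffix(full: string, state: { emittedTextLen: number }, emit: (s: string) => void): void {
  if (full.length <= state.emittedTextLen) return; // nothing new in this snapshot
  emit(full.slice(state.emittedTextLen));
  state.emittedTextLen = full.length;
}
```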
@@ -42,10 +47,12 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
// per-index emission state (array index in `outputs[]`)
type EmittedState = {
kind: 'text' | 'thought' | 'other';
kind: 'text' | 'thought' | 'image' | 'audio' | 'other';
emittedTextLen: number;
emittedCitationKeys: Set<string>; // `${url}@${start}-${end}` to de-dupe
signatureSent: boolean;
mediaEmitted: boolean; // image/audio: emit only once (whole, not incremental)
otherWarned: boolean; // unknown type: warn only once per index
};
const emitted: EmittedState[] = [];
let lastOpenIdx = -1; // index of the most recently opened part; -1 = none
@@ -84,17 +91,25 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
timeToFirstEvent = Date.now() - parserCreationTimestamp;
// process outputs (may be absent on early in_progress frames).
// Each raw output is classified via Zod safeParse against a discriminated union; unknown
// shapes fall through to `kind: 'other'` and are silently ignored.
// Each raw output is classified via Zod safeParse against a discriminated union.
// - Untyped/empty placeholders (`{}`, no `type` field) are skipped silently without creating
// state, so a later snapshot that populates them can classify cleanly.
// - Typed-but-unknown shapes warn once per index with a visible note (non-terminating).
const outputs = interaction.outputs ?? [];
for (let i = 0; i < outputs.length; i++) {
const classified = GeminiInteractionsWire_API_Interactions.KnownOutput_schema.safeParse(outputs[i]);
const raw = outputs[i] as { type?: unknown };
const rawType = typeof raw?.type === 'string' ? raw.type : null;
// skip not-yet-populated placeholder blocks silently (Deep Research pre-allocates slots)
if (rawType === null) continue;
const classified = GeminiInteractionsWire_API_Interactions.KnownOutput_schema.safeParse(raw);
const kind: EmittedState['kind'] = !classified.success ? 'other' : classified.data.type;
// first time we see this index: initialize + flush previous part if switching kinds
let state = emitted[i];
if (!state) {
state = { kind, emittedTextLen: 0, emittedCitationKeys: new Set(), signatureSent: false };
state = { kind, emittedTextLen: 0, emittedCitationKeys: new Set(), signatureSent: false, mediaEmitted: false, otherWarned: false };
emitted[i] = state;
// close previous part if we're opening a new index (natural part boundary)
@@ -103,7 +118,15 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
lastOpenIdx = i;
}
if (!classified.success) continue; // 'other': ignored for now
// 'other': warn once per index with visible note, then continue
if (!classified.success) {
if (!state.otherWarned) {
console.warn(`[GeminiInteractions] unsupported output type: ${rawType}`, raw);
pt.appendText(`\n_Unsupported content block: ${rawType}_\n`);
state.otherWarned = true;
}
continue;
}
const out = classified.data;
if (out.type === 'text') {
@@ -123,7 +146,7 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
pt.appendUrlCitation(ann.title || ann.url, ann.url, undefined, ann.start_index, ann.end_index, undefined, undefined);
}
}
} else /* out.type === 'thought' */ {
} else if (out.type === 'thought') {
const summary = out.summary ?? '';
if (summary.length > state.emittedTextLen) {
pt.appendReasoningText(summary.slice(state.emittedTextLen));
@@ -133,6 +156,31 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
pt.setReasoningSignature(out.signature);
state.signatureSent = true;
}
} else if (out.type === 'image') {
if (!state.mediaEmitted) {
if (out.data) {
pt.appendImageInline(out.mime_type, out.data, 'Gemini Generated Image', 'Gemini', '');
} else if (out.uri) {
// URI-hosted images aren't fetched here (yet); surface the link inline
console.warn('[GeminiInteractions] image output via URI is not yet fetched inline:', out.uri);
pt.appendText(`\n[Image: ${out.uri}]\n`);
} else {
console.warn('[GeminiInteractions] image output with neither data nor uri:', out);
pt.appendText(`\n_Image block without payload_\n`);
}
state.mediaEmitted = true;
}
} else /* out.type === 'audio' */ {
if (!state.mediaEmitted) {
try {
const wav = geminiConvertPCM2WAV(out.mime_type, out.data);
pt.appendAudioInline(wav.mimeType, wav.base64Data, 'Gemini Generated Audio', 'Gemini', wav.durationMs);
} catch (error) {
console.warn('[GeminiInteractions] audio convert failed:', error);
pt.appendText(`\n_Audio conversion failed: ${String(error)}_\n`);
}
state.mediaEmitted = true;
}
}
}
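For orientation, wrapping raw PCM in a WAV container (what a converter like `geminiConvertPCM2WAV` conceptually does; the shipped one lives in `gemini.audioutils`) is just prepending a 44-byte RIFF header. A sketch, assuming 16-bit little-endian PCM:

```ts
// Sketch only, not the shipped implementation. Assumes 16-bit LE PCM.
function pcmToWav(pcm: Uint8Array, sampleRate: number, channels = 1): Uint8Array {
  const bytesPerSample = 2;
  const header = new ArrayBuffer(44);
  const v = new DataView(header);
  const tag = (off: number, s: string) => [...s].forEach((c, i) => v.setUint8(off + i, c.charCodeAt(0)));
  tag(0, 'RIFF'); v.setUint32(4, 36 + pcm.length, true); tag(8, 'WAVE');
  tag(12, 'fmt '); v.setUint32(16, 16, true);                    // fmt chunk size
  v.setUint16(20, 1, true); v.setUint16(22, channels, true);     // PCM format, channels
  v.setUint32(24, sampleRate, true);
  v.setUint32(28, sampleRate * channels * bytesPerSample, true); // byte rate
  v.setUint16(32, channels * bytesPerSample, true);              // block align
  v.setUint16(34, 16, true);                                     // bits per sample
  tag(36, 'data'); v.setUint32(40, pcm.length, true);
  const wav = new Uint8Array(44 + pcm.length);
  wav.set(new Uint8Array(header), 0);
  wav.set(pcm, 44);
  return wav;
}
```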
@@ -26,17 +26,36 @@ export namespace GeminiInteractionsWire_API_Interactions {
// -- Request Body (POST /v1beta/interactions) --
// Multimodal content parts (used when a turn carries images/audio in addition to text).
// Single-modal text turns stay as a plain string to match the API's convenience shape.
const InputTextPart_schema = z.object({
type: z.literal('text'),
text: z.string(),
});
const InputImagePart_schema = z.object({
type: z.literal('image'),
data: z.string(), // base64-encoded bytes
mime_type: z.string(), // e.g. 'image/png', 'image/jpeg', 'image/webp'
});
export const InputContentPart_schema = z.discriminatedUnion('type', [
InputTextPart_schema,
InputImagePart_schema,
]);
// A turn in a stateless multi-turn conversation (when `input` is an array).
// Content is kept as a plain string for now; the API also accepts a list of content objects for multimodal.
export const Turn_schema = z.object({
role: z.enum(['user', 'model']),
content: z.string(),
content: z.union([
z.string(), // text-only turn (API convenience shape)
z.array(InputContentPart_schema), // multimodal turn
]),
});
export const RequestBody_schema = z.object({
agent: z.string(), // e.g. 'deep-research-pro-preview-12-2025' (note: we send bare id, without 'models/' prefix)
input: z.union([
z.string(), // single-turn convenience
z.string(), // single-turn text convenience
z.array(InputContentPart_schema), // single-turn multimodal
z.array(Turn_schema), // stateless multi-turn history
]),
background: z.literal(true), // required for agents
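An illustrative multimodal single-turn body under these schemas (agent id echoes the comment above; any required fields outside this hunk are elided):

```ts
const body = {
  agent: 'deep-research-pro-preview-12-2025', // bare id, no 'models/' prefix
  input: [
    { type: 'text', text: 'Research the history of this landmark.' },
    { type: 'image', data: '<base64>', mime_type: 'image/jpeg' },
  ],
  background: true as const,
};
```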
@@ -79,10 +98,27 @@ export namespace GeminiInteractionsWire_API_Interactions {
signature: z.string().optional(),
});
/** Discriminated union of output shapes we act on. Anything else: safeParse fails -> parser skips. */
const ImageOutput_schema = z.object({
type: z.literal('image'),
// API may return inline bytes (`data` + `mime_type`) or a URI. We accept both shapes;
// the parser prefers inline and falls back to a URI note when only `uri` is present.
data: z.string().optional(), // base64-encoded bytes
uri: z.string().optional(),
mime_type: z.string(),
});
const AudioOutput_schema = z.object({
type: z.literal('audio'),
data: z.string(), // base64-encoded bytes (Gemini serves PCM; parser converts to WAV)
mime_type: z.string(), // e.g. 'audio/L16;codec=pcm;rate=24000'
});
/** Discriminated union of output shapes we act on. Anything else: safeParse fails -> parser warns once per index. */
export const KnownOutput_schema = z.discriminatedUnion('type', [
TextOutput_schema,
ThoughtOutput_schema,
ImageOutput_schema,
AudioOutput_schema,
]);
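Classification behavior, sketched:

```ts
// Known shapes parse; anything else fails safeParse and the parser warns once per index.
KnownOutput_schema.safeParse({ type: 'image', mime_type: 'image/png', data: 'AA==' }).success; // true
KnownOutput_schema.safeParse({ type: 'video', uri: 'https://example.com/v' }).success;        // false -> 'other'
```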