AIX: Gemini Interactions: In/Out of images, Out of Audio

Enrico Ros
2026-04-22 17:16:31 -07:00
parent 98474b2721
commit 996998a5cc
3 changed files with 166 additions and 37 deletions
@@ -8,18 +8,22 @@ import { approxDocPart_To_String, approxInReferenceTo_To_XMLString, aixSpillSyst
type TRequestBody = z.infer<typeof GeminiInteractionsWire_API_Interactions.RequestBody_schema>;
type TTurn = z.infer<typeof GeminiInteractionsWire_API_Interactions.Turn_schema>;
type TTurnContent = TTurn['content']; // string | InputContentPart[]
type TInputPart = z.infer<typeof GeminiInteractionsWire_API_Interactions.InputContentPart_schema>;
/**
* MINIMAL - Build the POST /v1beta/interactions body for Deep Research agents.
* Build the POST /v1beta/interactions body for Deep Research (and future agents).
*
* Scope:
* - Stateless multi-turn: the full `chatSequence` is flattened to role-tagged turns and sent as `input`.
* - `systemMessage` text (if any) is prepended to the first user turn, since the Interactions API for
* background agents does not accept a dedicated `system_instruction`.
* - Text-only content: doc parts are rendered via `approxDocPart_To_String`; in-reference-to XML is prepended to the user turn.
* - Model messages containing only tool invocations/responses/aux (no text) are dropped.
* - Non-text user parts (images, audio, cache-control) are silently dropped.
* - Stateless multi-turn: `chatSequence` is flattened to role-tagged turns and sent as `input`.
* - `systemMessage` text (if any) is prepended to the first user turn; background agents do not
* accept a dedicated `system_instruction`.
* - Multimodal: user and model turns carry images as content-part arrays when any image is present,
* otherwise stay as plain strings (preserves the API's convenience shape).
* - Doc parts render as text via `approxDocPart_To_String`; in-reference-to XML is prepended to the user turn.
* - Model messages containing only tool invocations/responses/aux (no text or images) are dropped.
* - Audio and cache-control parts are silently dropped (unsupported on this path).
*/
export function aixToGeminiInteractionsCreate(model: AixAPI_Model, chatGenerateRaw: AixAPIChatGenerate_Request): TRequestBody {
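For orientation, a minimal sketch of the two content shapes a turn can carry under the new wire types (values illustrative):

```ts
// Illustrative only: TTurnContent is string | InputContentPart[].
const textOnlyTurn: TTurn = { role: 'user', content: 'Summarize our findings.' };
const multimodalTurn: TTurn = {
  role: 'user',
  content: [
    { type: 'text', text: 'What landmark is this?' },
    { type: 'image', data: '<base64 bytes>', mime_type: 'image/jpeg' },
  ],
};
```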
@@ -33,11 +37,11 @@ export function aixToGeminiInteractionsCreate(model: AixAPI_Model, chatGenerateR
const turns: TTurn[] = [];
for (const msg of chatGenerate.chatSequence) {
if (msg.role === 'user') {
const content = _flattenUserParts(msg.parts);
if (content) turns.push({ role: 'user', content });
const content = _buildUserContent(msg.parts);
if (_hasTurnContent(content)) turns.push({ role: 'user', content });
} else if (msg.role === 'model') {
const content = _flattenModelParts(msg.parts);
if (content) turns.push({ role: 'model', content });
const content = _buildModelContent(msg.parts);
if (_hasTurnContent(content)) turns.push({ role: 'model', content });
}
}
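As a worked sketch, a hypothetical three-message sequence flattens like this (the multimodal turn becomes a parts array, the rest stay strings):

```ts
// Hypothetical chatSequence: user text -> model text -> user text + image.
const turns: TTurn[] = [
  { role: 'user', content: 'What is in this image?' },
  { role: 'model', content: 'Please attach the image.' },
  {
    role: 'user',
    content: [
      { type: 'text', text: 'Here it is.' },
      { type: 'image', data: '<base64>', mime_type: 'image/jpeg' },
    ],
  },
];
```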
@@ -48,14 +52,14 @@ export function aixToGeminiInteractionsCreate(model: AixAPI_Model, chatGenerateR
if (systemPrefix) {
const firstUserIdx = turns.findIndex(t => t.role === 'user');
if (firstUserIdx >= 0)
turns[firstUserIdx] = { role: 'user', content: `${systemPrefix}\n\n${turns[firstUserIdx].content}` };
turns[firstUserIdx] = { role: 'user', content: _prependSystemText(turns[firstUserIdx].content, systemPrefix) };
}
// Sanity: the API expects the last turn to be 'user' (we're asking the model to respond)
if (!turns.length || turns[turns.length - 1].role !== 'user')
throw new Error('Gemini Interactions: last turn must be from user (chat sequence empty or ended with a model message)');
// Simplify single-turn to string form (matches the Python/JS SDK convenience shape)
// Simplify single-turn to bare content form (matches the Python/JS SDK convenience shape)
const input: TRequestBody['input'] = (turns.length === 1 && turns[0].role === 'user')
? turns[0].content
: turns;
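Concretely, the collapse only fires for a lone user turn (a sketch):

```ts
// Single-turn history -> bare content (string or parts array), not a one-element array.
const single: TTurn[] = [{ role: 'user', content: 'Research topic X.' }];
const input = single.length === 1 && single[0].role === 'user' ? single[0].content : single;
// input === 'Research topic X.'
```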
@@ -95,23 +99,26 @@ function _collectSystemText(systemMessage: AixAPIChatGenerate_Request['systemMes
return chunks.join('\n').trim();
}
function _flattenUserParts(parts: Extract<AixAPIChatGenerate_Request['chatSequence'][number], { role: 'user' }>['parts']): string {
const chunks: string[] = [];
function _buildUserContent(parts: Extract<AixAPIChatGenerate_Request['chatSequence'][number], { role: 'user' }>['parts']): TTurnContent {
const textChunks: string[] = [];
const prefixChunks: string[] = []; // in-reference-to goes before body
const images: TInputPart[] = [];
for (const part of parts) {
switch (part.pt) {
case 'text':
chunks.push(part.text);
textChunks.push(part.text);
break;
case 'doc':
chunks.push(approxDocPart_To_String(part));
textChunks.push(approxDocPart_To_String(part));
break;
case 'meta_in_reference_to':
const irt = approxInReferenceTo_To_XMLString(part);
if (irt) prefixChunks.push(irt);
break;
case 'inline_image':
images.push({ type: 'image', data: part.base64, mime_type: part.mimeType });
break;
case 'meta_cache_control':
break; // unsupported here; dropped
default:
@@ -119,26 +126,64 @@ function _flattenUserParts(parts: Extract<AixAPIChatGenerate_Request['chatSequen
}
}
return [...prefixChunks, ...chunks].join('\n\n').trim();
const text = [...prefixChunks, ...textChunks].join('\n\n').trim();
// text-only turn: return string (API convenience shape)
if (!images.length) return text;
// multimodal turn: emit as content-parts array; text first, then images (matches generateContent convention)
const contentParts: TInputPart[] = [];
if (text) contentParts.push({ type: 'text', text });
contentParts.push(...images);
return contentParts;
}
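Usage sketch for `_buildUserContent` (hypothetical parts; `pt`, `base64`, and `mimeType` are the AIX field names the switch reads, and the in-reference-to XML shown is illustrative):

```ts
_buildUserContent([
  { pt: 'meta_in_reference_to' /* ... */ }, // -> XML prefix chunk
  { pt: 'text', text: 'Compare these.' },
  { pt: 'inline_image', base64: '<b64>', mimeType: 'image/png' },
] as any); // sketch: exact AIX part types elided
// -> [ { type: 'text', text: '<in-reference-to>...</in-reference-to>\n\nCompare these.' },
//      { type: 'image', data: '<b64>', mime_type: 'image/png' } ]
```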
function _flattenModelParts(parts: Extract<AixAPIChatGenerate_Request['chatSequence'][number], { role: 'model' }>['parts']): string {
const chunks: string[] = [];
function _buildModelContent(parts: Extract<AixAPIChatGenerate_Request['chatSequence'][number], { role: 'model' }>['parts']): TTurnContent {
const textChunks: string[] = [];
const images: TInputPart[] = [];
for (const part of parts) {
switch (part.pt) {
case 'text':
chunks.push(part.text);
textChunks.push(part.text);
break;
case 'inline_image':
// model-authored images (e.g. from a prior generation) - replay as context
images.push({ type: 'image', data: part.base64, mime_type: part.mimeType });
break;
case 'inline_audio':
case 'inline_image':
case 'tool_invocation':
case 'tool_response':
case 'ma': // model aux (reasoning, etc.)
case 'meta_cache_control':
break; // drop non-text model output for Deep Research replays
break; // drop non-text/image model output for Deep Research replays
default:
const _exhaustive: never = part;
}
}
return chunks.join('\n\n').trim();
const text = textChunks.join('\n\n').trim();
if (!images.length) return text;
const contentParts: TInputPart[] = [];
if (text) contentParts.push({ type: 'text', text });
contentParts.push(...images);
return contentParts;
}
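And the model-side counterpart, replaying a prior generation (sketch):

```ts
_buildModelContent([
  { pt: 'text', text: 'Here is the generated logo.' },
  { pt: 'inline_image', base64: '<base64>', mimeType: 'image/png' },
] as any); // sketch: exact AIX part types elided
// -> [ { type: 'text', text: 'Here is the generated logo.' },
//      { type: 'image', data: '<base64>', mime_type: 'image/png' } ]
```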
// -- helpers --
function _hasTurnContent(content: TTurnContent): boolean {
return content.length > 0; // same check for both shapes: non-empty string or non-empty parts array
}
function _prependSystemText(content: TTurnContent, systemPrefix: string): TTurnContent {
if (typeof content === 'string')
return `${systemPrefix}\n\n${content}`;
// multimodal: inject a text part at the front, or fold into the leading text part if present
if (content.length > 0 && content[0].type === 'text')
return [{ type: 'text', text: `${systemPrefix}\n\n${content[0].text}` }, ...content.slice(1)];
return [{ type: 'text', text: systemPrefix }, ...content];
}
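The three `_prependSystemText` branches, traced with illustrative values:

```ts
const img = { type: 'image', data: '<b64>', mime_type: 'image/png' } as const;
_prependSystemText('Question?', 'SYS');                        // -> 'SYS\n\nQuestion?'
_prependSystemText([{ type: 'text', text: 'Q' }, img], 'SYS'); // -> [{ type: 'text', text: 'SYS\n\nQ' }, img]
_prependSystemText([img], 'SYS');                              // -> [{ type: 'text', text: 'SYS' }, img]
```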
@@ -5,6 +5,7 @@ import type { ChatGenerateParseFunction } from '../chatGenerate.dispatch';
import type { IParticleTransmitter } from './IParticleTransmitter';
import { GeminiInteractionsWire_API_Interactions } from '../../wiretypes/gemini.interactions.wiretypes';
import { geminiConvertPCM2WAV } from './gemini.audioutils';
// Kill-switch: drop url_citation annotations - Deep Research ships opaque grounding-redirect URLs with no titles, and the text already contains a numbered source list.
@@ -16,7 +17,7 @@ type TUsage = NonNullable<TInteraction['usage']>;
/**
* Gemini Interactions API parser (for Deep Research agents).
* Gemini Interactions API parser (for Deep Research and future multimodal agents).
*
* Each SSE frame carries a *full* Interaction snapshot (from POST or from a GET poll).
* The parser diffs against prior state and emits only new content.
@@ -24,7 +25,11 @@ type TUsage = NonNullable<TInteraction['usage']>;
* Emission rules per output type:
* - `text` -> `pt.appendText(newSuffix)`. New url_citation annotations are emitted once.
* - `thought` -> `pt.appendReasoningText(newSuffix)`; signatures recorded via `setReasoningSignature`.
* - any other type -> ignored (Deep Research primarily emits text + thought).
* - `image` -> `pt.appendImageInline(...)` once per index (images are whole, not incremental).
* URI-only variants emit a visible note + `console.warn` (not yet wired as fetches).
* - `audio` -> PCM -> WAV via `geminiConvertPCM2WAV`, then `pt.appendAudioInline(...)` once per index.
* - unknown types -> `console.warn` + inline `_Unsupported content block: <type>_` note, once per index.
* Non-terminating: Deep Research streams are long-lived and must not blow up on new blocks.
*
* Part boundaries: when the output type at a given index changes kind (e.g. thought -> text),
* we call `endMessagePart()` so the transmitter flushes the previous part cleanly.
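The snapshot-diff core, reduced to a standalone sketch (hypothetical helper; the real parser keeps this counter per output index):

```ts
// Each SSE frame repeats the FULL text so far; emit only the unseen suffix.
function emitNewSuffix(full: string, state: { emittedTextLen: number }, emit: (s: string) => void): void {
  if (full.length <= state.emittedTextLen) return; // nothing new in this snapshot
  emit(full.slice(state.emittedTextLen));
  state.emittedTextLen = full.length;
}
```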
@@ -42,10 +47,12 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
// per-index emission state (array index in `outputs[]`)
type EmittedState = {
kind: 'text' | 'thought' | 'other';
kind: 'text' | 'thought' | 'image' | 'audio' | 'other';
emittedTextLen: number;
emittedCitationKeys: Set<string>; // `${url}@${start}-${end}` to de-dupe
signatureSent: boolean;
mediaEmitted: boolean; // image/audio: emit only once (whole, not incremental)
otherWarned: boolean; // unknown type: warn only once per index
};
const emitted: EmittedState[] = [];
let lastOpenIdx = -1; // index of the most recently opened part; -1 = none
@@ -84,17 +91,25 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
timeToFirstEvent = Date.now() - parserCreationTimestamp;
// process outputs (may be absent on early in_progress frames).
// Each raw output is classified via Zod safeParse against a discriminated union; unknown
// shapes fall through to `kind: 'other'` and are silently ignored.
// Each raw output is classified via Zod safeParse against a discriminated union.
// - Untyped/empty placeholders (`{}`, no `type` field) are skipped silently without creating
// state, so a later snapshot that populates them can classify cleanly.
// - Typed-but-unknown shapes warn once per index with a visible note (non-terminating).
const outputs = interaction.outputs ?? [];
for (let i = 0; i < outputs.length; i++) {
const classified = GeminiInteractionsWire_API_Interactions.KnownOutput_schema.safeParse(outputs[i]);
const raw = outputs[i] as { type?: unknown };
const rawType = typeof raw?.type === 'string' ? raw.type : null;
// skip not-yet-populated placeholder blocks silently (Deep Research pre-allocates slots)
if (rawType === null) continue;
const classified = GeminiInteractionsWire_API_Interactions.KnownOutput_schema.safeParse(raw);
const kind: EmittedState['kind'] = !classified.success ? 'other' : classified.data.type;
// first time we see this index: initialize + flush previous part if switching kinds
let state = emitted[i];
if (!state) {
state = { kind, emittedTextLen: 0, emittedCitationKeys: new Set(), signatureSent: false };
state = { kind, emittedTextLen: 0, emittedCitationKeys: new Set(), signatureSent: false, mediaEmitted: false, otherWarned: false };
emitted[i] = state;
// close previous part if we're opening a new index (natural part boundary)
@@ -103,7 +118,15 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
lastOpenIdx = i;
}
if (!classified.success) continue; // 'other': ignored for now
// 'other': warn once per index with visible note, then continue
if (!classified.success) {
if (!state.otherWarned) {
console.warn(`[GeminiInteractions] unsupported output type: ${rawType}`, raw);
pt.appendText(`\n_Unsupported content block: ${rawType}_\n`);
state.otherWarned = true;
}
continue;
}
const out = classified.data;
if (out.type === 'text') {
@@ -123,7 +146,7 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
pt.appendUrlCitation(ann.title || ann.url, ann.url, undefined, ann.start_index, ann.end_index, undefined, undefined);
}
}
} else /* out.type === 'thought' */ {
} else if (out.type === 'thought') {
const summary = out.summary ?? '';
if (summary.length > state.emittedTextLen) {
pt.appendReasoningText(summary.slice(state.emittedTextLen));
@@ -133,6 +156,31 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
pt.setReasoningSignature(out.signature);
state.signatureSent = true;
}
} else if (out.type === 'image') {
if (!state.mediaEmitted) {
if (out.data) {
pt.appendImageInline(out.mime_type, out.data, 'Gemini Generated Image', 'Gemini', '');
} else if (out.uri) {
// URI-hosted images aren't fetched here (yet); surface the link inline
console.warn('[GeminiInteractions] image output via URI is not yet fetched inline:', out.uri);
pt.appendText(`\n[Image: ${out.uri}]\n`);
} else {
console.warn('[GeminiInteractions] image output with neither data nor uri:', out);
pt.appendText(`\n_Image block without payload_\n`);
}
state.mediaEmitted = true;
}
} else /* out.type === 'audio' */ {
if (!state.mediaEmitted) {
try {
const wav = geminiConvertPCM2WAV(out.mime_type, out.data);
pt.appendAudioInline(wav.mimeType, wav.base64Data, 'Gemini Generated Audio', 'Gemini', wav.durationMs);
} catch (error) {
console.warn('[GeminiInteractions] audio convert failed:', error);
pt.appendText(`\n_Audio conversion failed: ${String(error)}_\n`);
}
state.mediaEmitted = true;
}
}
}
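For orientation, wrapping raw PCM in a WAV container (what a converter like `geminiConvertPCM2WAV` conceptually does; the shipped one lives in `gemini.audioutils`) is just prepending a 44-byte RIFF header. A sketch, assuming 16-bit little-endian PCM:

```ts
// Sketch only, not the shipped implementation. Assumes 16-bit LE PCM.
function pcmToWav(pcm: Uint8Array, sampleRate: number, channels = 1): Uint8Array {
  const bytesPerSample = 2;
  const header = new ArrayBuffer(44);
  const v = new DataView(header);
  const tag = (off: number, s: string) => [...s].forEach((c, i) => v.setUint8(off + i, c.charCodeAt(0)));
  tag(0, 'RIFF'); v.setUint32(4, 36 + pcm.length, true); tag(8, 'WAVE');
  tag(12, 'fmt '); v.setUint32(16, 16, true);                    // fmt chunk size
  v.setUint16(20, 1, true); v.setUint16(22, channels, true);     // PCM format, channels
  v.setUint32(24, sampleRate, true);
  v.setUint32(28, sampleRate * channels * bytesPerSample, true); // byte rate
  v.setUint16(32, channels * bytesPerSample, true);              // block align
  v.setUint16(34, 16, true);                                     // bits per sample
  tag(36, 'data'); v.setUint32(40, pcm.length, true);
  const wav = new Uint8Array(44 + pcm.length);
  wav.set(new Uint8Array(header), 0);
  wav.set(pcm, 44);
  return wav;
}
```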
@@ -26,17 +26,36 @@ export namespace GeminiInteractionsWire_API_Interactions {
// -- Request Body (POST /v1beta/interactions) --
// Multimodal content parts (used when a turn carries images/audio in addition to text).
// Single-modal text turns stay as a plain string to match the API's convenience shape.
const InputTextPart_schema = z.object({
type: z.literal('text'),
text: z.string(),
});
const InputImagePart_schema = z.object({
type: z.literal('image'),
data: z.string(), // base64-encoded bytes
mime_type: z.string(), // e.g. 'image/png', 'image/jpeg', 'image/webp'
});
export const InputContentPart_schema = z.discriminatedUnion('type', [
InputTextPart_schema,
InputImagePart_schema,
]);
// A turn in a stateless multi-turn conversation (when `input` is an array).
// Content is kept as a plain string for now; the API also accepts a list of content objects for multimodal.
export const Turn_schema = z.object({
role: z.enum(['user', 'model']),
content: z.string(),
content: z.union([
z.string(), // text-only turn (API convenience shape)
z.array(InputContentPart_schema), // multimodal turn
]),
});
export const RequestBody_schema = z.object({
agent: z.string(), // e.g. 'deep-research-pro-preview-12-2025' (note: we send bare id, without 'models/' prefix)
input: z.union([
z.string(), // single-turn convenience
z.string(), // single-turn text convenience
z.array(InputContentPart_schema), // single-turn multimodal
z.array(Turn_schema), // stateless multi-turn history
]),
background: z.literal(true), // required for agents
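An illustrative multimodal single-turn body under these schemas (agent id echoes the comment above; any required fields outside this hunk are elided):

```ts
const body = {
  agent: 'deep-research-pro-preview-12-2025', // bare id, no 'models/' prefix
  input: [
    { type: 'text', text: 'Research the history of this landmark.' },
    { type: 'image', data: '<base64>', mime_type: 'image/jpeg' },
  ],
  background: true as const,
};
```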
@@ -79,10 +98,27 @@ export namespace GeminiInteractionsWire_API_Interactions {
signature: z.string().optional(),
});
/** Discriminated union of output shapes we act on. Anything else: safeParse fails -> parser skips. */
const ImageOutput_schema = z.object({
type: z.literal('image'),
// API may return inline bytes (`data` + `mime_type`) or a URI. We accept both shapes;
// the parser prefers inline and falls back to a URI note when only `uri` is present.
data: z.string().optional(), // base64-encoded bytes
uri: z.string().optional(),
mime_type: z.string(),
});
const AudioOutput_schema = z.object({
type: z.literal('audio'),
data: z.string(), // base64-encoded bytes (Gemini serves PCM; parser converts to WAV)
mime_type: z.string(), // e.g. 'audio/L16;codec=pcm;rate=24000'
});
/** Discriminated union of output shapes we act on. Anything else: safeParse fails -> parser warns once per index. */
export const KnownOutput_schema = z.discriminatedUnion('type', [
TextOutput_schema,
ThoughtOutput_schema,
ImageOutput_schema,
AudioOutput_schema,
]);
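Classification behavior, sketched:

```ts
// Known shapes parse; anything else fails safeParse and the parser warns once per index.
KnownOutput_schema.safeParse({ type: 'image', mime_type: 'image/png', data: 'AA==' }).success; // true
KnownOutput_schema.safeParse({ type: 'video', uri: 'https://example.com/v' }).success;        // false -> 'other'
```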