mirror of
https://github.com/enricoros/big-AGI.git
synced 2026-05-10 21:50:14 -07:00
AIX: Gemini Interactions: In/Out of images, Out of Audio
This commit is contained in:
+69
-24
@@ -8,18 +8,22 @@ import { approxDocPart_To_String, approxInReferenceTo_To_XMLString, aixSpillSyst
|
||||
|
||||
type TRequestBody = z.infer<typeof GeminiInteractionsWire_API_Interactions.RequestBody_schema>;
|
||||
type TTurn = z.infer<typeof GeminiInteractionsWire_API_Interactions.Turn_schema>;
|
||||
type TTurnContent = TTurn['content']; // string | InputContentPart[]
|
||||
type TInputPart = z.infer<typeof GeminiInteractionsWire_API_Interactions.InputContentPart_schema>;
|
||||
|
||||
|
||||
/**
|
||||
* MINIMAL - Build the POST /v1beta/interactions body for Deep Research agents.
|
||||
* Build the POST /v1beta/interactions body for Deep Research (and future agents).
|
||||
*
|
||||
* Scope:
|
||||
* - Stateless multi-turn: the full `chatSequence` is flattened to role-tagged turns and sent as `input`.
|
||||
* - `systemMessage` text (if any) is prepended to the first user turn, since the Interactions API for
|
||||
* background agents does not accept a dedicated `system_instruction`.
|
||||
* - Text-only content: doc parts are rendered via `approxDocPart_To_String`; in-reference-to XML is prepended to the user turn.
|
||||
* - Model messages containing only tool invocations/responses/aux (no text) are dropped.
|
||||
* - Non-text user parts (images, audio, cache-control) are silently dropped.
|
||||
* - Stateless multi-turn: `chatSequence` is flattened to role-tagged turns and sent as `input`.
|
||||
* - `systemMessage` text (if any) is prepended to the first user turn; background agents do not
|
||||
* accept a dedicated `system_instruction`.
|
||||
* - Multimodal: user and model turns carry images as content-part arrays when any image is present,
|
||||
* otherwise stay as plain strings (preserves the API's convenience shape).
|
||||
* - Doc parts render as text via `approxDocPart_To_String`; in-reference-to XML is prepended to the user turn.
|
||||
* - Model messages containing only tool invocations/responses/aux (no text or images) are dropped.
|
||||
* - Audio and cache-control parts are silently dropped (unsupported on this path).
|
||||
*/
|
||||
export function aixToGeminiInteractionsCreate(model: AixAPI_Model, chatGenerateRaw: AixAPIChatGenerate_Request): TRequestBody {
|
||||
|
||||
@@ -33,11 +37,11 @@ export function aixToGeminiInteractionsCreate(model: AixAPI_Model, chatGenerateR
|
||||
const turns: TTurn[] = [];
|
||||
for (const msg of chatGenerate.chatSequence) {
|
||||
if (msg.role === 'user') {
|
||||
const content = _flattenUserParts(msg.parts);
|
||||
if (content) turns.push({ role: 'user', content });
|
||||
const content = _buildUserContent(msg.parts);
|
||||
if (_hasTurnContent(content)) turns.push({ role: 'user', content });
|
||||
} else if (msg.role === 'model') {
|
||||
const content = _flattenModelParts(msg.parts);
|
||||
if (content) turns.push({ role: 'model', content });
|
||||
const content = _buildModelContent(msg.parts);
|
||||
if (_hasTurnContent(content)) turns.push({ role: 'model', content });
|
||||
}
|
||||
}
|
||||
|
||||
@@ -48,14 +52,14 @@ export function aixToGeminiInteractionsCreate(model: AixAPI_Model, chatGenerateR
|
||||
if (systemPrefix) {
|
||||
const firstUserIdx = turns.findIndex(t => t.role === 'user');
|
||||
if (firstUserIdx >= 0)
|
||||
turns[firstUserIdx] = { role: 'user', content: `${systemPrefix}\n\n${turns[firstUserIdx].content}` };
|
||||
turns[firstUserIdx] = { role: 'user', content: _prependSystemText(turns[firstUserIdx].content, systemPrefix) };
|
||||
}
|
||||
|
||||
// Sanity: the API expects the last turn to be 'user' (we're asking the model to respond)
|
||||
if (turns[turns.length - 1].role !== 'user')
|
||||
throw new Error('Gemini Interactions: last turn must be from user (chat sequence ended with a model message)');
|
||||
|
||||
// Simplify single-turn to string form (matches the Python/JS SDK convenience shape)
|
||||
// Simplify single-turn to bare content form (matches the Python/JS SDK convenience shape)
|
||||
const input: TRequestBody['input'] = (turns.length === 1 && turns[0].role === 'user')
|
||||
? turns[0].content
|
||||
: turns;
|
||||
@@ -95,23 +99,26 @@ function _collectSystemText(systemMessage: AixAPIChatGenerate_Request['systemMes
|
||||
return chunks.join('\n').trim();
|
||||
}
|
||||
|
||||
function _flattenUserParts(parts: Extract<AixAPIChatGenerate_Request['chatSequence'][number], { role: 'user' }>['parts']): string {
|
||||
const chunks: string[] = [];
|
||||
function _buildUserContent(parts: Extract<AixAPIChatGenerate_Request['chatSequence'][number], { role: 'user' }>['parts']): TTurnContent {
|
||||
const textChunks: string[] = [];
|
||||
const prefixChunks: string[] = []; // in-reference-to goes before body
|
||||
const images: TInputPart[] = [];
|
||||
|
||||
for (const part of parts) {
|
||||
switch (part.pt) {
|
||||
case 'text':
|
||||
chunks.push(part.text);
|
||||
textChunks.push(part.text);
|
||||
break;
|
||||
case 'doc':
|
||||
chunks.push(approxDocPart_To_String(part));
|
||||
textChunks.push(approxDocPart_To_String(part));
|
||||
break;
|
||||
case 'meta_in_reference_to':
|
||||
const irt = approxInReferenceTo_To_XMLString(part);
|
||||
if (irt) prefixChunks.push(irt);
|
||||
break;
|
||||
case 'inline_image':
|
||||
images.push({ type: 'image', data: part.base64, mime_type: part.mimeType });
|
||||
break;
|
||||
case 'meta_cache_control':
|
||||
break; // unsupported here; dropped
|
||||
default:
|
||||
@@ -119,26 +126,64 @@ function _flattenUserParts(parts: Extract<AixAPIChatGenerate_Request['chatSequen
|
||||
}
|
||||
}
|
||||
|
||||
return [...prefixChunks, ...chunks].join('\n\n').trim();
|
||||
const text = [...prefixChunks, ...textChunks].join('\n\n').trim();
|
||||
|
||||
// text-only turn: return string (API convenience shape)
|
||||
if (!images.length) return text;
|
||||
|
||||
// multimodal turn: emit as content-parts array; text first, then images (matches generateContent convention)
|
||||
const contentParts: TInputPart[] = [];
|
||||
if (text) contentParts.push({ type: 'text', text });
|
||||
contentParts.push(...images);
|
||||
return contentParts;
|
||||
}
|
||||
|
||||
function _flattenModelParts(parts: Extract<AixAPIChatGenerate_Request['chatSequence'][number], { role: 'model' }>['parts']): string {
|
||||
const chunks: string[] = [];
|
||||
function _buildModelContent(parts: Extract<AixAPIChatGenerate_Request['chatSequence'][number], { role: 'model' }>['parts']): TTurnContent {
|
||||
const textChunks: string[] = [];
|
||||
const images: TInputPart[] = [];
|
||||
|
||||
for (const part of parts) {
|
||||
switch (part.pt) {
|
||||
case 'text':
|
||||
chunks.push(part.text);
|
||||
textChunks.push(part.text);
|
||||
break;
|
||||
case 'inline_image':
|
||||
// model-authored images (e.g. from a prior generation) - replay as context
|
||||
images.push({ type: 'image', data: part.base64, mime_type: part.mimeType });
|
||||
break;
|
||||
case 'inline_audio':
|
||||
case 'inline_image':
|
||||
case 'tool_invocation':
|
||||
case 'tool_response':
|
||||
case 'ma': // model aux (reasoning, etc.)
|
||||
case 'meta_cache_control':
|
||||
break; // drop non-text model output for Deep Research replays
|
||||
break; // drop non-text/image model output for Deep Research replays
|
||||
default:
|
||||
const _exhaustive: never = part;
|
||||
}
|
||||
}
|
||||
return chunks.join('\n\n').trim();
|
||||
|
||||
const text = textChunks.join('\n\n').trim();
|
||||
|
||||
if (!images.length) return text;
|
||||
|
||||
const contentParts: TInputPart[] = [];
|
||||
if (text) contentParts.push({ type: 'text', text });
|
||||
contentParts.push(...images);
|
||||
return contentParts;
|
||||
}
|
||||
|
||||
|
||||
// -- helpers --
|
||||
|
||||
/**
 * Reports whether a turn carries anything worth sending.
 *
 * @param content - turn content: either a plain string or an array of content parts
 * @returns `true` when the string is non-empty, or when the array holds at least one part
 */
function _hasTurnContent(content: TTurnContent): boolean {
  // strings and arrays both expose `.length`, so there is no need to branch on the shape
  return content.length > 0;
}
|
||||
|
||||
/**
 * Prepends the system text to a turn's content while preserving the content's shape.
 *
 * - String content: returns `systemPrefix`, a blank line, then the original string.
 * - Array content whose first part is text: returns a new array where that leading
 *   text part is replaced by one carrying the prefix, a blank line, and the original text.
 * - Any other array (including an empty one): returns a new array with a text part
 *   holding `systemPrefix` inserted at the front.
 *
 * The input array is never mutated; a new array is always returned.
 *
 * @param content - turn content: a plain string or an array of content parts
 * @param systemPrefix - the system text to place ahead of the turn's own content
 * @returns content of the same shape (string in, string out; array in, array out)
 */
function _prependSystemText(content: TTurnContent, systemPrefix: string): TTurnContent {
  if (typeof content === 'string')
    return `${systemPrefix}\n\n${content}`;

  // multimodal: fold into the leading text part if present, otherwise inject a text part at the front.
  // Destructuring once keeps the access safe under `noUncheckedIndexedAccess` (head may be undefined).
  const [head, ...rest] = content;
  if (head?.type === 'text')
    return [{ type: 'text', text: `${systemPrefix}\n\n${head.text}` }, ...rest];

  return [{ type: 'text', text: systemPrefix }, ...content];
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import type { ChatGenerateParseFunction } from '../chatGenerate.dispatch';
|
||||
import type { IParticleTransmitter } from './IParticleTransmitter';
|
||||
|
||||
import { GeminiInteractionsWire_API_Interactions } from '../../wiretypes/gemini.interactions.wiretypes';
|
||||
import { geminiConvertPCM2WAV } from './gemini.audioutils';
|
||||
|
||||
|
||||
// Kill-switch: drop url_citation annotations - Deep Research ships opaque grounding-redirect URLs with no titles, and the text already contains a numbered source list.
|
||||
@@ -16,7 +17,7 @@ type TUsage = NonNullable<TInteraction['usage']>;
|
||||
|
||||
|
||||
/**
|
||||
* Gemini Interactions API parser (for Deep Research agents).
|
||||
* Gemini Interactions API parser (for Deep Research and future multimodal agents).
|
||||
*
|
||||
* Each SSE frame carries a *full* Interaction snapshot (from POST or from a GET poll).
|
||||
* The parser diffs against prior state and emits only new content.
|
||||
@@ -24,7 +25,11 @@ type TUsage = NonNullable<TInteraction['usage']>;
|
||||
* Emission rules per output type:
|
||||
* - `text` -> `pt.appendText(newSuffix)`. New url_citation annotations are emitted once.
|
||||
* - `thought` -> `pt.appendReasoningText(newSuffix)`; signatures recorded via `setReasoningSignature`.
|
||||
* - any other type -> ignored (Deep Research primarily emits text + thought).
|
||||
* - `image` -> `pt.appendImageInline(...)` once per index (images are whole, not incremental).
|
||||
* URI-only variants emit a visible note + `console.warn` (not yet wired as fetches).
|
||||
* - `audio` -> PCM -> WAV via `geminiConvertPCM2WAV`, then `pt.appendAudioInline(...)` once per index.
|
||||
* - unknown types -> `console.warn` + inline `_Unsupported content block: <type>_` note, once per index.
|
||||
* Non-terminating: Deep Research streams are long-lived and must not blow up on new blocks.
|
||||
*
|
||||
* Part boundaries: when the output type at a given index changes kind (e.g. thought -> text),
|
||||
* we call `endMessagePart()` so the transmitter flushes the previous part cleanly.
|
||||
@@ -42,10 +47,12 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
|
||||
|
||||
// per-index emission state (array index in `outputs[]`)
|
||||
type EmittedState = {
|
||||
kind: 'text' | 'thought' | 'other';
|
||||
kind: 'text' | 'thought' | 'image' | 'audio' | 'other';
|
||||
emittedTextLen: number;
|
||||
emittedCitationKeys: Set<string>; // `${url}@${start}-${end}` to de-dupe
|
||||
signatureSent: boolean;
|
||||
mediaEmitted: boolean; // image/audio: emit only once (whole, not incremental)
|
||||
otherWarned: boolean; // unknown type: warn only once per index
|
||||
};
|
||||
const emitted: EmittedState[] = [];
|
||||
let lastOpenIdx = -1; // index of the most recently opened part; -1 = none
|
||||
@@ -84,17 +91,25 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
|
||||
timeToFirstEvent = Date.now() - parserCreationTimestamp;
|
||||
|
||||
// process outputs (may be absent on early in_progress frames).
|
||||
// Each raw output is classified via Zod safeParse against a discriminated union; unknown
|
||||
// shapes fall through to `kind: 'other'` and are silently ignored.
|
||||
// Each raw output is classified via Zod safeParse against a discriminated union.
|
||||
// - Untyped/empty placeholders (`{}`, no `type` field) are skipped silently without creating
|
||||
// state, so a later snapshot that populates them can classify cleanly.
|
||||
// - Typed-but-unknown shapes warn once per index with a visible note (non-terminating).
|
||||
const outputs = interaction.outputs ?? [];
|
||||
for (let i = 0; i < outputs.length; i++) {
|
||||
const classified = GeminiInteractionsWire_API_Interactions.KnownOutput_schema.safeParse(outputs[i]);
|
||||
const raw = outputs[i] as { type?: unknown };
|
||||
const rawType = typeof raw?.type === 'string' ? raw.type : null;
|
||||
|
||||
// skip not-yet-populated placeholder blocks silently (Deep Research pre-allocates slots)
|
||||
if (rawType === null) continue;
|
||||
|
||||
const classified = GeminiInteractionsWire_API_Interactions.KnownOutput_schema.safeParse(raw);
|
||||
const kind: EmittedState['kind'] = !classified.success ? 'other' : classified.data.type;
|
||||
|
||||
// first time we see this index: initialize + flush previous part if switching kinds
|
||||
let state = emitted[i];
|
||||
if (!state) {
|
||||
state = { kind, emittedTextLen: 0, emittedCitationKeys: new Set(), signatureSent: false };
|
||||
state = { kind, emittedTextLen: 0, emittedCitationKeys: new Set(), signatureSent: false, mediaEmitted: false, otherWarned: false };
|
||||
emitted[i] = state;
|
||||
|
||||
// close previous part if we're opening a new index (natural part boundary)
|
||||
@@ -103,7 +118,15 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
|
||||
lastOpenIdx = i;
|
||||
}
|
||||
|
||||
if (!classified.success) continue; // 'other': ignored for now
|
||||
// 'other': warn once per index with visible note, then continue
|
||||
if (!classified.success) {
|
||||
if (!state.otherWarned) {
|
||||
console.warn(`[GeminiInteractions] unsupported output type: ${rawType}`, raw);
|
||||
pt.appendText(`\n_Unsupported content block: ${rawType}_\n`);
|
||||
state.otherWarned = true;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
const out = classified.data;
|
||||
if (out.type === 'text') {
|
||||
@@ -123,7 +146,7 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
|
||||
pt.appendUrlCitation(ann.title || ann.url, ann.url, undefined, ann.start_index, ann.end_index, undefined, undefined);
|
||||
}
|
||||
}
|
||||
} else /* out.type === 'thought' */ {
|
||||
} else if (out.type === 'thought') {
|
||||
const summary = out.summary ?? '';
|
||||
if (summary.length > state.emittedTextLen) {
|
||||
pt.appendReasoningText(summary.slice(state.emittedTextLen));
|
||||
@@ -133,6 +156,31 @@ export function createGeminiInteractionsParser(requestedModelName: string | null
|
||||
pt.setReasoningSignature(out.signature);
|
||||
state.signatureSent = true;
|
||||
}
|
||||
} else if (out.type === 'image') {
|
||||
if (!state.mediaEmitted) {
|
||||
if (out.data) {
|
||||
pt.appendImageInline(out.mime_type, out.data, 'Gemini Generated Image', 'Gemini', '');
|
||||
} else if (out.uri) {
|
||||
// URI-hosted images aren't fetched here (yet); surface the link inline
|
||||
console.warn('[GeminiInteractions] image output via URI is not yet fetched inline:', out.uri);
|
||||
pt.appendText(`\n[Image: ${out.uri}]\n`);
|
||||
} else {
|
||||
console.warn('[GeminiInteractions] image output with neither data nor uri:', out);
|
||||
pt.appendText(`\n_Image block without payload_\n`);
|
||||
}
|
||||
state.mediaEmitted = true;
|
||||
}
|
||||
} else /* out.type === 'audio' */ {
|
||||
if (!state.mediaEmitted) {
|
||||
try {
|
||||
const wav = geminiConvertPCM2WAV(out.mime_type, out.data);
|
||||
pt.appendAudioInline(wav.mimeType, wav.base64Data, 'Gemini Generated Audio', 'Gemini', wav.durationMs);
|
||||
} catch (error) {
|
||||
console.warn('[GeminiInteractions] audio convert failed:', error);
|
||||
pt.appendText(`\n_Audio conversion failed: ${String(error)}_\n`);
|
||||
}
|
||||
state.mediaEmitted = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -26,17 +26,36 @@ export namespace GeminiInteractionsWire_API_Interactions {
|
||||
|
||||
// -- Request Body (POST /v1beta/interactions) --
|
||||
|
||||
// Multimodal content parts (used when a turn carries images/audio in addition to text).
|
||||
// Single-modal text turns stay as a plain string to match the API's convenience shape.
|
||||
// Text content part of a multimodal turn: discriminated by `type: 'text'`, carries the text in `text`.
const InputTextPart_schema = z.object({
|
||||
type: z.literal('text'),
|
||||
text: z.string(),
|
||||
});
|
||||
// Image content part of a multimodal turn: discriminated by `type: 'image'`.
// Both `data` and `mime_type` are required strings; there is no URI variant on the input side.
const InputImagePart_schema = z.object({
|
||||
type: z.literal('image'),
|
||||
data: z.string(), // base64-encoded bytes
|
||||
mime_type: z.string(), // e.g. 'image/png', 'image/jpeg', 'image/webp'
|
||||
});
|
||||
// Union of the content parts a turn may carry, discriminated on the `type` field ('text' | 'image').
export const InputContentPart_schema = z.discriminatedUnion('type', [
|
||||
InputTextPart_schema,
|
||||
InputImagePart_schema,
|
||||
]);
|
||||
|
||||
// A turn in a stateless multi-turn conversation (when `input` is an array).
|
||||
// Content is kept as a plain string for now; the API also accepts a list of content objects for multimodal.
|
||||
export const Turn_schema = z.object({
|
||||
role: z.enum(['user', 'model']),
|
||||
content: z.string(),
|
||||
content: z.union([
|
||||
z.string(), // text-only turn (API convenience shape)
|
||||
z.array(InputContentPart_schema), // multimodal turn
|
||||
]),
|
||||
});
|
||||
|
||||
export const RequestBody_schema = z.object({
|
||||
agent: z.string(), // e.g. 'deep-research-pro-preview-12-2025' (note: we send bare id, without 'models/' prefix)
|
||||
input: z.union([
|
||||
z.string(), // single-turn convenience
|
||||
z.string(), // single-turn text convenience
|
||||
z.array(InputContentPart_schema), // single-turn multimodal
|
||||
z.array(Turn_schema), // stateless multi-turn history
|
||||
]),
|
||||
background: z.literal(true), // required for agents
|
||||
@@ -79,10 +98,27 @@ export namespace GeminiInteractionsWire_API_Interactions {
|
||||
signature: z.string().optional(),
|
||||
});
|
||||
|
||||
/** Discriminated union of output shapes we act on. Anything else: safeParse fails -> parser skips. */
|
||||
// Image output block, discriminated by `type: 'image'`.
// `data` and `uri` are each optional, so a block carrying neither still validates;
// the parser handles that case by emitting a visible "without payload" note.
// `mime_type` is required in every variant.
const ImageOutput_schema = z.object({
|
||||
type: z.literal('image'),
|
||||
// API may return inline bytes (`data` + `mime_type`) or a URI. We accept both shapes;
|
||||
// the parser prefers inline and falls back to a URI note when only `uri` is present.
|
||||
data: z.string().optional(), // base64-encoded bytes
|
||||
uri: z.string().optional(),
|
||||
mime_type: z.string(),
|
||||
});
|
||||
|
||||
// Audio output block, discriminated by `type: 'audio'`.
// Unlike the image output, `data` is required here (no URI variant); `mime_type` is passed
// along with `data` to the PCM-to-WAV conversion in the parser.
const AudioOutput_schema = z.object({
|
||||
type: z.literal('audio'),
|
||||
data: z.string(), // base64-encoded bytes (Gemini serves PCM; parser converts to WAV)
|
||||
mime_type: z.string(), // e.g. 'audio/L16;codec=pcm;rate=24000'
|
||||
});
|
||||
|
||||
/** Discriminated union of output shapes we act on. Anything else: safeParse fails -> parser warns once per index. */
|
||||
// Union of the output blocks the parser acts on, discriminated on `type`
// ('text' | 'thought' | 'image' | 'audio'). The parser runs `safeParse` against this
// schema per output; a failed parse is classified as kind 'other'.
export const KnownOutput_schema = z.discriminatedUnion('type', [
|
||||
TextOutput_schema,
|
||||
|
||||
ThoughtOutput_schema,
|
||||
ImageOutput_schema,
|
||||
|
||||
AudioOutput_schema,
|
||||
]);
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user