// Source: big-agi/src/modules/aix/server/dispatch/chatGenerate/adapters/gemini.generateContent.ts
// (TypeScript - 543 lines, 22 KiB according to the original file listing)

import type { AixAPI_Model, AixAPIChatGenerate_Request, AixMessages_ChatMessage, AixParts_DocPart, AixTools_ToolDefinition, AixTools_ToolsPolicy } from '../../../api/aix.wiretypes';
import { GeminiWire_API_Generate_Content, GeminiWire_ContentParts, GeminiWire_Messages, GeminiWire_Safety, GeminiWire_ToolDeclarations } from '../../wiretypes/gemini.wiretypes';
import { aixSpillSystemToUser, approxDocPart_To_String, approxInReferenceTo_To_XMLString } from './adapters.common';
// configuration

// When true, the 'inline_image' parts of each message are ordered before all its other parts
// (Gemini recommends placing the text prompt after the image).
const hotFixImagePartsFirst = true; // https://ai.google.dev/gemini-api/docs/image-understanding#tips-best-practices

// When true, a message without parts is sent with a single empty text part instead of no parts
const hotFixReplaceEmptyMessagesWithEmptyTextPart = true;

// [Gemini 3, 2025-11-20] Bypass dummy thoughtSignature for Gemini 3+ validation
// Used as the thoughtSignature value for parts that require a signature but do not carry one.
// https://ai.google.dev/gemini-api/docs/thought-signatures
const GEMINI_BYPASS_THOUGHT_SIGNATURE = 'context_engineering_is_the_way_to_go';
/**
 * Converts an AIX chat-generate request into a Gemini `generateContent` request payload.
 *
 * The payload is assembled in stages - system instruction, contents, generation config,
 * vendor-specific options, custom and hosted tools - and is finally validated against
 * `GeminiWire_API_Generate_Content.Request_schema` before being returned.
 *
 * @param model - model settings, including the `vndGemini*` vendor options read below
 * @param _chatGenerate - the AIX request; it is passed through `aixSpillSystemToUser` before use
 * @param geminiSafetyThreshold - threshold applied to every safety category (no settings when unspecified)
 * @param jsonOutput - requests 'application/json' responses; also enabled by `model.strictJsonOutput`
 * @param _streaming - ignored here, as the streaming setting only belongs in the request path
 * @returns the schema-validated request payload
 * @throws Error when the system message contains an image or an unsupported part, when the
 *         conversion of the chat sequence fails, or when the assembled payload fails validation
 */
export function aixToGeminiGenerateContent(model: AixAPI_Model, _chatGenerate: AixAPIChatGenerate_Request, geminiSafetyThreshold: GeminiWire_Safety.HarmBlockThreshold, jsonOutput: boolean, _streaming: boolean): TRequest {

  // Hotfixes - reduce these to the minimum, as they shall be higher-level resolved
  const isFamilyNanoBanana = ['nano-banana', 'gemini-3-pro-image-preview'].some(tag => model.id.includes(tag));
  const api3RequiresSignatures = isFamilyNanoBanana;

  // Pre-process CGR - approximate spill of System to User message
  const chatGenerate = aixSpillSystemToUser(_chatGenerate);

  // --- System Instructions ---
  let systemInstruction: TRequest['systemInstruction'] = undefined;
  const systemParts = chatGenerate.systemMessage?.parts;
  if (systemParts?.length) {
    const instruction = { parts: [] } as Exclude<TRequest['systemInstruction'], undefined>;
    for (const part of systemParts) {
      switch (part.pt) {
        case 'text':
          instruction.parts.push(GeminiWire_ContentParts.TextPart(part.text));
          break;

        case 'doc':
          instruction.parts.push(GeminiWire_ContentParts.TextPart(approxDocPart_To_String(part)));
          break;

        case 'inline_image':
          // image parts are expected to have been moved out of the system message already
          throw new Error('Gemini: images have to be in user messages, not in system message');

        case 'meta_cache_control':
          // breakpoint hint, Anthropic only - nothing to emit
          break;

        default: {
          const _exhaustiveCheck: never = part;
          throw new Error(`Unsupported part type in System message: ${(part as any).pt}`);
        }
      }
    }
    // keep the system instruction only if something was emitted
    if (instruction.parts.length)
      systemInstruction = instruction;
  }

  // --- Chat Messages ---
  const contents: TRequest['contents'] = _toGeminiContents(chatGenerate.chatSequence, api3RequiresSignatures);

  // --- Generation config ---
  // constrained output modes - only JSON (not tool invocations for now)
  const jsonOutputEnabled = !!model.strictJsonOutput || jsonOutput;
  const jsonOutputSchema = model.strictJsonOutput?.schema;

  const generationConfig: NonNullable<TRequest['generationConfig']> = {
    stopSequences: undefined, // (default, optional)
    responseMimeType: jsonOutputEnabled ? 'application/json' : undefined,
    responseSchema: jsonOutputSchema,
    candidateCount: undefined, // (default, optional)
    maxOutputTokens: model.maxTokens,
    // a null temperature leaves the key out entirely; undefined keeps the key with an undefined value
    ...(model.temperature === null ? {} : { temperature: model.temperature }),
    topP: undefined, // (default, optional)
    topK: undefined, // (default, optional)
  };

  const payload: TRequest = {
    contents,
    safetySettings: _toGeminiSafetySettings(geminiSafetyThreshold),
    systemInstruction,
    generationConfig,
  };

  // Top-P instead of temperature
  if (model.topP !== undefined) {
    delete generationConfig.temperature;
    generationConfig.topP = model.topP;
  }

  // Thinking models: thinking budget and show thoughts
  const thinkingLevel = model.vndGeminiThinkingLevel;
  const thinkingBudget = model.vndGeminiThinkingBudget;
  if (model.vndGeminiShowThoughts === true || thinkingBudget !== undefined || thinkingLevel) {
    const thinkingConfig: Exclude<TRequest['generationConfig'], undefined>['thinkingConfig'] = {};

    // thought 'summaries' are requested in most cases where thinking is requested
    const wantsThoughts = model.vndGeminiShowThoughts || (thinkingBudget ?? 0) > 1 || thinkingLevel === 'high' || thinkingLevel === 'medium';
    if (wantsThoughts)
      thinkingConfig.includeThoughts = true;

    if (thinkingLevel) {
      // [Gemini 3, 2025-11-18] Thinking Level: never sent together with thinkingBudget (400 error)
      // FIXME: remove this mapping once the 'medium' level is supported upstream
      thinkingConfig.thinkingLevel = thinkingLevel === 'medium' ? 'high' : thinkingLevel;
    } else if (thinkingBudget !== undefined) {
      // [Gemini 2.x] Thinking Budget (0 disables thinking explicitly)
      if (thinkingBudget > 0)
        thinkingConfig.includeThoughts = true;
      thinkingConfig.thinkingBudget = thinkingBudget;
    }

    generationConfig.thinkingConfig = thinkingConfig;
  }

  // [Gemini, 2025-11-18] Media Resolution: controls vision processing quality
  if (model.vndGeminiMediaResolution) {
    const mediaResolutionValuesMap = {
      'mr_low': 'MEDIA_RESOLUTION_LOW',
      'mr_medium': 'MEDIA_RESOLUTION_MEDIUM',
      'mr_high': 'MEDIA_RESOLUTION_HIGH',
    } as const;
    generationConfig.mediaResolution = mediaResolutionValuesMap[model.vndGeminiMediaResolution];
  }

  // [Gemini, 2025-10-02] [Gemini, 2025-11-20] Image generation: aspect ratio and size configuration
  if (model.vndGeminiAspectRatio || model.vndGeminiImageSize) {
    generationConfig.imageConfig = {
      ...(model.vndGeminiAspectRatio ? { aspectRatio: model.vndGeminiAspectRatio } : {}),
      ...(model.vndGeminiImageSize ? { imageSize: model.vndGeminiImageSize } : {}),
    };
  }

  // Output modalities
  const producesAudio = model.acceptsOutputs.includes('audio');
  const producesImage = model.acceptsOutputs.includes('image');
  const noTextOutput = !model.acceptsOutputs.includes('text');
  if (producesAudio) {
    // [Gemini, 2025-05-20] Experimental Audio generation (TTS) - (undocumented) request adaptations
    delete payload.systemInstruction;
    delete generationConfig.maxOutputTokens; // maxOutputTokens is not supported for audio-only output
    generationConfig.temperature = 1;

    // activate audio (/only) output
    generationConfig.responseModalities = noTextOutput ? ['AUDIO'] : ['TEXT', 'AUDIO'];

    // default voice config - list here: https://ai.google.dev/gemini-api/docs/speech-generation#voices
    generationConfig.speechConfig = {
      voiceConfig: {
        prebuiltVoiceConfig: {
          voiceName: 'Zephyr',
        },
      },
    };
  } else if (producesImage) {
    // [Gemini, 2025-03-14] Experimental Image generation
    // 2025-03-14: both APIs v1alpha and v1beta do not support specifying the resolution
    generationConfig.responseModalities = noTextOutput ? ['IMAGE'] : ['TEXT', 'IMAGE'];
  }

  // --- Tools ---

  // Hosted tools are not auto-added when custom function tools come with a restrictive policy
  // FIXME: re-evaluate in the future whether this shall be on higher information levels (callers)
  const hasCustomTools = chatGenerate.tools?.some(t => t.type === 'function_call');
  const hasRestrictivePolicy = chatGenerate.toolsPolicy?.type === 'any' || chatGenerate.toolsPolicy?.type === 'function_call';
  const hostedToolsAllowed = !(hasCustomTools && hasRestrictivePolicy);

  // Function Calls (Custom Tools)
  if (chatGenerate.tools) {
    payload.tools = _toGeminiTools(chatGenerate.tools);
    if (chatGenerate.toolsPolicy)
      payload.toolConfig = _toGeminiToolConfig(chatGenerate.toolsPolicy);
  }

  // Hosted tools: each one is appended to the tools array, which is created on first use
  type THostedTool = NonNullable<TRequest['tools']>[number];
  const addHostedTool = (tool: THostedTool): void => {
    if (!payload.tools) payload.tools = [];
    payload.tools.push(tool);
  };

  // [Gemini, 2025-11-18] Code Execution (empty configuration object)
  if (model.vndGeminiCodeExecution === 'auto' && hostedToolsAllowed)
    addHostedTool({ codeExecution: {} });

  // [Gemini, 2025-11-01] Computer Use: every configured value maps to the browser environment
  if (model.vndGeminiComputerUse && hostedToolsAllowed)
    addHostedTool({ computerUse: { environment: 'ENVIRONMENT_BROWSER' } });

  // [Gemini, 2025-10-13] Google Search Grounding
  if (model.vndGeminiGoogleSearch && hostedToolsAllowed)
    addHostedTool({ googleSearch: _buildGoogleSearchConfig(model.vndGeminiGoogleSearch) });

  // [Gemini, 2025-08-18] URL Context (empty configuration object) - not for the Nano Banana family
  if (model.vndGeminiUrlContext === 'auto' && !isFamilyNanoBanana && hostedToolsAllowed)
    addHostedTool({ urlContext: {} });

  // Preemptive error detection with server-side payload validation before sending it upstream
  const validated = GeminiWire_API_Generate_Content.Request_schema.safeParse(payload);
  if (validated.success)
    return validated.data;

  console.warn('Gemini: invalid generateContent payload. Error:', validated.error.message);
  throw new Error(`Invalid sequence for Gemini models: ${validated.error.issues?.[0]?.message || validated.error.message || validated.error}.`);
}
// Local shorthand for the Gemini generateContent request wire type, used by the signatures in this file
type TRequest = GeminiWire_API_Generate_Content.Request;
/**
 * Converts the AIX chat sequence into Gemini `Content` objects, one per message.
 *
 * - messages with role 'model' keep that role, every other role is sent as 'user'
 * - inline images are ordered before the other parts of a message (when the hotfix is enabled)
 * - a message without parts becomes a single empty text part (when the hotfix is enabled)
 * - 'ma' and 'meta_cache_control' parts are dropped; docs and in-reference-to parts become text
 *
 * The input messages are not modified.
 *
 * @param chatSequence - the AIX messages to convert
 * @param apiRequiresSignatures - when true, model text parts and inline data parts that carry no
 *        thoughtSignature receive the bypass signature
 * @returns the Gemini contents, in the same order as the input messages
 * @throws Error on unsupported part / tool types, or when a function result is a JSON array
 */
function _toGeminiContents(chatSequence: AixMessages_ChatMessage[], apiRequiresSignatures: boolean): GeminiWire_Messages.Content[] {

  // Remove messages that are made of empty parts
  // if (hotFixRemoveEmptyMessages)
  //   chatSequence = chatSequence.filter(message => message.parts.length > 0);

  return chatSequence.map(message => {
    const parts: GeminiWire_ContentParts.ContentPart[] = [];

    // https://ai.google.dev/gemini-api/docs/image-understanding#tips-best-practices
    // "When using a single image with text, place the text prompt after the image part in the contents array."
    // Sort a copy: sorting message.parts in place would reorder the caller's message.
    let messageParts = message.parts;
    if (hotFixImagePartsFirst) {
      messageParts = [...message.parts].sort((a, b) => {
        if (a.pt === 'inline_image' && b.pt !== 'inline_image') return -1;
        if (a.pt !== 'inline_image' && b.pt === 'inline_image') return 1;
        return 0;
      });
    }

    /* Semantically we want to preserve an empty assistant response, but Gemini requires
     * at least one part for a `Content` object, so the empty message becomes a "" instead.
     * E.g. { role: 'rolename', parts: [{text: ''}] }
     */
    if (hotFixReplaceEmptyMessagesWithEmptyTextPart && message.parts.length === 0)
      parts.push(GeminiWire_ContentParts.TextPart(''));

    for (const part of messageParts) {
      let partRequiresSignature = false;

      switch (part.pt) {

        case 'text':
          parts.push(GeminiWire_ContentParts.TextPart(part.text));
          // [Gemini, 2025-11-20] Nano Banana Pro requires thoughtSignature on model text parts
          if (apiRequiresSignatures && message.role === 'model')
            partRequiresSignature = true;
          break;

        case 'inline_audio':
        case 'inline_image':
          parts.push(GeminiWire_ContentParts.InlineDataPart(part.mimeType, part.base64));
          if (apiRequiresSignatures)
            partRequiresSignature = true;
          break;

        case 'doc':
          parts.push(_toApproximateGeminiDocPart(part));
          break;

        case 'ma':
          // ignore this thinking block - Anthropic only
          break;

        case 'meta_cache_control':
          // ignore this breakpoint hint - Anthropic only
          break;

        case 'meta_in_reference_to': {
          const irtXMLString = approxInReferenceTo_To_XMLString(part);
          if (irtXMLString)
            parts.push(GeminiWire_ContentParts.TextPart(irtXMLString));
          break;
        }

        case 'tool_invocation': {
          const invocation = part.invocation;
          switch (invocation.type) {

            case 'function_call': {
              let functionCallArgs: Record<string, any> | undefined;
              if (invocation.args) {
                // TODO: migrate to JSON | objects across all providers
                // noinspection SuspiciousTypeOfGuard - reason: above
                if (typeof invocation.args === 'string') {
                  try {
                    functionCallArgs = JSON.parse(invocation.args);
                  } catch (e) {
                    console.warn('Gemini: failed to parse (string -> JSON) function call arguments', e);
                    functionCallArgs = { output: invocation.args };
                  }
                } else {
                  functionCallArgs = invocation.args;
                }
              }
              parts.push(GeminiWire_ContentParts.FunctionCallPart(invocation.name, functionCallArgs));
              break;
            }

            case 'code_execution':
              if (invocation.language?.toLowerCase() !== 'python')
                console.warn('Gemini only supports Python code execution, but got:', invocation.language);
              parts.push(GeminiWire_ContentParts.ExecutableCodePart('PYTHON', invocation.code));
              break;

            default: {
              const _exhaustiveCheck: never = invocation;
              // the type is read from the invocation: parts have no 'call' property, and reading
              // through it raised a TypeError instead of this message
              throw new Error(`Unsupported tool call type in message: ${(invocation as any).type}`);
            }
          }
          break;
        }

        case 'tool_response': {
          const toolErrorPrefix = part.error ? (typeof part.error === 'string' ? `[ERROR] ${part.error} - ` : '[ERROR] ') : '';
          switch (part.response.type) {

            case 'function_call': {
              let functionResponseResponse: Record<string, any> | undefined;
              if (part.response.result) {
                // TODO: migrate function call results to JSON | objects across all providers
                // noinspection SuspiciousTypeOfGuard
                if (typeof part.response.result === 'string') {
                  try {
                    functionResponseResponse = JSON.parse(part.response.result);
                  } catch (e) {
                    console.warn('Gemini: failed to parse (string -> JSON) function response result', e);
                    functionResponseResponse = { output: toolErrorPrefix + part.response.result };
                  }
                  if (Array.isArray(functionResponseResponse)) {
                    console.warn('toGeminiContents: Gemini requires results of function calls to be objects', { result: functionResponseResponse });
                    throw new Error('Gemini: unexpected array as function response');
                  }
                  // A string such as '42', 'true' or 'null' is valid JSON but not an object, while the
                  // function response must be an object: wrap it like the parse-failure fallback above
                  if (functionResponseResponse === null || typeof functionResponseResponse !== 'object')
                    functionResponseResponse = { output: toolErrorPrefix + part.response.result };
                } else {
                  functionResponseResponse = part.response.result;
                }
              }
              parts.push(GeminiWire_ContentParts.FunctionResponsePart(part.response._name || part.id, functionResponseResponse));
              break;
            }

            case 'code_execution':
              parts.push(GeminiWire_ContentParts.CodeExecutionResultPart(!part.error ? 'OUTCOME_OK' : 'OUTCOME_FAILED', toolErrorPrefix + part.response.result));
              break;

            default: {
              const _exhaustiveCheck: never = part.response;
              throw new Error(`Unsupported tool response type in message: ${(part as any).response.type}`);
            }
          }
          break;
        }

        default: {
          const _exhaustiveCheck: never = part;
          throw new Error(`Unsupported part type in Chat message: ${(part as any).pt}`);
        }
      }

      // Thought signature: applied to the most recently emitted Gemini part.
      // NOTE(review): if this AIX part emitted nothing (e.g. 'ma', 'meta_cache_control') but carries a
      // signature, the signature lands on the part emitted by a previous AIX part - confirm this is intended.
      if (parts.length) {
        const tsTarget = parts[parts.length - 1];

        if ('_vnd' in part && part._vnd?.gemini?.thoughtSignature) {
          // carry over the signature stored on the part
          tsTarget.thoughtSignature = part._vnd.gemini.thoughtSignature;
        } else if (partRequiresSignature) {
          // required for this part type but missing: apply the bypass dummy and log it
          tsTarget.thoughtSignature = GEMINI_BYPASS_THOUGHT_SIGNATURE;
          // [Gemini 3, 2025-11-20] Cross-provider or edited content warning
          console.log(`[Gemini 3] ${part.pt} missing thoughtSignature - bypass applied`);
        }
      }
    }

    return {
      role: message.role === 'model' ? 'model' : 'user',
      parts,
    };
  });
}
/**
 * Converts AIX tool definitions into Gemini tool declarations.
 *
 * Consecutive 'function_call' definitions are coalesced into a single tool holding several
 * `functionDeclarations`; a 'code_execution' definition becomes its own `codeExecution` tool.
 *
 * @param itds - the AIX tool definitions
 * @returns the Gemini tools (an empty array when no definitions are given)
 * @throws Error for a non-inline code execution variant, for a second code execution tool, or
 *         for a tool type that is not handled here
 */
function _toGeminiTools(itds: AixTools_ToolDefinition[]): NonNullable<TRequest['tools']> {
  const tools: NonNullable<TRequest['tools']> = [];

  for (const itd of itds) {
    switch (itd.type) {

      case 'function_call': {
        const { name, description, input_schema } = itd.function_call;

        // create the function declaration
        const functionDeclaration: GeminiWire_ToolDeclarations.FunctionDeclaration = {
          name,
          description,
        };

        // no-params functions (no input_schema, or empty properties) are declared without 'parameters'
        if (input_schema?.properties && Object.keys(input_schema.properties).length) {
          functionDeclaration.parameters = {
            type: 'object',
            properties: input_schema.properties,
            required: input_schema.required,
          };
        }

        // coalesce the function declaration into the last tool, if it already holds declarations
        const lastTool = tools[tools.length - 1];
        if (lastTool && 'functionDeclarations' in lastTool && lastTool.functionDeclarations?.length) {
          lastTool.functionDeclarations.push(functionDeclaration);
          break;
        }

        // otherwise start a new tool with this function declaration
        tools.push({
          functionDeclarations: [functionDeclaration],
        });
        break;
      }

      case 'code_execution':
        if (itd.variant !== 'gemini_auto_inline')
          throw new Error('Gemini only supports inline code execution');

        // throw if code execution is present more than once
        if (tools.some(tool => tool.codeExecution))
          throw new Error('Gemini code interpreter already defined');

        tools.push({
          codeExecution: {
            // the official docs have no parameters yet...
            // https://ai.google.dev/api/caching#tool
          },
        });
        break;

      default:
        // Unknown tools must be rejected explicitly. A template literal is required here, as
        // a quoted string would print the '${...}' placeholder instead of the tool type.
        throw new Error(`Tool ${(itd as any).type} is not supported by Gemini`);
    }
  }

  return tools;
}
/**
 * Maps an AIX tools policy to the Gemini `toolConfig`.
 *
 * - 'auto' selects mode AUTO
 * - 'any' selects mode ANY
 * - 'function_call' selects mode ANY, restricted to the single named function
 *
 * @param itp - the AIX tools policy
 * @returns the Gemini function calling configuration
 */
function _toGeminiToolConfig(itp: AixTools_ToolsPolicy): NonNullable<TRequest['toolConfig']> {
  switch (itp.type) {

    case 'auto':
      return { functionCallingConfig: { mode: 'AUTO' } };

    case 'any':
      return { functionCallingConfig: { mode: 'ANY' } };

    case 'function_call': {
      // forcing one function: ANY mode with a single-entry allow list
      const allowedFunctionNames = [itp.function_call.name];
      return { functionCallingConfig: { mode: 'ANY', allowedFunctionNames } };
    }
  }
}
/**
 * Builds the Gemini safety settings, applying the same threshold to every harm category.
 *
 * @param threshold - the block threshold to apply
 * @returns one setting per category, or undefined when the threshold is
 *          'HARM_BLOCK_THRESHOLD_UNSPECIFIED' (no safety settings are sent)
 */
function _toGeminiSafetySettings(threshold: GeminiWire_Safety.HarmBlockThreshold): TRequest['safetySettings'] {
  // unspecified: send no settings at all
  if (threshold === 'HARM_BLOCK_THRESHOLD_UNSPECIFIED')
    return undefined;

  const harmCategories = [
    'HARM_CATEGORY_SEXUALLY_EXPLICIT',
    'HARM_CATEGORY_HATE_SPEECH',
    'HARM_CATEGORY_HARASSMENT',
    'HARM_CATEGORY_DANGEROUS_CONTENT',
    'HARM_CATEGORY_CIVIC_INTEGRITY',
  ] as const;

  return harmCategories.map(category => ({ category, threshold }));
}
// Approximate conversions - alternative approaches should be tried until we find the best one
/**
 * Approximates an AIX document part as a Gemini text part holding the document's string form.
 *
 * NOTE: kept as a separate function because Gemini's own way of representing documents could be used here in the future.
 *
 * @param aixPartsDocPart - the document part to convert
 * @returns a text part built from `approxDocPart_To_String`
 */
function _toApproximateGeminiDocPart(aixPartsDocPart: AixParts_DocPart): GeminiWire_ContentParts.ContentPart {
  const docAsText = approxDocPart_To_String(aixPartsDocPart);
  return GeminiWire_ContentParts.TextPart(docAsText);
}
/**
 * Builds the configuration of the Google Search hosted tool.
 *
 * 'unfiltered' (and any unknown value, after a warning) yields an empty configuration, i.e. no
 * time restriction. The other values restrict results to a range ending at `now` and starting
 * 1 day / 1 week / 1 month / 6 months / 1 year earlier, computed in local calendar time.
 *
 * Month and year offsets clamp the day to the last day of the target month: e.g. one month
 * before March 31 is the last day of February, rather than overflowing into March.
 *
 * @param searchGrounding - the search grounding setting of the model
 * @param now - the end of the time range; defaults to the current time and is not modified
 * @returns the `googleSearch` tool configuration
 */
function _buildGoogleSearchConfig(searchGrounding: AixAPI_Model['vndGeminiGoogleSearch'], now: Date = new Date()): NonNullable<NonNullable<TRequest['tools']>[number]['googleSearch']> {

  // enabled: any time interval
  if (searchGrounding === 'unfiltered')
    return {};

  // Moves the date back by whole months, keeping the day of the month when the target month has it
  const subtractMonthsClamped = (date: Date, months: number): void => {
    const dayOfMonth = date.getDate();
    date.setDate(1); // park on the 1st, so that changing the month cannot overflow
    date.setMonth(date.getMonth() - months);
    const daysInTargetMonth = new Date(date.getFullYear(), date.getMonth() + 1, 0).getDate();
    date.setDate(Math.min(dayOfMonth, daysInTargetMonth));
  };

  // calculate the time range based on the filter value (on copies, the caller's date is untouched)
  const until = new Date(now);
  const startTime = new Date(until);
  switch (searchGrounding) {
    case '1d':
      startTime.setDate(until.getDate() - 1);
      // Fix "Invalid time range: end_time must be 24 hours after start_time."
      until.setHours(until.getHours() + 1);
      break;

    case '1w':
      startTime.setDate(until.getDate() - 7);
      break;

    case '1m':
      subtractMonthsClamped(startTime, 1);
      break;

    case '6m':
      subtractMonthsClamped(startTime, 6);
      break;

    case '1y':
      // 12 months back: Feb 29 maps to Feb 28 of the previous year
      subtractMonthsClamped(startTime, 12);
      break;

    default:
      console.warn(`Unknown Google Search grounding value: ${searchGrounding}`);
      return {};
  }

  // format timestamps (the milliseconds are removed): https://ai.google.dev/api/caching#Interval
  const toTimestamp = (date: Date): string => date.toISOString().replace(/\.\d{3}Z$/, 'Z');
  return {
    timeRangeFilter: {
      startTime: toTimestamp(startTime),
      endTime: toTimestamp(until),
    },
  };
}