AIX: Support for Images on System Messages

This commit is contained in:
Enrico Ros
2025-09-13 00:42:43 -07:00
parent 011c382360
commit a5199a23d9
5 changed files with 139 additions and 40 deletions
@@ -0,0 +1,74 @@
import { escapeXml } from '~/server/wire';
import { AixAPIChatGenerate_Request,
AixMessages_ChatMessage, AixMessages_SystemMessage, AixMessages_UserMessage, AixParts_DocPart, AixParts_MetaInReferenceToPart } from '../../../api/aix.wiretypes';
/**
 * CGR server-side approximate helper.
 *
 * Scans the system message for the first part whose type is in `splitItems`
 * (by default inline images) and, if found, moves that part and everything
 * after it into a synthetic user message prepended to the chat sequence.
 * The synthetic message carries a `_FLUSH` marker so adapters finalize it
 * instead of batching it with the following same-role message.
 *
 * Returns a new request object (inputs are not mutated) plus a `systemSplit`
 * flag indicating whether a split occurred.
 */
export function aixSpillSystemToUser(chatGenerate: AixAPIChatGenerate_Request, splitItems: AixMessages_SystemMessage['parts'][number]['pt'][] = ['inline_image']): AixAPIChatGenerate_Request & { systemSplit: boolean } {

  const { systemMessage } = chatGenerate;

  // locate the first part that must not remain in the system message
  const splitIndex = systemMessage?.parts.length
    ? systemMessage.parts.findIndex((part) => splitItems.includes(part.pt))
    : -1;

  // nothing to split: pass the request through unchanged
  if (!systemMessage || splitIndex < 0)
    return { ...chatGenerate, systemSplit: false };

  // synthetic user message receives the split part and everything after it;
  // _FLUSH makes sure adapters finalize this message (a bit of a hack)
  const syntheticUserMessage: AixMessages_UserMessage & { _FLUSH: true } = {
    role: 'user',
    parts: systemMessage.parts.slice(splitIndex),
    _FLUSH: true,
  };

  return {
    ...chatGenerate,
    // system message keeps only the parts before the split point (possibly none)
    systemMessage: { ...systemMessage, parts: systemMessage.parts.slice(0, splitIndex) },
    chatSequence: [syntheticUserMessage, ...chatGenerate.chatSequence],
    systemSplit: true,
  };
}
/** True when the message carries the truthy `_FLUSH` marker set by `aixSpillSystemToUser`. */
export function aixSpillShallFlush(message: AixMessages_ChatMessage): boolean {
  const flushMarker = (message as { _FLUSH?: unknown })._FLUSH;
  return Boolean(flushMarker);
}
// Approximate conversions - alternative approaches should be tried until we find the best one
/**
 * Renders a doc part as a markdown fenced code block, with the ref (if any)
 * on the opening fence as the info string.
 */
export function approxDocPart_To_String({ ref, data }: AixParts_DocPart /*, wrapFormat?: 'markdown-code'*/): string {
  // NOTE: Consider a better representation here
  //
  // We use the 'legacy' markdown encoding, but we may consider:
  // - '<doc id='ref' title='title' version='version'>\n...\n</doc>'
  // - ```doc id='ref' title='title' version='version'\n...\n```
  // - # Title [id='ref' version='version']\n...\n
  // - ...more ideas...
  //
  const fenceInfo = ref || '';
  return `\`\`\`${fenceInfo}\n${data.text}\n\`\`\`\n`;
}
/**
 * Serializes an in-reference-to meta part into a `<context>` XML string,
 * XML-escaping each referenced text. Returns null when there are no refs
 * (alternative considered: `<context>User provides no specific references</context>`).
 */
export function approxInReferenceTo_To_XMLString(irt: AixParts_MetaInReferenceToPart): string | null {
  const escapedRefs = irt.referTo.map(({ mText }) => escapeXml(mText));
  switch (escapedRefs.length) {
    case 0:
      return null;
    case 1:
      return `<context>User refers to this in particular:<ref>${escapedRefs[0]}</ref></context>`;
    default:
      return `<context>User refers to ${escapedRefs.length} items:<ref>${escapedRefs.join('</ref><ref>')}</ref></context>`;
  }
}
@@ -1,8 +1,8 @@
import { escapeXml } from '~/server/wire';
import type { AixAPI_Model, AixAPIChatGenerate_Request, AixMessages_ChatMessage, AixParts_DocPart, AixParts_MetaInReferenceToPart, AixTools_ToolDefinition, AixTools_ToolsPolicy } from '../../../api/aix.wiretypes';
import type { AixAPI_Model, AixAPIChatGenerate_Request, AixMessages_ChatMessage, AixTools_ToolDefinition, AixTools_ToolsPolicy } from '../../../api/aix.wiretypes';
import { AnthropicWire_API_Message_Create, AnthropicWire_Blocks } from '../../wiretypes/anthropic.wiretypes';
import { aixSpillShallFlush, aixSpillSystemToUser, approxDocPart_To_String, approxInReferenceTo_To_XMLString } from './adapters.common';
// configuration
const hotFixImagePartsFirst = true;
@@ -14,7 +14,10 @@ const hotFixMapModelImagesToUser = true;
type TRequest = AnthropicWire_API_Message_Create.Request;
export function aixToAnthropicMessageCreate(model: AixAPI_Model, chatGenerate: AixAPIChatGenerate_Request, streaming: boolean): TRequest {
export function aixToAnthropicMessageCreate(model: AixAPI_Model, _chatGenerate: AixAPIChatGenerate_Request, streaming: boolean): TRequest {
// Pre-process CGR - approximate spill of System to User message
const chatGenerate = aixSpillSystemToUser(_chatGenerate);
// Convert the system message
let systemMessage: TRequest['system'] = undefined;
@@ -30,6 +33,10 @@ export function aixToAnthropicMessageCreate(model: AixAPI_Model, chatGenerate: A
acc.push(AnthropicWire_Blocks.TextBlock(approxDocPart_To_String(part)));
break;
case 'inline_image':
// we have already removed image parts from the system message
throw new Error('Anthropic: images have to be in user messages, not in system message');
case 'meta_cache_control':
if (!acc.length)
console.warn('Anthropic: cache_control without a message to attach to');
@@ -40,6 +47,7 @@ export function aixToAnthropicMessageCreate(model: AixAPI_Model, chatGenerate: A
break;
default:
const _exhaustiveCheck: never = part;
throw new Error(`Unsupported part type in System message: ${(part as any).pt}`);
}
return acc;
@@ -76,6 +84,12 @@ export function aixToAnthropicMessageCreate(model: AixAPI_Model, chatGenerate: A
}
currentMessage.content.push(content);
}
// Flush: interrupt batching within the same-role and finalize the current message
if (aixSpillShallFlush(aixMessage) && currentMessage) {
chatMessages.push(currentMessage);
currentMessage = null;
}
}
if (currentMessage)
chatMessages.push(currentMessage);
@@ -305,27 +319,3 @@ function _toAnthropicToolChoice(itp: AixTools_ToolsPolicy): NonNullable<TRequest
return { type: 'tool' as const, name: itp.function_call.name };
}
}
// Approximate conversions - alternative approaches should be tried until we find the best one
export function approxDocPart_To_String({ ref, data }: AixParts_DocPart /*, wrapFormat?: 'markdown-code'*/): string {
// NOTE: Consider a better representation here
//
// We use the 'legacy' markdown encoding, but we may consider:
// - '<doc id='ref' title='title' version='version'>\n...\n</doc>'
// - ```doc id='ref' title='title' version='version'\n...\n```
// - # Title [id='ref' version='version']\n...\n
// - ...more ideas...
//
return '```' + (ref || '') + '\n' + data.text + '\n```\n';
}
export function approxInReferenceTo_To_XMLString(irt: AixParts_MetaInReferenceToPart): string | null {
const refs = irt.referTo.map(r => escapeXml(r.mText));
if (!refs.length)
return null; // `<context>User provides no specific references</context>`;
return refs.length === 1
? `<context>User refers to this in particular:<ref>${refs[0]}</ref></context>`
: `<context>User refers to ${refs.length} items:<ref>${refs.join('</ref><ref>')}</ref></context>`;
}
@@ -1,7 +1,7 @@
import type { AixAPI_Model, AixAPIChatGenerate_Request, AixMessages_ChatMessage, AixParts_DocPart, AixTools_ToolDefinition, AixTools_ToolsPolicy } from '../../../api/aix.wiretypes';
import { GeminiWire_API_Generate_Content, GeminiWire_ContentParts, GeminiWire_Messages, GeminiWire_Safety, GeminiWire_ToolDeclarations } from '../../wiretypes/gemini.wiretypes';
import { approxDocPart_To_String, approxInReferenceTo_To_XMLString } from './anthropic.messageCreate';
import { aixSpillSystemToUser, approxDocPart_To_String, approxInReferenceTo_To_XMLString } from './adapters.common';
// configuration
@@ -9,10 +9,13 @@ const hotFixImagePartsFirst = true; // https://ai.google.dev/gemini-api/docs/ima
const hotFixReplaceEmptyMessagesWithEmptyTextPart = true;
export function aixToGeminiGenerateContent(model: AixAPI_Model, chatGenerate: AixAPIChatGenerate_Request, geminiSafetyThreshold: GeminiWire_Safety.HarmBlockThreshold, jsonOutput: boolean, _streaming: boolean): TRequest {
export function aixToGeminiGenerateContent(model: AixAPI_Model, _chatGenerate: AixAPIChatGenerate_Request, geminiSafetyThreshold: GeminiWire_Safety.HarmBlockThreshold, jsonOutput: boolean, _streaming: boolean): TRequest {
// Note: the streaming setting is ignored as it only belongs in the path
// Pre-process CGR - approximate spill of System to User message - note: no need to flush as every message is not batched
const chatGenerate = aixSpillSystemToUser(_chatGenerate);
// System Instructions
let systemInstruction: TRequest['systemInstruction'] = undefined;
if (chatGenerate.systemMessage?.parts.length) {
@@ -27,6 +30,10 @@ export function aixToGeminiGenerateContent(model: AixAPI_Model, chatGenerate: Ai
acc.parts.push(GeminiWire_ContentParts.TextPart(approxDocPart_To_String(part)));
break;
case 'inline_image':
// we have already removed image parts from the system message
throw new Error('Gemini: images have to be in user messages, not in system message');
case 'meta_cache_control':
// ignore this breakpoint hint - Anthropic only
break;
@@ -3,7 +3,7 @@ import type { OpenAIDialects } from '~/modules/llms/server/openai/openai.router'
import { AixAPI_Model, AixAPIChatGenerate_Request, AixMessages_ChatMessage, AixMessages_SystemMessage, AixParts_DocPart, AixParts_InlineAudioPart, AixParts_MetaInReferenceToPart, AixTools_ToolDefinition, AixTools_ToolsPolicy } from '../../../api/aix.wiretypes';
import { OpenAIWire_API_Chat_Completions, OpenAIWire_ContentParts, OpenAIWire_Messages } from '../../wiretypes/openai.wiretypes';
import { approxDocPart_To_String } from './anthropic.messageCreate';
import { aixSpillShallFlush, aixSpillSystemToUser, approxDocPart_To_String } from './adapters.common';
//
@@ -29,7 +29,10 @@ const approxSystemMessageJoiner = '\n\n---\n\n';
type TRequest = OpenAIWire_API_Chat_Completions.Request;
type TRequestMessages = TRequest['messages'];
export function aixToOpenAIChatCompletions(openAIDialect: OpenAIDialects, model: AixAPI_Model, chatGenerate: AixAPIChatGenerate_Request, jsonOutput: boolean, streaming: boolean): TRequest {
export function aixToOpenAIChatCompletions(openAIDialect: OpenAIDialects, model: AixAPI_Model, _chatGenerate: AixAPIChatGenerate_Request, jsonOutput: boolean, streaming: boolean): TRequest {
// Pre-process CGR - approximate spill of System to User message
const chatGenerate = aixSpillSystemToUser(_chatGenerate);
// Dialect incompatibilities -> Hotfixes
const hotFixAlternateUserAssistantRoles = openAIDialect === 'deepseek' || openAIDialect === 'perplexity';
@@ -362,6 +365,10 @@ function _toOpenAIMessages(systemMessage: AixMessages_SystemMessage | null, chat
msg0TextParts.push(aixDocPart_to_OpenAITextContent(part));
break;
case 'inline_image':
// we have already removed image parts from the system message
throw new Error('OpenAI ChatCompletions: images have to be in user messages, not in system message');
case 'meta_cache_control':
// ignore this breakpoint hint - Anthropic only
break;
@@ -386,7 +393,9 @@ function _toOpenAIMessages(systemMessage: AixMessages_SystemMessage | null, chat
// Convert the messages
for (const { parts, role } of chatSequence) {
let allowAppend = true;
for (const aixMessage of chatSequence) {
const { parts, role } = aixMessage;
switch (role) {
case 'user':
@@ -398,20 +407,22 @@ function _toOpenAIMessages(systemMessage: AixMessages_SystemMessage | null, chat
const textContentPart = OpenAIWire_ContentParts.TextContentPart(part.text);
// Append to existing content[], or new message
if (currentMessage?.role === 'user' && Array.isArray(currentMessage.content))
if (allowAppend && currentMessage?.role === 'user' && Array.isArray(currentMessage.content))
currentMessage.content.push(textContentPart);
else
chatMessages.push({ role: 'user', content: hotFixPreferArrayUserContent ? [textContentPart] : textContentPart.text });
allowAppend = true;
break;
case 'doc':
const docContentPart = aixDocPart_to_OpenAITextContent(part);
// Append to existing content[], or new message
if (currentMessage?.role === 'user' && Array.isArray(currentMessage.content))
if (allowAppend && currentMessage?.role === 'user' && Array.isArray(currentMessage.content))
currentMessage.content.push(docContentPart);
else
chatMessages.push({ role: 'user', content: hotFixPreferArrayUserContent ? [docContentPart] : docContentPart.text });
allowAppend = true;
break;
case 'inline_image':
@@ -421,10 +432,11 @@ function _toOpenAIMessages(systemMessage: AixMessages_SystemMessage | null, chat
const imageContentPart = OpenAIWire_ContentParts.ImageContentPart(base64DataUrl, hotFixForceImageContentPartOpenAIDetail);
// Append to existing content[], or new message
if (currentMessage?.role === 'user' && Array.isArray(currentMessage.content))
if (allowAppend && currentMessage?.role === 'user' && Array.isArray(currentMessage.content))
currentMessage.content.push(imageContentPart);
else
chatMessages.push({ role: 'user', content: [imageContentPart] });
allowAppend = true;
break;
case 'meta_cache_control':
@@ -443,6 +455,9 @@ function _toOpenAIMessages(systemMessage: AixMessages_SystemMessage | null, chat
throw new Error(`Unsupported part type in User message: ${(part as any).pt}`);
}
}
// If this message shall be flushed, disallow append once next
allowAppend = !aixSpillShallFlush(aixMessage);
break;
case 'model':
@@ -4,7 +4,7 @@ import { AixAPI_Model, AixAPIChatGenerate_Request, AixMessages_ChatMessage, AixM
import { OpenAIWire_API_Responses, OpenAIWire_Responses_Items, OpenAIWire_Responses_Tools } from '../../wiretypes/openai.wiretypes';
import { aixDocPart_to_OpenAITextContent, aixMetaRef_to_OpenAIText, aixTexts_to_OpenAIInstructionText } from './openai.chatCompletions';
import { approxDocPart_To_String } from './anthropic.messageCreate';
import { aixSpillShallFlush, aixSpillSystemToUser, approxDocPart_To_String } from './adapters.common';
// configuration
@@ -23,7 +23,10 @@ type TRequestTool = OpenAIWire_Responses_Tools.Tool;
* - much side functionality is not implemented yet
* - testing with o3-pro only for now
*/
export function aixToOpenAIResponses(openAIDialect: OpenAIDialects, model: AixAPI_Model, chatGenerate: AixAPIChatGenerate_Request, jsonOutput: boolean, streaming: boolean): TRequest {
export function aixToOpenAIResponses(openAIDialect: OpenAIDialects, model: AixAPI_Model, _chatGenerate: AixAPIChatGenerate_Request, jsonOutput: boolean, streaming: boolean): TRequest {
// Pre-process CGR - approximate spill of System to User message
const chatGenerate = aixSpillSystemToUser(_chatGenerate);
// [OpenAI] Vendor-specific model checks
const isOpenAIOFamily = ['gpt-6', 'gpt-5', 'o4', 'o3', 'o1'].some(_id => model.id === _id || model.id.startsWith(_id + '-'));
@@ -168,6 +171,10 @@ function _toOpenAIResponsesRequestInput(systemMessage: AixMessages_SystemMessage
instructionsParts.push(aixDocPart_to_OpenAITextContent(part).text);
break;
case 'inline_image':
// we have already removed image parts from the system message
throw new Error('OpenAI Responses: images have to be in user messages, not in system message');
case 'meta_cache_control':
// ignore this breakpoint hint - Anthropic only
break;
@@ -187,10 +194,11 @@ function _toOpenAIResponsesRequestInput(systemMessage: AixMessages_SystemMessage
type FunctionCallMessage = OpenAIWire_Responses_Items.OutputFunctionCallItem;
type FunctionCallOutputMessage = OpenAIWire_Responses_Items.FunctionToolCallOutput;
let allowUserAppend = true;
function userMessage() {
// Ensure the last message is a user message, or create a new one
let lastMessage = chatMessages.length ? chatMessages[chatMessages.length - 1] : undefined;
if (lastMessage && lastMessage.type === 'message' && lastMessage.role === 'user')
if (allowUserAppend && lastMessage && lastMessage.type === 'message' && lastMessage.role === 'user')
return lastMessage;
const newMessage: UserMessage = {
type: 'message',
@@ -198,6 +206,7 @@ function _toOpenAIResponsesRequestInput(systemMessage: AixMessages_SystemMessage
content: [],
};
chatMessages.push(newMessage);
allowUserAppend = true;
return newMessage;
}
@@ -245,7 +254,8 @@ function _toOpenAIResponsesRequestInput(systemMessage: AixMessages_SystemMessage
* - assistant messages to the old Input Message format (which doesn't need IDs)
*
*/
for (const { role: messageRole, parts: messageParts } of chatSequence) {
for (const aixMessage of chatSequence) {
const { role: messageRole, parts: messageParts } = aixMessage;
switch (messageRole) {
case 'user':
@@ -295,6 +305,9 @@ function _toOpenAIResponsesRequestInput(systemMessage: AixMessages_SystemMessage
throw new Error(`Unsupported part type in User message: ${uPt}`);
}
}
// If this message shall be flushed, disallow append once next
allowUserAppend = !aixSpillShallFlush(aixMessage);
break;
case 'model':