From a5199a23d986c541fc053082f3b236eee3f2af0a Mon Sep 17 00:00:00 2001 From: Enrico Ros Date: Sat, 13 Sep 2025 00:42:43 -0700 Subject: [PATCH] AIX: Support for Images on System Messages --- .../chatGenerate/adapters/adapters.common.ts | 74 +++++++++++++++++++ .../adapters/anthropic.messageCreate.ts | 46 +++++------- .../adapters/gemini.generateContent.ts | 11 ++- .../adapters/openai.chatCompletions.ts | 27 +++++-- .../adapters/openai.responsesCreate.ts | 21 +++++- 5 files changed, 139 insertions(+), 40 deletions(-) create mode 100644 src/modules/aix/server/dispatch/chatGenerate/adapters/adapters.common.ts diff --git a/src/modules/aix/server/dispatch/chatGenerate/adapters/adapters.common.ts b/src/modules/aix/server/dispatch/chatGenerate/adapters/adapters.common.ts new file mode 100644 index 000000000..0859e1439 --- /dev/null +++ b/src/modules/aix/server/dispatch/chatGenerate/adapters/adapters.common.ts @@ -0,0 +1,74 @@ +import { escapeXml } from '~/server/wire'; + +import { AixAPIChatGenerate_Request, + AixMessages_ChatMessage, AixMessages_SystemMessage, AixMessages_UserMessage, AixParts_DocPart, AixParts_MetaInReferenceToPart } from '../../../api/aix.wiretypes'; + + +/** + * CGR Server-side approximate Helper + * Finds a cut point (if any) in the system message to move everything after it to a user message. + */ +export function aixSpillSystemToUser(chatGenerate: AixAPIChatGenerate_Request, splitItems: AixMessages_SystemMessage['parts'][number]['pt'][] = ['inline_image']): AixAPIChatGenerate_Request & { systemSplit: boolean } { + let systemSplit = false; + let { systemMessage, chatSequence } = chatGenerate; + + // check if splittable + if (systemMessage?.parts.length) { + const splitIndex = systemMessage.parts.findIndex((p) => splitItems.includes(p.pt)); + if (splitIndex >= 0) { + // perform the split + const partsPreSplit = systemMessage.parts.slice(0, splitIndex); + const partsPostSplit = systemMessage.parts.slice(splitIndex); + + // system message keeps the first part + systemMessage = { + ...systemMessage, + parts: partsPreSplit + }; + + // user message gets the rest + const userSynthMessage: AixMessages_UserMessage & { _FLUSH: true } = { + role: 'user', + parts: partsPostSplit, + _FLUSH: true, // make sure we finalize this part; this is a bit of a hack + }; + chatSequence = [userSynthMessage, ...chatSequence]; + systemSplit = true; + } + } + + return { + ...chatGenerate, + systemMessage, + chatSequence: chatSequence, + systemSplit, + } +} + +export function aixSpillShallFlush(message: AixMessages_ChatMessage): boolean { + return '_FLUSH' in message && !!message._FLUSH; +} + + +// Approximate conversions - alternative approaches should be tried until we find the best one + +export function approxDocPart_To_String({ ref, data }: AixParts_DocPart /*, wrapFormat?: 'markdown-code'*/): string { + // NOTE: Consider a better representation here + // + // We use the 'legacy' markdown encoding, but we may consider: + // - '\n...\n' + // - ```doc id='ref' title='title' version='version'\n...\n``` + // - # Title [id='ref' version='version']\n...\n + // - ...more ideas... + // + return '```' + (ref || '') + '\n' + data.text + '\n```\n'; +} + +export function approxInReferenceTo_To_XMLString(irt: AixParts_MetaInReferenceToPart): string | null { + const refs = irt.referTo.map(r => escapeXml(r.mText)); + if (!refs.length) + return null; // `User provides no specific references`; + return refs.length === 1 + ? `User refers to this in particular:${refs[0]}` + : `User refers to ${refs.length} items:${refs.join('')}`; +} \ No newline at end of file diff --git a/src/modules/aix/server/dispatch/chatGenerate/adapters/anthropic.messageCreate.ts b/src/modules/aix/server/dispatch/chatGenerate/adapters/anthropic.messageCreate.ts index e55ff28a2..99285d727 100644 --- a/src/modules/aix/server/dispatch/chatGenerate/adapters/anthropic.messageCreate.ts +++ b/src/modules/aix/server/dispatch/chatGenerate/adapters/anthropic.messageCreate.ts @@ -1,8 +1,8 @@ -import { escapeXml } from '~/server/wire'; - -import type { AixAPI_Model, AixAPIChatGenerate_Request, AixMessages_ChatMessage, AixParts_DocPart, AixParts_MetaInReferenceToPart, AixTools_ToolDefinition, AixTools_ToolsPolicy } from '../../../api/aix.wiretypes'; +import type { AixAPI_Model, AixAPIChatGenerate_Request, AixMessages_ChatMessage, AixTools_ToolDefinition, AixTools_ToolsPolicy } from '../../../api/aix.wiretypes'; import { AnthropicWire_API_Message_Create, AnthropicWire_Blocks } from '../../wiretypes/anthropic.wiretypes'; +import { aixSpillShallFlush, aixSpillSystemToUser, approxDocPart_To_String, approxInReferenceTo_To_XMLString } from './adapters.common'; + // configuration const hotFixImagePartsFirst = true; @@ -14,7 +14,10 @@ const hotFixMapModelImagesToUser = true; type TRequest = AnthropicWire_API_Message_Create.Request; -export function aixToAnthropicMessageCreate(model: AixAPI_Model, chatGenerate: AixAPIChatGenerate_Request, streaming: boolean): TRequest { +export function aixToAnthropicMessageCreate(model: AixAPI_Model, _chatGenerate: AixAPIChatGenerate_Request, streaming: boolean): TRequest { + + // Pre-process CGR - approximate spill of System to User message + const chatGenerate = aixSpillSystemToUser(_chatGenerate); // Convert the system message let systemMessage: TRequest['system'] = undefined; @@ -30,6 +33,10 @@ export function aixToAnthropicMessageCreate(model: AixAPI_Model, chatGenerate: A acc.push(AnthropicWire_Blocks.TextBlock(approxDocPart_To_String(part))); break; + case 'inline_image': + // we have already removed image parts from the system message + throw new Error('Anthropic: images have to be in user messages, not in system message'); + case 'meta_cache_control': if (!acc.length) console.warn('Anthropic: cache_control without a message to attach to'); @@ -40,6 +47,7 @@ export function aixToAnthropicMessageCreate(model: AixAPI_Model, chatGenerate: A break; default: + const _exhaustiveCheck: never = part; throw new Error(`Unsupported part type in System message: ${(part as any).pt}`); } return acc; @@ -76,6 +84,12 @@ export function aixToAnthropicMessageCreate(model: AixAPI_Model, chatGenerate: A } currentMessage.content.push(content); } + + // Flush: interrupt batching within the same-role and finalize the current message + if (aixSpillShallFlush(aixMessage) && currentMessage) { + chatMessages.push(currentMessage); + currentMessage = null; + } } if (currentMessage) chatMessages.push(currentMessage); @@ -305,27 +319,3 @@ function _toAnthropicToolChoice(itp: AixTools_ToolsPolicy): NonNullable\n...\n' - // - ```doc id='ref' title='title' version='version'\n...\n``` - // - # Title [id='ref' version='version']\n...\n - // - ...more ideas... - // - return '```' + (ref || '') + '\n' + data.text + '\n```\n'; -} - -export function approxInReferenceTo_To_XMLString(irt: AixParts_MetaInReferenceToPart): string | null { - const refs = irt.referTo.map(r => escapeXml(r.mText)); - if (!refs.length) - return null; // `User provides no specific references`; - return refs.length === 1 - ? `User refers to this in particular:${refs[0]}` - : `User refers to ${refs.length} items:${refs.join('')}`; -} diff --git a/src/modules/aix/server/dispatch/chatGenerate/adapters/gemini.generateContent.ts b/src/modules/aix/server/dispatch/chatGenerate/adapters/gemini.generateContent.ts index 0da64a83a..3d8d556a4 100644 --- a/src/modules/aix/server/dispatch/chatGenerate/adapters/gemini.generateContent.ts +++ b/src/modules/aix/server/dispatch/chatGenerate/adapters/gemini.generateContent.ts @@ -1,7 +1,7 @@ import type { AixAPI_Model, AixAPIChatGenerate_Request, AixMessages_ChatMessage, AixParts_DocPart, AixTools_ToolDefinition, AixTools_ToolsPolicy } from '../../../api/aix.wiretypes'; import { GeminiWire_API_Generate_Content, GeminiWire_ContentParts, GeminiWire_Messages, GeminiWire_Safety, GeminiWire_ToolDeclarations } from '../../wiretypes/gemini.wiretypes'; -import { approxDocPart_To_String, approxInReferenceTo_To_XMLString } from './anthropic.messageCreate'; +import { aixSpillSystemToUser, approxDocPart_To_String, approxInReferenceTo_To_XMLString } from './adapters.common'; // configuration @@ -9,10 +9,13 @@ const hotFixImagePartsFirst = true; // https://ai.google.dev/gemini-api/docs/ima const hotFixReplaceEmptyMessagesWithEmptyTextPart = true; -export function aixToGeminiGenerateContent(model: AixAPI_Model, chatGenerate: AixAPIChatGenerate_Request, geminiSafetyThreshold: GeminiWire_Safety.HarmBlockThreshold, jsonOutput: boolean, _streaming: boolean): TRequest { +export function aixToGeminiGenerateContent(model: AixAPI_Model, _chatGenerate: AixAPIChatGenerate_Request, geminiSafetyThreshold: GeminiWire_Safety.HarmBlockThreshold, jsonOutput: boolean, _streaming: boolean): TRequest { // Note: the streaming setting is ignored as it only belongs in the path + // Pre-process CGR - approximate spill of System to User message - note: no need to flush as every message is not batched + const chatGenerate = aixSpillSystemToUser(_chatGenerate); + // System Instructions let systemInstruction: TRequest['systemInstruction'] = undefined; if (chatGenerate.systemMessage?.parts.length) { @@ -27,6 +30,10 @@ export function aixToGeminiGenerateContent(model: AixAPI_Model, chatGenerate: Ai acc.parts.push(GeminiWire_ContentParts.TextPart(approxDocPart_To_String(part))); break; + case 'inline_image': + // we have already removed image parts from the system message + throw new Error('Gemini: images have to be in user messages, not in system message'); + case 'meta_cache_control': // ignore this breakpoint hint - Anthropic only break; diff --git a/src/modules/aix/server/dispatch/chatGenerate/adapters/openai.chatCompletions.ts b/src/modules/aix/server/dispatch/chatGenerate/adapters/openai.chatCompletions.ts index d8967c5e5..a849da8e8 100644 --- a/src/modules/aix/server/dispatch/chatGenerate/adapters/openai.chatCompletions.ts +++ b/src/modules/aix/server/dispatch/chatGenerate/adapters/openai.chatCompletions.ts @@ -3,7 +3,7 @@ import type { OpenAIDialects } from '~/modules/llms/server/openai/openai.router' import { AixAPI_Model, AixAPIChatGenerate_Request, AixMessages_ChatMessage, AixMessages_SystemMessage, AixParts_DocPart, AixParts_InlineAudioPart, AixParts_MetaInReferenceToPart, AixTools_ToolDefinition, AixTools_ToolsPolicy } from '../../../api/aix.wiretypes'; import { OpenAIWire_API_Chat_Completions, OpenAIWire_ContentParts, OpenAIWire_Messages } from '../../wiretypes/openai.wiretypes'; -import { approxDocPart_To_String } from './anthropic.messageCreate'; +import { aixSpillShallFlush, aixSpillSystemToUser, approxDocPart_To_String } from './adapters.common'; // @@ -29,7 +29,10 @@ const approxSystemMessageJoiner = '\n\n---\n\n'; type TRequest = OpenAIWire_API_Chat_Completions.Request; type TRequestMessages = TRequest['messages']; -export function aixToOpenAIChatCompletions(openAIDialect: OpenAIDialects, model: AixAPI_Model, chatGenerate: AixAPIChatGenerate_Request, jsonOutput: boolean, streaming: boolean): TRequest { +export function aixToOpenAIChatCompletions(openAIDialect: OpenAIDialects, model: AixAPI_Model, _chatGenerate: AixAPIChatGenerate_Request, jsonOutput: boolean, streaming: boolean): TRequest { + + // Pre-process CGR - approximate spill of System to User message + const chatGenerate = aixSpillSystemToUser(_chatGenerate); // Dialect incompatibilities -> Hotfixes const hotFixAlternateUserAssistantRoles = openAIDialect === 'deepseek' || openAIDialect === 'perplexity'; @@ -362,6 +365,10 @@ function _toOpenAIMessages(systemMessage: AixMessages_SystemMessage | null, chat msg0TextParts.push(aixDocPart_to_OpenAITextContent(part)); break; + case 'inline_image': + // we have already removed image parts from the system message + throw new Error('OpenAI ChatCompletions: images have to be in user messages, not in system message'); + case 'meta_cache_control': // ignore this breakpoint hint - Anthropic only break; @@ -386,7 +393,9 @@ function _toOpenAIMessages(systemMessage: AixMessages_SystemMessage | null, chat // Convert the messages - for (const { parts, role } of chatSequence) { + let allowAppend = true; + for (const aixMessage of chatSequence) { + const { parts, role } = aixMessage; switch (role) { case 'user': @@ -398,20 +407,22 @@ function _toOpenAIMessages(systemMessage: AixMessages_SystemMessage | null, chat const textContentPart = OpenAIWire_ContentParts.TextContentPart(part.text); // Append to existing content[], or new message - if (currentMessage?.role === 'user' && Array.isArray(currentMessage.content)) + if (allowAppend && currentMessage?.role === 'user' && Array.isArray(currentMessage.content)) currentMessage.content.push(textContentPart); else chatMessages.push({ role: 'user', content: hotFixPreferArrayUserContent ? [textContentPart] : textContentPart.text }); + allowAppend = true; break; case 'doc': const docContentPart = aixDocPart_to_OpenAITextContent(part); // Append to existing content[], or new message - if (currentMessage?.role === 'user' && Array.isArray(currentMessage.content)) + if (allowAppend && currentMessage?.role === 'user' && Array.isArray(currentMessage.content)) currentMessage.content.push(docContentPart); else chatMessages.push({ role: 'user', content: hotFixPreferArrayUserContent ? [docContentPart] : docContentPart.text }); + allowAppend = true; break; case 'inline_image': @@ -421,10 +432,11 @@ function _toOpenAIMessages(systemMessage: AixMessages_SystemMessage | null, chat const imageContentPart = OpenAIWire_ContentParts.ImageContentPart(base64DataUrl, hotFixForceImageContentPartOpenAIDetail); // Append to existing content[], or new message - if (currentMessage?.role === 'user' && Array.isArray(currentMessage.content)) + if (allowAppend && currentMessage?.role === 'user' && Array.isArray(currentMessage.content)) currentMessage.content.push(imageContentPart); else chatMessages.push({ role: 'user', content: [imageContentPart] }); + allowAppend = true; break; case 'meta_cache_control': @@ -443,6 +455,9 @@ function _toOpenAIMessages(systemMessage: AixMessages_SystemMessage | null, chat throw new Error(`Unsupported part type in User message: ${(part as any).pt}`); } } + + // If this message shall be flushed, disallow append once next + allowAppend = !aixSpillShallFlush(aixMessage); break; case 'model': diff --git a/src/modules/aix/server/dispatch/chatGenerate/adapters/openai.responsesCreate.ts b/src/modules/aix/server/dispatch/chatGenerate/adapters/openai.responsesCreate.ts index f2d15537b..e771fec5d 100644 --- a/src/modules/aix/server/dispatch/chatGenerate/adapters/openai.responsesCreate.ts +++ b/src/modules/aix/server/dispatch/chatGenerate/adapters/openai.responsesCreate.ts @@ -4,7 +4,7 @@ import { AixAPI_Model, AixAPIChatGenerate_Request, AixMessages_ChatMessage, AixM import { OpenAIWire_API_Responses, OpenAIWire_Responses_Items, OpenAIWire_Responses_Tools } from '../../wiretypes/openai.wiretypes'; import { aixDocPart_to_OpenAITextContent, aixMetaRef_to_OpenAIText, aixTexts_to_OpenAIInstructionText } from './openai.chatCompletions'; -import { approxDocPart_To_String } from './anthropic.messageCreate'; +import { aixSpillShallFlush, aixSpillSystemToUser, approxDocPart_To_String } from './adapters.common'; // configuration @@ -23,7 +23,10 @@ type TRequestTool = OpenAIWire_Responses_Tools.Tool; * - much side functionality is not implemented yet * - testing with o3-pro only for now */ -export function aixToOpenAIResponses(openAIDialect: OpenAIDialects, model: AixAPI_Model, chatGenerate: AixAPIChatGenerate_Request, jsonOutput: boolean, streaming: boolean): TRequest { +export function aixToOpenAIResponses(openAIDialect: OpenAIDialects, model: AixAPI_Model, _chatGenerate: AixAPIChatGenerate_Request, jsonOutput: boolean, streaming: boolean): TRequest { + + // Pre-process CGR - approximate spill of System to User message + const chatGenerate = aixSpillSystemToUser(_chatGenerate); // [OpenAI] Vendor-specific model checks const isOpenAIOFamily = ['gpt-6', 'gpt-5', 'o4', 'o3', 'o1'].some(_id => model.id === _id || model.id.startsWith(_id + '-')); @@ -168,6 +171,10 @@ function _toOpenAIResponsesRequestInput(systemMessage: AixMessages_SystemMessage instructionsParts.push(aixDocPart_to_OpenAITextContent(part).text); break; + case 'inline_image': + // we have already removed image parts from the system message + throw new Error('OpenAI Responses: images have to be in user messages, not in system message'); + case 'meta_cache_control': // ignore this breakpoint hint - Anthropic only break; @@ -187,10 +194,11 @@ function _toOpenAIResponsesRequestInput(systemMessage: AixMessages_SystemMessage type FunctionCallMessage = OpenAIWire_Responses_Items.OutputFunctionCallItem; type FunctionCallOutputMessage = OpenAIWire_Responses_Items.FunctionToolCallOutput; + let allowUserAppend = true; function userMessage() { // Ensure the last message is a user message, or create a new one let lastMessage = chatMessages.length ? chatMessages[chatMessages.length - 1] : undefined; - if (lastMessage && lastMessage.type === 'message' && lastMessage.role === 'user') + if (allowUserAppend && lastMessage && lastMessage.type === 'message' && lastMessage.role === 'user') return lastMessage; const newMessage: UserMessage = { type: 'message', @@ -198,6 +206,7 @@ function _toOpenAIResponsesRequestInput(systemMessage: AixMessages_SystemMessage content: [], }; chatMessages.push(newMessage); + allowUserAppend = true; return newMessage; } @@ -245,7 +254,8 @@ function _toOpenAIResponsesRequestInput(systemMessage: AixMessages_SystemMessage * - assistant messages to the old Input Message format (which doesn't need IDs) * */ - for (const { role: messageRole, parts: messageParts } of chatSequence) { + for (const aixMessage of chatSequence) { + const { role: messageRole, parts: messageParts } = aixMessage; switch (messageRole) { case 'user': @@ -295,6 +305,9 @@ function _toOpenAIResponsesRequestInput(systemMessage: AixMessages_SystemMessage throw new Error(`Unsupported part type in User message: ${uPt}`); } } + + // If this message shall be flushed, disallow append once next + allowUserAppend = !aixSpillShallFlush(aixMessage); break; case 'model':