From ffd76dc587ed23555598573ecaa6efb6a65f1d46 Mon Sep 17 00:00:00 2001 From: Enrico Ros Date: Wed, 5 Nov 2025 14:11:35 -0800 Subject: [PATCH] *Image Captioning with a dedicated (configurable) model. Fixes #862 --- .../llmattachments/LLMAttachmentButton.tsx | 1 + src/apps/settings-modal/AppChatSettingsAI.tsx | 7 ++ .../attachment-drafts/attachment.pipeline.ts | 33 ++++++++ .../attachment-drafts/attachment.types.ts | 2 +- src/common/stores/chat/chat.fragments.ts | 2 +- .../stores/llms/model.domains.registry.ts | 11 ++- src/common/stores/llms/model.domains.types.ts | 5 ++ .../image-caption/imageCaptionFromImage.ts | 83 +++++++++++++++++++ src/modules/aix/server/api/aix.wiretypes.ts | 1 + .../llms/models-modal/LLMOptionsModal.tsx | 4 +- 10 files changed, 144 insertions(+), 5 deletions(-) create mode 100644 src/modules/aifn/image-caption/imageCaptionFromImage.ts diff --git a/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx b/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx index ffdd15162..e2235c50c 100644 --- a/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx +++ b/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx @@ -98,6 +98,7 @@ const converterTypeToIconMap: { [key in AttachmentDraftConverterType]: React.Com 'image-resized-high': PhotoSizeSelectLargeOutlinedIcon, 'image-resized-low': PhotoSizeSelectSmallOutlinedIcon, 'image-to-default': ImageOutlinedIcon, + 'image-caption': AbcIcon, 'image-ocr': AbcIcon, 'pdf-text': PictureAsPdfIcon, 'pdf-images': PermMediaOutlinedIcon, diff --git a/src/apps/settings-modal/AppChatSettingsAI.tsx b/src/apps/settings-modal/AppChatSettingsAI.tsx index a454822e9..4b5390250 100644 --- a/src/apps/settings-modal/AppChatSettingsAI.tsx +++ b/src/apps/settings-modal/AppChatSettingsAI.tsx @@ -129,6 +129,13 @@ export function AppChatSettingsAI() { } /> + + {labsDevMode && ( edit(attachment.id, { outputsConversionProgress: progress / 100 }), + ); + // if we're here we shall have valid text + newFragments.push(createDocAttachmentFragment( + title, + caption + ' (Caption)', + DVMimeType.TextPlain, + createDMessageDataInlineText(captionText || 'This image could not be described', 'text/plain'), + refString, + DOCPART_DEFAULT_VERSION, + { ...docMeta, srcOcrFrom: 'image-caption' }, + )); + } catch (error: any) { + console.log('[DEV] Failed to caption image:', error); + const errorText = `[Captioning failed: ${error?.message || String(error)}]`; + newFragments.push(createDocAttachmentFragment(title, caption + ' (Error)', DVMimeType.TextPlain, createDMessageDataInlineText(errorText, 'text/plain'), refString, DOCPART_DEFAULT_VERSION, { ...docMeta, srcOcrFrom: 'image-caption' })); + } + break; + // pdf to text case 'pdf-text': diff --git a/src/common/attachment-drafts/attachment.types.ts b/src/common/attachment-drafts/attachment.types.ts index ea18b184a..bc7cf8403 100644 --- a/src/common/attachment-drafts/attachment.types.ts +++ b/src/common/attachment-drafts/attachment.types.ts @@ -136,7 +136,7 @@ export type AttachmentDraftConverter = { export type AttachmentDraftConverterType = | 'text' | 'rich-text' | 'rich-text-cleaner' | 'rich-text-table' - | 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-to-default' + | 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-caption' | 'image-to-default' | 'pdf-text' | 'pdf-images' | 'pdf-text-and-images' | 'docx-to-html' | 'url-page-text' | 'url-page-markdown' | 'url-page-html' | 'url-page-null' | 'url-page-image' diff --git a/src/common/stores/chat/chat.fragments.ts b/src/common/stores/chat/chat.fragments.ts index 4d0b99eed..023125990 100644 --- a/src/common/stores/chat/chat.fragments.ts +++ b/src/common/stores/chat/chat.fragments.ts @@ -119,7 +119,7 @@ type DMessageDocMeta = { codeLanguage?: string; srcFileName?: string; srcFileSize?: number; - srcOcrFrom?: 'image' | 'pdf'; + srcOcrFrom?: 'image' | 'pdf' | 'image-caption'; } diff --git a/src/common/stores/llms/model.domains.registry.ts b/src/common/stores/llms/model.domains.registry.ts index dc5f52fec..4c1f63589 100644 --- a/src/common/stores/llms/model.domains.registry.ts +++ b/src/common/stores/llms/model.domains.registry.ts @@ -19,7 +19,7 @@ type ModelDomainSpec = { }; -export const ModelDomainsList: DModelDomainId[] = ['primaryChat', 'codeApply', 'fastUtil'] as const; +export const ModelDomainsList: DModelDomainId[] = ['primaryChat', 'codeApply', 'fastUtil', 'imageCaption'] as const; export const ModelDomainsRegistry: Record = { primaryChat: { @@ -48,6 +48,15 @@ export const ModelDomainsRegistry: Record = { autoStrategy: 'topVendorLowestCost', requiredInterfaces: [LLM_IF_OAI_Fn], // NOTE: we do enforce this already, although this may not be correctly set for all vendors }, + imageCaption: { + label: 'Image Captioning', + confLabel: 'Vision', + confTooltip: 'Vision model for image captioning', + description: 'Describes images as text', + recommended: 'Qwen VL', + autoStrategy: 'topVendorTopLlm', + fallbackDomain: 'primaryChat', + }, }; diff --git a/src/common/stores/llms/model.domains.types.ts b/src/common/stores/llms/model.domains.types.ts index b5b422735..40e99c7b1 100644 --- a/src/common/stores/llms/model.domains.types.ts +++ b/src/common/stores/llms/model.domains.types.ts @@ -17,4 +17,9 @@ export type DModelDomainId = * Fast Utility model; must have function calling, but we won't enforce in the code for now until all LLMs are correctly identified as FC or not - used for quick responses and simple tasks */ 'fastUtil' + | + /** + * Image Captioning model - used to generate detailed text descriptions of images before sending to primary chat model + */ + 'imageCaption' ; \ No newline at end of file diff --git a/src/modules/aifn/image-caption/imageCaptionFromImage.ts b/src/modules/aifn/image-caption/imageCaptionFromImage.ts new file mode 100644 index 000000000..c8fc35450 --- /dev/null +++ b/src/modules/aifn/image-caption/imageCaptionFromImage.ts @@ -0,0 +1,83 @@ +import type { AixAPIChatGenerate_Request } from '~/modules/aix/server/api/aix.wiretypes'; +import { aixChatGenerateContent_DMessage_orThrow, aixCreateChatGenerateContext } from '~/modules/aix/client/aix.client'; + +import { convert_Blob_To_Base64 } from '~/common/util/blobUtils'; +import { getDomainModelIdOrThrow } from '~/common/stores/llms/store-llms'; +import { messageFragmentsReduceText } from '~/common/stores/chat/chat.message'; + + +/** + * System prompt for image captioning - designed to minimize information loss + */ +const IMAGE_CAPTIONING_SYSTEM_PROMPT = `You are an expert at describing images in comprehensive detail. Your goal is to create a text description that captures as much visual information as possible, minimizing information loss for downstream AI models that will only see your text description. + +Provide a detailed description covering: +1. **Overall Scene & Main Elements**: What is the primary subject? What is happening? +2. **Visual Details**: Colors, textures, patterns, materials, lighting, shadows, style (photographic, illustration, 3D render, etc.) +3. **Spatial Layout & Composition**: Positions and relationships between elements, foreground/background, arrangement +4. **Text Content**: If any text is visible, transcribe it exactly as it appears +5. **Context & Atmosphere**: Mood, setting, time of day, weather conditions (if applicable) +6. **Technical Aspects**: For UI/diagrams/charts - describe structure, labels, data, flow, connections, hierarchy + +Be thorough but concise. Prioritize information that would be difficult to infer from a general description.`; + + +/** + * Generate a detailed text description of an image using a vision model + * @returns The generated caption text + */ +export async function imageCaptionFromImageOrThrow( + imageBlob: Blob, + imageMimeType: string, + contextRef: string, + abortSignal: AbortSignal, + onProgress?: (progress: number) => void, +): Promise { + + // can throw if no model + const llmId = getDomainModelIdOrThrow(['imageCaption', 'fastUtil'], false, true, 'aifn-image-caption'); + + // image -> base64 + onProgress?.(0); + const base64Data = await convert_Blob_To_Base64(imageBlob, 'aifn-image-caption'); + onProgress?.(20); + + // create the vision request with inline image + const visionRequest: AixAPIChatGenerate_Request = { + systemMessage: { + parts: [{ pt: 'text', text: IMAGE_CAPTIONING_SYSTEM_PROMPT }], + }, + chatSequence: [{ + role: 'user', + parts: [ + { pt: 'text', text: 'Describe this image in comprehensive detail.' }, + { pt: 'inline_image', mimeType: imageMimeType as any, base64: base64Data }, + ], + }], + } as const; + + // call AIX with vision model + let lastProgress = 40; + const result = await aixChatGenerateContent_DMessage_orThrow( + llmId, + visionRequest, + aixCreateChatGenerateContext('aifn-image-caption', contextRef), + true, // streaming + { abortSignal }, + (update, isDone) => { + // update progress during streaming, 40...90% with every token (so we assume at least 200 tokens) + if (!isDone && onProgress) + onProgress(Math.round(lastProgress = Math.min(90, lastProgress + 0.25))); + }, + ); + + // extract text from the result + onProgress?.(100); + + // concatenate all text fragments + const caption = messageFragmentsReduceText(result.fragments, '', false); + if (!caption.trim()) + throw new Error('Vision model returned empty caption'); + + return caption.trim(); +} diff --git a/src/modules/aix/server/api/aix.wiretypes.ts b/src/modules/aix/server/api/aix.wiretypes.ts index 56b8d250a..69cccb695 100644 --- a/src/modules/aix/server/api/aix.wiretypes.ts +++ b/src/modules/aix/server/api/aix.wiretypes.ts @@ -471,6 +471,7 @@ export namespace AixWire_API { // streaming AI operations 'ai-diagram', // making a diagram - messageId 'ai-flattener', // flattening a thread - messageId of the first message + 'aifn-image-caption', // generating image captions - attachmentId 'beam-gather', // fusing beam rays - fusionId 'beam-scatter', // scattering beam rays - rayId 'call', // having a phone conversation - messageId of the first message diff --git a/src/modules/llms/models-modal/LLMOptionsModal.tsx b/src/modules/llms/models-modal/LLMOptionsModal.tsx index cc8183eff..5ba5509bf 100644 --- a/src/modules/llms/models-modal/LLMOptionsModal.tsx +++ b/src/modules/llms/models-modal/LLMOptionsModal.tsx @@ -79,7 +79,7 @@ export function LLMOptionsModal(props: { id: DLLMId, onClose: () => void }) { // state - auto-open details if user has customized pricing or token limits const [showDetails, setShowDetails] = React.useState( - !!llm?.userPricing || llm?.userContextTokens !== undefined || llm?.userMaxOutputTokens !== undefined + !!llm?.userPricing || llm?.userContextTokens !== undefined || llm?.userMaxOutputTokens !== undefined, ); const domainAssignments = useModelDomains(); const { removeLLM, updateLLM, assignDomainModelId, resetLLMUserParameters } = llmsStoreActions(); @@ -258,7 +258,7 @@ export function LLMOptionsModal(props: { id: DLLMId, onClose: () => void }) { - {ModelDomainsList.map(domainId => { + {ModelDomainsList.filter(dId => !['imageCaption'].includes(dId)).map(domainId => { const domainSpec = ModelDomainsRegistry[domainId]; const domainModelId = domainAssignments[domainId]?.modelId; const isActive = domainModelId === llm.id;