From ffd76dc587ed23555598573ecaa6efb6a65f1d46 Mon Sep 17 00:00:00 2001
From: Enrico Ros <enrico.ros@gmail.com>
Date: Wed, 5 Nov 2025 14:11:35 -0800
Subject: [PATCH] *Image Captioning with a dedicated (configurable) model.
 Fixes #862

---
 .../llmattachments/LLMAttachmentButton.tsx    |  1 +
 src/apps/settings-modal/AppChatSettingsAI.tsx |  7 ++
 .../attachment-drafts/attachment.pipeline.ts  | 33 ++++++++
 .../attachment-drafts/attachment.types.ts     |  2 +-
 src/common/stores/chat/chat.fragments.ts      |  2 +-
 .../stores/llms/model.domains.registry.ts     | 11 ++-
 src/common/stores/llms/model.domains.types.ts |  5 ++
 .../image-caption/imageCaptionFromImage.ts    | 83 +++++++++++++++++++
 src/modules/aix/server/api/aix.wiretypes.ts   |  1 +
 .../llms/models-modal/LLMOptionsModal.tsx     |  4 +-
 10 files changed, 144 insertions(+), 5 deletions(-)
 create mode 100644 src/modules/aifn/image-caption/imageCaptionFromImage.ts

diff --git a/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx b/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx
index ffdd15162..e2235c50c 100644
--- a/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx
+++ b/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx
@@ -98,6 +98,7 @@ const converterTypeToIconMap: { [key in AttachmentDraftConverterType]: React.Com
   'image-resized-high': PhotoSizeSelectLargeOutlinedIcon,
   'image-resized-low': PhotoSizeSelectSmallOutlinedIcon,
   'image-to-default': ImageOutlinedIcon,
+  'image-caption': AbcIcon,
   'image-ocr': AbcIcon,
   'pdf-text': PictureAsPdfIcon,
   'pdf-images': PermMediaOutlinedIcon,
diff --git a/src/apps/settings-modal/AppChatSettingsAI.tsx b/src/apps/settings-modal/AppChatSettingsAI.tsx
index a454822e9..4b5390250 100644
--- a/src/apps/settings-modal/AppChatSettingsAI.tsx
+++ b/src/apps/settings-modal/AppChatSettingsAI.tsx
@@ -129,6 +129,13 @@ export function AppChatSettingsAI() {
       </>}
     />
 
+    <FormControlDomainModel
+      domainId='imageCaption'
+      title='Vision model'
+      description='Image captioning'
+      tooltip='Vision model used to generate text descriptions of images when the Caption (Text) attachment option is selected.'
+    />
+
     {labsDevMode && (
       <FormControlDomainModel
         domainId='primaryChat'
diff --git a/src/common/attachment-drafts/attachment.pipeline.ts b/src/common/attachment-drafts/attachment.pipeline.ts
index e2a8d6cad..aa086edb4 100644
--- a/src/common/attachment-drafts/attachment.pipeline.ts
+++ b/src/common/attachment-drafts/attachment.pipeline.ts
@@ -2,12 +2,14 @@ import type { FileWithHandle } from 'browser-fs-access';
 
 import { callBrowseFetchPageOrThrow } from '~/modules/browse/browse.client';
 import { extractYoutubeVideoIDFromURL } from '~/modules/youtube/youtube.utils';
+import { imageCaptionFromImageOrThrow } from '~/modules/aifn/image-caption/imageCaptionFromImage';
 import { youTubeGetVideoData } from '~/modules/youtube/useYouTubeTranscript';
 
 import type { CommonImageMimeTypes } from '~/common/util/imageUtils';
 import { Is } from '~/common/util/pwaUtils';
 import { agiCustomId, agiUuid } from '~/common/util/idUtils';
 import { convert_Base64DataURL_To_Base64WithMimeType, convert_Base64WithMimeType_To_Blob } from '~/common/util/blobUtils';
+import { getDomainModelConfiguration } from '~/common/stores/llms/hooks/useModelDomain';
 import { htmlTableToMarkdown } from '~/common/util/htmlTableToMarkdown';
 import { humanReadableHyphenated } from '~/common/util/textUtils';
 import { pdfToImageDataURLs, pdfToText } from '~/common/util/pdfUtils';
@@ -279,11 +281,13 @@ export function attachmentDefineConverters(source: AttachmentDraftSource, input:
     // Images (Known/Unknown)
     case input.mimeType.startsWith('image/'):
       const inputImageMimeSupported = mimeTypeIsSupportedImage(input.mimeType);
+      const visionModelMissing = !getDomainModelConfiguration('imageCaption', true, true);
       converters.push({ id: 'image-resized-high', name: 'Image (high detail)', disabled: !inputImageMimeSupported });
       converters.push({ id: 'image-resized-low', name: 'Image (low detail)', disabled: !inputImageMimeSupported });
       converters.push({ id: 'image-original', name: 'Image (original quality)', disabled: !inputImageMimeSupported });
       if (!inputImageMimeSupported)
         converters.push({ id: 'image-to-default', name: `As Image (${DEFAULT_ADRAFT_IMAGE_MIMETYPE})` });
+      converters.push({ id: 'image-caption', name: 'Caption (Text)', disabled: visionModelMissing });
       converters.push({ id: 'unhandled', name: 'No Image' });
       converters.push({ id: 'image-ocr', name: 'Add Text (OCR)', isCheckbox: true });
       break;
@@ -590,6 +594,35 @@ export async function attachmentPerformConversion(
         }
         break;
 
+      // image to caption
+      case 'image-caption':
+        if (!_expectBlob(input.data, 'Image captioning converter')) break;
+        try {
+          const abortController = new AbortController();
+          const captionText = await imageCaptionFromImageOrThrow(
+            input.data,
+            input.mimeType,
+            attachment.id,
+            abortController.signal,
+            progress => edit(attachment.id, { outputsConversionProgress: progress / 100 }),
+          );
+          // if we're here we shall have valid text
+          newFragments.push(createDocAttachmentFragment(
+            title,
+            caption + ' (Caption)',
+            DVMimeType.TextPlain,
+            createDMessageDataInlineText(captionText || 'This image could not be described', 'text/plain'),
+            refString,
+            DOCPART_DEFAULT_VERSION,
+            { ...docMeta, srcOcrFrom: 'image-caption' },
+          ));
+        } catch (error: any) {
+          console.log('[DEV] Failed to caption image:', error);
+          const errorText = `[Captioning failed: ${error?.message || String(error)}]`;
+          newFragments.push(createDocAttachmentFragment(title, caption + ' (Error)', DVMimeType.TextPlain, createDMessageDataInlineText(errorText, 'text/plain'), refString, DOCPART_DEFAULT_VERSION, { ...docMeta, srcOcrFrom: 'image-caption' }));
+        }
+        break;
+
 
       // pdf to text
       case 'pdf-text':
diff --git a/src/common/attachment-drafts/attachment.types.ts b/src/common/attachment-drafts/attachment.types.ts
index ea18b184a..bc7cf8403 100644
--- a/src/common/attachment-drafts/attachment.types.ts
+++ b/src/common/attachment-drafts/attachment.types.ts
@@ -136,7 +136,7 @@ export type AttachmentDraftConverter = {
 
 export type AttachmentDraftConverterType =
   | 'text' | 'rich-text' | 'rich-text-cleaner' | 'rich-text-table'
-  | 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-to-default'
+  | 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-caption' | 'image-to-default'
   | 'pdf-text' | 'pdf-images' | 'pdf-text-and-images'
   | 'docx-to-html'
   | 'url-page-text' | 'url-page-markdown' | 'url-page-html' | 'url-page-null' | 'url-page-image'
diff --git a/src/common/stores/chat/chat.fragments.ts b/src/common/stores/chat/chat.fragments.ts
index 4d0b99eed..023125990 100644
--- a/src/common/stores/chat/chat.fragments.ts
+++ b/src/common/stores/chat/chat.fragments.ts
@@ -119,7 +119,7 @@ type DMessageDocMeta = {
   codeLanguage?: string;
   srcFileName?: string;
   srcFileSize?: number;
-  srcOcrFrom?: 'image' | 'pdf';
+  srcOcrFrom?: 'image' | 'pdf' | 'image-caption';
 }
 
 
diff --git a/src/common/stores/llms/model.domains.registry.ts b/src/common/stores/llms/model.domains.registry.ts
index dc5f52fec..4c1f63589 100644
--- a/src/common/stores/llms/model.domains.registry.ts
+++ b/src/common/stores/llms/model.domains.registry.ts
@@ -19,7 +19,7 @@ type ModelDomainSpec = {
 };
 
 
-export const ModelDomainsList: DModelDomainId[] = ['primaryChat', 'codeApply', 'fastUtil'] as const;
+export const ModelDomainsList: DModelDomainId[] = ['primaryChat', 'codeApply', 'fastUtil', 'imageCaption'] as const;
 
 export const ModelDomainsRegistry: Record<DModelDomainId, ModelDomainSpec> = {
   primaryChat: {
@@ -48,6 +48,15 @@ export const ModelDomainsRegistry: Record<DModelDomainId, ModelDomainSpec> = {
     autoStrategy: 'topVendorLowestCost',
     requiredInterfaces: [LLM_IF_OAI_Fn], // NOTE: we do enforce this already, although this may not be correctly set for all vendors
   },
+  imageCaption: {
+    label: 'Image Captioning',
+    confLabel: 'Vision',
+    confTooltip: 'Vision model for image captioning',
+    description: 'Describes images as text',
+    recommended: 'Qwen VL',
+    autoStrategy: 'topVendorTopLlm',
+    fallbackDomain: 'primaryChat',
+  },
 };
 
 
diff --git a/src/common/stores/llms/model.domains.types.ts b/src/common/stores/llms/model.domains.types.ts
index b5b422735..40e99c7b1 100644
--- a/src/common/stores/llms/model.domains.types.ts
+++ b/src/common/stores/llms/model.domains.types.ts
@@ -17,4 +17,9 @@ export type DModelDomainId =
    * Fast Utility model; must have function calling, but we won't enforce in the code for now until all LLMs are correctly identified as FC or not - used for quick responses and simple tasks
    */
   'fastUtil'
+  |
+  /**
+   * Image Captioning model - used to generate detailed text descriptions of images before sending to primary chat model
+   */
+  'imageCaption'
   ;
\ No newline at end of file
diff --git a/src/modules/aifn/image-caption/imageCaptionFromImage.ts b/src/modules/aifn/image-caption/imageCaptionFromImage.ts
new file mode 100644
index 000000000..c8fc35450
--- /dev/null
+++ b/src/modules/aifn/image-caption/imageCaptionFromImage.ts
@@ -0,0 +1,83 @@
+import type { AixAPIChatGenerate_Request } from '~/modules/aix/server/api/aix.wiretypes';
+import { aixChatGenerateContent_DMessage_orThrow, aixCreateChatGenerateContext } from '~/modules/aix/client/aix.client';
+
+import { convert_Blob_To_Base64 } from '~/common/util/blobUtils';
+import { getDomainModelIdOrThrow } from '~/common/stores/llms/store-llms';
+import { messageFragmentsReduceText } from '~/common/stores/chat/chat.message';
+
+
+/**
+ * System prompt for image captioning, sent as the system message of the vision request - worded to minimize information loss in the text description
+ */
+const IMAGE_CAPTIONING_SYSTEM_PROMPT = `You are an expert at describing images in comprehensive detail. Your goal is to create a text description that captures as much visual information as possible, minimizing information loss for downstream AI models that will only see your text description.
+
+Provide a detailed description covering:
+1. **Overall Scene & Main Elements**: What is the primary subject? What is happening?
+2. **Visual Details**: Colors, textures, patterns, materials, lighting, shadows, style (photographic, illustration, 3D render, etc.)
+3. **Spatial Layout & Composition**: Positions and relationships between elements, foreground/background, arrangement
+4. **Text Content**: If any text is visible, transcribe it exactly as it appears
+5. **Context & Atmosphere**: Mood, setting, time of day, weather conditions (if applicable)
+6. **Technical Aspects**: For UI/diagrams/charts - describe structure, labels, data, flow, connections, hierarchy
+
+Be thorough but concise. Prioritize information that would be difficult to infer from a general description.`;
+
+
+/** Generate a detailed text description of an image: encodes the blob as base64, sends it inline to a vision model via AIX (streaming), and reduces the reply to text.
+ * @returns The generated caption text, trimmed of leading/trailing whitespace
+ * @throws Error 'Vision model returned empty caption' when the reply has no non-whitespace text; errors from model resolution, base64 conversion and the AIX call are not caught here
+ */
+export async function imageCaptionFromImageOrThrow(
+  imageBlob: Blob, // image bytes; converted to base64 and attached as an inline image part
+  imageMimeType: string, // used as the inline image's mimeType (cast with 'as any', not validated in this function)
+  contextRef: string, // reference passed to aixCreateChatGenerateContext alongside the 'aifn-image-caption' context name
+  abortSignal: AbortSignal, // forwarded to the AIX call options
+  onProgress?: (progress: number) => void, // optional; invoked with values in the 0..100 range (0, 20, 40..90 while streaming, 100)
+): Promise<string> {
+
+  // resolve the model id (throws if none) - NOTE(review): 'fastUtil' is listed after 'imageCaption'; nothing here checks that it is vision-capable - confirm
+  const llmId = getDomainModelIdOrThrow(['imageCaption', 'fastUtil'], false, true, 'aifn-image-caption');
+
+  // image -> base64, reporting 0% before and 20% after the conversion
+  onProgress?.(0);
+  const base64Data = await convert_Blob_To_Base64(imageBlob, 'aifn-image-caption');
+  onProgress?.(20);
+
+  // create the vision request: system prompt + a single user message carrying the instruction text and the inline image
+  const visionRequest: AixAPIChatGenerate_Request = {
+    systemMessage: {
+      parts: [{ pt: 'text', text: IMAGE_CAPTIONING_SYSTEM_PROMPT }],
+    },
+    chatSequence: [{
+      role: 'user',
+      parts: [
+        { pt: 'text', text: 'Describe this image in comprehensive detail.' },
+        { pt: 'inline_image', mimeType: imageMimeType as any, base64: base64Data },
+      ],
+    }],
+  } as const;
+
+  // call AIX with the resolved model; lastProgress is the running (fractional) progress advanced by the streaming callback
+  let lastProgress = 40;
+  const result = await aixChatGenerateContent_DMessage_orThrow(
+    llmId,
+    visionRequest,
+    aixCreateChatGenerateContext('aifn-image-caption', contextRef),
+    true, // streaming
+    { abortSignal },
+    (update, isDone) => {
+      // advance by 0.25 per non-final streaming update, capped at 90 and rounded when reported (40 -> 90 takes 200 updates)
+      if (!isDone && onProgress)
+        onProgress(Math.round(lastProgress = Math.min(90, lastProgress + 0.25)));
+    },
+  );
+
+  // generation finished: report completion (this happens before the empty-caption check below)
+  onProgress?.(100);
+
+  // concatenate all text fragments into the caption; a whitespace-only result is treated as a failure
+  const caption = messageFragmentsReduceText(result.fragments, '', false);
+  if (!caption.trim())
+    throw new Error('Vision model returned empty caption');
+
+  return caption.trim();
+}
diff --git a/src/modules/aix/server/api/aix.wiretypes.ts b/src/modules/aix/server/api/aix.wiretypes.ts
index 56b8d250a..69cccb695 100644
--- a/src/modules/aix/server/api/aix.wiretypes.ts
+++ b/src/modules/aix/server/api/aix.wiretypes.ts
@@ -471,6 +471,7 @@ export namespace AixWire_API {
       // streaming AI operations
       'ai-diagram',               // making a diagram - messageId
       'ai-flattener',             // flattening a thread - messageId of the first message
+      'aifn-image-caption',       // generating image captions - attachmentId
       'beam-gather',              // fusing beam rays - fusionId
       'beam-scatter',             // scattering beam rays - rayId
       'call',                     // having a phone conversation - messageId of the first message
diff --git a/src/modules/llms/models-modal/LLMOptionsModal.tsx b/src/modules/llms/models-modal/LLMOptionsModal.tsx
index cc8183eff..5ba5509bf 100644
--- a/src/modules/llms/models-modal/LLMOptionsModal.tsx
+++ b/src/modules/llms/models-modal/LLMOptionsModal.tsx
@@ -79,7 +79,7 @@ export function LLMOptionsModal(props: { id: DLLMId, onClose: () => void }) {
 
   // state - auto-open details if user has customized pricing or token limits
   const [showDetails, setShowDetails] = React.useState(
-    !!llm?.userPricing || llm?.userContextTokens !== undefined || llm?.userMaxOutputTokens !== undefined
+    !!llm?.userPricing || llm?.userContextTokens !== undefined || llm?.userMaxOutputTokens !== undefined,
   );
   const domainAssignments = useModelDomains();
   const { removeLLM, updateLLM, assignDomainModelId, resetLLMUserParameters } = llmsStoreActions();
@@ -258,7 +258,7 @@ export function LLMOptionsModal(props: { id: DLLMId, onClose: () => void }) {
       <FormControl orientation='horizontal' sx={{ flexWrap: 'wrap', alignItems: 'center' }}>
         <FormLabelStart title='Assignment' description='Default model' sx={{ minWidth: 80 }} />
         <ButtonGroup orientation='horizontal' size='sm' variant='outlined'>
-          {ModelDomainsList.map(domainId => {
+          {ModelDomainsList.filter(dId => !['imageCaption'].includes(dId)).map(domainId => {
             const domainSpec = ModelDomainsRegistry[domainId];
             const domainModelId = domainAssignments[domainId]?.modelId;
             const isActive = domainModelId === llm.id;