*Image Captioning with a dedicated (configurable) model. Fixes #862

This commit is contained in:
Enrico Ros
2025-11-05 14:11:35 -08:00
parent d7f3594a73
commit ffd76dc587
10 changed files with 144 additions and 5 deletions
@@ -98,6 +98,7 @@ const converterTypeToIconMap: { [key in AttachmentDraftConverterType]: React.Com
'image-resized-high': PhotoSizeSelectLargeOutlinedIcon,
'image-resized-low': PhotoSizeSelectSmallOutlinedIcon,
'image-to-default': ImageOutlinedIcon,
'image-caption': AbcIcon,
'image-ocr': AbcIcon,
'pdf-text': PictureAsPdfIcon,
'pdf-images': PermMediaOutlinedIcon,
@@ -129,6 +129,13 @@ export function AppChatSettingsAI() {
</>}
/>
<FormControlDomainModel
domainId='imageCaption'
title='Vision model'
description='Image captioning'
tooltip='Vision model used to generate text descriptions of images when the Caption (Text) attachment option is selected.'
/>
{labsDevMode && (
<FormControlDomainModel
domainId='primaryChat'
@@ -2,12 +2,14 @@ import type { FileWithHandle } from 'browser-fs-access';
import { callBrowseFetchPageOrThrow } from '~/modules/browse/browse.client';
import { extractYoutubeVideoIDFromURL } from '~/modules/youtube/youtube.utils';
import { imageCaptionFromImageOrThrow } from '~/modules/aifn/image-caption/imageCaptionFromImage';
import { youTubeGetVideoData } from '~/modules/youtube/useYouTubeTranscript';
import type { CommonImageMimeTypes } from '~/common/util/imageUtils';
import { Is } from '~/common/util/pwaUtils';
import { agiCustomId, agiUuid } from '~/common/util/idUtils';
import { convert_Base64DataURL_To_Base64WithMimeType, convert_Base64WithMimeType_To_Blob } from '~/common/util/blobUtils';
import { getDomainModelConfiguration } from '~/common/stores/llms/hooks/useModelDomain';
import { htmlTableToMarkdown } from '~/common/util/htmlTableToMarkdown';
import { humanReadableHyphenated } from '~/common/util/textUtils';
import { pdfToImageDataURLs, pdfToText } from '~/common/util/pdfUtils';
@@ -279,11 +281,13 @@ export function attachmentDefineConverters(source: AttachmentDraftSource, input:
// Images (Known/Unknown)
case input.mimeType.startsWith('image/'):
const inputImageMimeSupported = mimeTypeIsSupportedImage(input.mimeType);
const visionModelMissing = !getDomainModelConfiguration('imageCaption', true, true);
converters.push({ id: 'image-resized-high', name: 'Image (high detail)', disabled: !inputImageMimeSupported });
converters.push({ id: 'image-resized-low', name: 'Image (low detail)', disabled: !inputImageMimeSupported });
converters.push({ id: 'image-original', name: 'Image (original quality)', disabled: !inputImageMimeSupported });
if (!inputImageMimeSupported)
converters.push({ id: 'image-to-default', name: `As Image (${DEFAULT_ADRAFT_IMAGE_MIMETYPE})` });
converters.push({ id: 'image-caption', name: 'Caption (Text)', disabled: visionModelMissing });
converters.push({ id: 'unhandled', name: 'No Image' });
converters.push({ id: 'image-ocr', name: 'Add Text (OCR)', isCheckbox: true });
break;
@@ -590,6 +594,35 @@ export async function attachmentPerformConversion(
}
break;
// image to caption
case 'image-caption':
if (!_expectBlob(input.data, 'Image captioning converter')) break;
try {
const abortController = new AbortController();
const captionText = await imageCaptionFromImageOrThrow(
input.data,
input.mimeType,
attachment.id,
abortController.signal,
progress => edit(attachment.id, { outputsConversionProgress: progress / 100 }),
);
// if we're here we shall have valid text
newFragments.push(createDocAttachmentFragment(
title,
caption + ' (Caption)',
DVMimeType.TextPlain,
createDMessageDataInlineText(captionText || 'This image could not be described', 'text/plain'),
refString,
DOCPART_DEFAULT_VERSION,
{ ...docMeta, srcOcrFrom: 'image-caption' },
));
} catch (error: any) {
console.log('[DEV] Failed to caption image:', error);
const errorText = `[Captioning failed: ${error?.message || String(error)}]`;
newFragments.push(createDocAttachmentFragment(title, caption + ' (Error)', DVMimeType.TextPlain, createDMessageDataInlineText(errorText, 'text/plain'), refString, DOCPART_DEFAULT_VERSION, { ...docMeta, srcOcrFrom: 'image-caption' }));
}
break;
// pdf to text
case 'pdf-text':
@@ -136,7 +136,7 @@ export type AttachmentDraftConverter = {
export type AttachmentDraftConverterType =
| 'text' | 'rich-text' | 'rich-text-cleaner' | 'rich-text-table'
| 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-to-default'
| 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-caption' | 'image-to-default'
| 'pdf-text' | 'pdf-images' | 'pdf-text-and-images'
| 'docx-to-html'
| 'url-page-text' | 'url-page-markdown' | 'url-page-html' | 'url-page-null' | 'url-page-image'
+1 -1
View File
@@ -119,7 +119,7 @@ type DMessageDocMeta = {
codeLanguage?: string;
srcFileName?: string;
srcFileSize?: number;
srcOcrFrom?: 'image' | 'pdf';
srcOcrFrom?: 'image' | 'pdf' | 'image-caption';
}
@@ -19,7 +19,7 @@ type ModelDomainSpec = {
};
export const ModelDomainsList: DModelDomainId[] = ['primaryChat', 'codeApply', 'fastUtil'] as const;
export const ModelDomainsList: DModelDomainId[] = ['primaryChat', 'codeApply', 'fastUtil', 'imageCaption'] as const;
export const ModelDomainsRegistry: Record<DModelDomainId, ModelDomainSpec> = {
primaryChat: {
@@ -48,6 +48,15 @@ export const ModelDomainsRegistry: Record<DModelDomainId, ModelDomainSpec> = {
autoStrategy: 'topVendorLowestCost',
requiredInterfaces: [LLM_IF_OAI_Fn], // NOTE: we do enforce this already, although this may not be correctly set for all vendors
},
imageCaption: {
label: 'Image Captioning',
confLabel: 'Vision',
confTooltip: 'Vision model for image captioning',
description: 'Describes images as text',
recommended: 'Qwen VL',
autoStrategy: 'topVendorTopLlm',
fallbackDomain: 'primaryChat',
},
};
@@ -17,4 +17,9 @@ export type DModelDomainId =
* Fast Utility model; must have function calling, but we won't enforce in the code for now until all LLMs are correctly identified as FC or not - used for quick responses and simple tasks
*/
'fastUtil'
|
/**
* Image Captioning model - used to generate detailed text descriptions of images before sending to primary chat model
*/
'imageCaption'
;
@@ -0,0 +1,83 @@
import type { AixAPIChatGenerate_Request } from '~/modules/aix/server/api/aix.wiretypes';
import { aixChatGenerateContent_DMessage_orThrow, aixCreateChatGenerateContext } from '~/modules/aix/client/aix.client';
import { convert_Blob_To_Base64 } from '~/common/util/blobUtils';
import { getDomainModelIdOrThrow } from '~/common/stores/llms/store-llms';
import { messageFragmentsReduceText } from '~/common/stores/chat/chat.message';
/**
 * System prompt for image captioning.
 *
 * Tuned to minimize information loss: the generated caption is the ONLY view of the
 * image that downstream text-only models will receive, so this asks the vision model
 * for an exhaustive, structured description (scene, visual detail, layout, verbatim
 * text transcription, atmosphere, and UI/diagram structure) rather than a short summary.
 */
const IMAGE_CAPTIONING_SYSTEM_PROMPT = `You are an expert at describing images in comprehensive detail. Your goal is to create a text description that captures as much visual information as possible, minimizing information loss for downstream AI models that will only see your text description.
Provide a detailed description covering:
1. **Overall Scene & Main Elements**: What is the primary subject? What is happening?
2. **Visual Details**: Colors, textures, patterns, materials, lighting, shadows, style (photographic, illustration, 3D render, etc.)
3. **Spatial Layout & Composition**: Positions and relationships between elements, foreground/background, arrangement
4. **Text Content**: If any text is visible, transcribe it exactly as it appears
5. **Context & Atmosphere**: Mood, setting, time of day, weather conditions (if applicable)
6. **Technical Aspects**: For UI/diagrams/charts - describe structure, labels, data, flow, connections, hierarchy
Be thorough but concise. Prioritize information that would be difficult to infer from a general description.`;
/**
 * Generates a detailed text description (caption) of an image using a vision model.
 *
 * Resolves the model assigned to the 'imageCaption' domain (with 'fastUtil' as a
 * secondary domain — presumably a fallback; confirm against getDomainModelIdOrThrow),
 * base64-encodes the image Blob, and streams a single-turn chat-generate request whose
 * system prompt asks for an exhaustive, loss-minimizing description.
 *
 * @param imageBlob     the image bytes to caption
 * @param imageMimeType mime type of the image (e.g. 'image/png')
 * @param contextRef    reference id (the attachment id at the call site) recorded in the AIX context
 * @param abortSignal   cancels the in-flight generation
 * @param onProgress    optional 0..100 progress callback: 0 (start), 20 (encoded), 40-90 (streaming), 100 (done)
 * @returns the trimmed caption text
 * @throws Error when no model is configured, encoding/generation fails, or the model returns an empty caption
 */
export async function imageCaptionFromImageOrThrow(
  imageBlob: Blob,
  imageMimeType: string,
  contextRef: string,
  abortSignal: AbortSignal,
  onProgress?: (progress: number) => void,
): Promise<string> {

  // resolve the vision model - throws when none is assigned/available
  const visionLlmId = getDomainModelIdOrThrow(['imageCaption', 'fastUtil'], false, true, 'aifn-image-caption');

  // encode the image for inline transport
  onProgress?.(0);
  const imageBase64 = await convert_Blob_To_Base64(imageBlob, 'aifn-image-caption');
  onProgress?.(20);

  // single-turn vision request: system prompt + user instruction + inline image
  const aixRequest: AixAPIChatGenerate_Request = {
    systemMessage: {
      parts: [{ pt: 'text', text: IMAGE_CAPTIONING_SYSTEM_PROMPT }],
    },
    chatSequence: [{
      role: 'user',
      parts: [
        { pt: 'text', text: 'Describe this image in comprehensive detail.' },
        { pt: 'inline_image', mimeType: imageMimeType as any, base64: imageBase64 },
      ],
    }],
  } as const;

  // stream the generation, nudging progress 40 -> 90 by 0.25 per streamed update
  // (the bar saturates after ~200 updates, a proxy for token count)
  let streamProgress = 40;
  const dMessage = await aixChatGenerateContent_DMessage_orThrow(
    visionLlmId,
    aixRequest,
    aixCreateChatGenerateContext('aifn-image-caption', contextRef),
    true, // streaming
    { abortSignal },
    (_update, isDone) => {
      if (isDone || !onProgress) return;
      streamProgress = Math.min(90, streamProgress + 0.25);
      onProgress(Math.round(streamProgress));
    },
  );
  onProgress?.(100);

  // flatten the returned message fragments into plain text
  const captionText = messageFragmentsReduceText(dMessage.fragments, '', false).trim();
  if (!captionText)
    throw new Error('Vision model returned empty caption');
  return captionText;
}
@@ -471,6 +471,7 @@ export namespace AixWire_API {
// streaming AI operations
'ai-diagram', // making a diagram - messageId
'ai-flattener', // flattening a thread - messageId of the first message
'aifn-image-caption', // generating image captions - attachmentId
'beam-gather', // fusing beam rays - fusionId
'beam-scatter', // scattering beam rays - rayId
'call', // having a phone conversation - messageId of the first message
@@ -79,7 +79,7 @@ export function LLMOptionsModal(props: { id: DLLMId, onClose: () => void }) {
// state - auto-open details if user has customized pricing or token limits
const [showDetails, setShowDetails] = React.useState(
!!llm?.userPricing || llm?.userContextTokens !== undefined || llm?.userMaxOutputTokens !== undefined
!!llm?.userPricing || llm?.userContextTokens !== undefined || llm?.userMaxOutputTokens !== undefined,
);
const domainAssignments = useModelDomains();
const { removeLLM, updateLLM, assignDomainModelId, resetLLMUserParameters } = llmsStoreActions();
@@ -258,7 +258,7 @@ export function LLMOptionsModal(props: { id: DLLMId, onClose: () => void }) {
<FormControl orientation='horizontal' sx={{ flexWrap: 'wrap', alignItems: 'center' }}>
<FormLabelStart title='Assignment' description='Default model' sx={{ minWidth: 80 }} />
<ButtonGroup orientation='horizontal' size='sm' variant='outlined'>
{ModelDomainsList.map(domainId => {
{ModelDomainsList.filter(dId => !['imageCaption'].includes(dId)).map(domainId => {
const domainSpec = ModelDomainsRegistry[domainId];
const domainModelId = domainAssignments[domainId]?.modelId;
const isActive = domainModelId === llm.id;