*Image Captioning with a dedicated (configurable) model. Fixes #862

This commit is contained in:
Enrico Ros
2025-11-05 14:11:35 -08:00
parent d7f3594a73
commit ffd76dc587
10 changed files with 144 additions and 5 deletions
@@ -98,6 +98,7 @@ const converterTypeToIconMap: { [key in AttachmentDraftConverterType]: React.Com
'image-resized-high': PhotoSizeSelectLargeOutlinedIcon,
'image-resized-low': PhotoSizeSelectSmallOutlinedIcon,
'image-to-default': ImageOutlinedIcon,
'image-caption': AbcIcon,
'image-ocr': AbcIcon,
'pdf-text': PictureAsPdfIcon,
'pdf-images': PermMediaOutlinedIcon,
@@ -129,6 +129,13 @@ export function AppChatSettingsAI() {
</>}
/>
<FormControlDomainModel
domainId='imageCaption'
title='Vision model'
description='Image captioning'
tooltip='Vision model used to generate text descriptions of images when the Caption (Text) attachment option is selected.'
/>
{labsDevMode && (
<FormControlDomainModel
domainId='primaryChat'
@@ -2,12 +2,14 @@ import type { FileWithHandle } from 'browser-fs-access';
import { callBrowseFetchPageOrThrow } from '~/modules/browse/browse.client';
import { extractYoutubeVideoIDFromURL } from '~/modules/youtube/youtube.utils';
import { imageCaptionFromImageOrThrow } from '~/modules/aifn/image-caption/imageCaptionFromImage';
import { youTubeGetVideoData } from '~/modules/youtube/useYouTubeTranscript';
import type { CommonImageMimeTypes } from '~/common/util/imageUtils';
import { Is } from '~/common/util/pwaUtils';
import { agiCustomId, agiUuid } from '~/common/util/idUtils';
import { convert_Base64DataURL_To_Base64WithMimeType, convert_Base64WithMimeType_To_Blob } from '~/common/util/blobUtils';
import { getDomainModelConfiguration } from '~/common/stores/llms/hooks/useModelDomain';
import { htmlTableToMarkdown } from '~/common/util/htmlTableToMarkdown';
import { humanReadableHyphenated } from '~/common/util/textUtils';
import { pdfToImageDataURLs, pdfToText } from '~/common/util/pdfUtils';
@@ -279,11 +281,13 @@ export function attachmentDefineConverters(source: AttachmentDraftSource, input:
// Images (Known/Unknown)
case input.mimeType.startsWith('image/'):
const inputImageMimeSupported = mimeTypeIsSupportedImage(input.mimeType);
const visionModelMissing = !getDomainModelConfiguration('imageCaption', true, true);
converters.push({ id: 'image-resized-high', name: 'Image (high detail)', disabled: !inputImageMimeSupported });
converters.push({ id: 'image-resized-low', name: 'Image (low detail)', disabled: !inputImageMimeSupported });
converters.push({ id: 'image-original', name: 'Image (original quality)', disabled: !inputImageMimeSupported });
if (!inputImageMimeSupported)
converters.push({ id: 'image-to-default', name: `As Image (${DEFAULT_ADRAFT_IMAGE_MIMETYPE})` });
converters.push({ id: 'image-caption', name: 'Caption (Text)', disabled: visionModelMissing });
converters.push({ id: 'unhandled', name: 'No Image' });
converters.push({ id: 'image-ocr', name: 'Add Text (OCR)', isCheckbox: true });
break;
@@ -590,6 +594,35 @@ export async function attachmentPerformConversion(
}
break;
// image to caption
case 'image-caption':
if (!_expectBlob(input.data, 'Image captioning converter')) break;
try {
const abortController = new AbortController();
const captionText = await imageCaptionFromImageOrThrow(
input.data,
input.mimeType,
attachment.id,
abortController.signal,
progress => edit(attachment.id, { outputsConversionProgress: progress / 100 }),
);
// if we're here we shall have valid text
newFragments.push(createDocAttachmentFragment(
title,
caption + ' (Caption)',
DVMimeType.TextPlain,
createDMessageDataInlineText(captionText || 'This image could not be described', 'text/plain'),
refString,
DOCPART_DEFAULT_VERSION,
{ ...docMeta, srcOcrFrom: 'image-caption' },
));
} catch (error: any) {
console.log('[DEV] Failed to caption image:', error);
const errorText = `[Captioning failed: ${error?.message || String(error)}]`;
newFragments.push(createDocAttachmentFragment(title, caption + ' (Error)', DVMimeType.TextPlain, createDMessageDataInlineText(errorText, 'text/plain'), refString, DOCPART_DEFAULT_VERSION, { ...docMeta, srcOcrFrom: 'image-caption' }));
}
break;
// pdf to text
case 'pdf-text':
@@ -136,7 +136,7 @@ export type AttachmentDraftConverter = {
export type AttachmentDraftConverterType =
| 'text' | 'rich-text' | 'rich-text-cleaner' | 'rich-text-table'
| 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-to-default'
| 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-caption' | 'image-to-default'
| 'pdf-text' | 'pdf-images' | 'pdf-text-and-images'
| 'docx-to-html'
| 'url-page-text' | 'url-page-markdown' | 'url-page-html' | 'url-page-null' | 'url-page-image'
+1 -1
View File
@@ -119,7 +119,7 @@ type DMessageDocMeta = {
codeLanguage?: string;
srcFileName?: string;
srcFileSize?: number;
srcOcrFrom?: 'image' | 'pdf';
srcOcrFrom?: 'image' | 'pdf' | 'image-caption';
}
@@ -19,7 +19,7 @@ type ModelDomainSpec = {
};
export const ModelDomainsList: DModelDomainId[] = ['primaryChat', 'codeApply', 'fastUtil'] as const;
export const ModelDomainsList: DModelDomainId[] = ['primaryChat', 'codeApply', 'fastUtil', 'imageCaption'] as const;
export const ModelDomainsRegistry: Record<DModelDomainId, ModelDomainSpec> = {
primaryChat: {
@@ -48,6 +48,15 @@ export const ModelDomainsRegistry: Record<DModelDomainId, ModelDomainSpec> = {
autoStrategy: 'topVendorLowestCost',
requiredInterfaces: [LLM_IF_OAI_Fn], // NOTE: we do enforce this already, although this may not be correctly set for all vendors
},
imageCaption: {
label: 'Image Captioning',
confLabel: 'Vision',
confTooltip: 'Vision model for image captioning',
description: 'Describes images as text',
recommended: 'Qwen VL',
autoStrategy: 'topVendorTopLlm',
fallbackDomain: 'primaryChat',
},
};
@@ -17,4 +17,9 @@ export type DModelDomainId =
* Fast Utility model; must have function calling, but we won't enforce in the code for now until all LLMs are correctly identified as FC or not - used for quick responses and simple tasks
*/
'fastUtil'
|
/**
* Image Captioning model - used to generate detailed text descriptions of images before sending to primary chat model
*/
'imageCaption'
;
@@ -0,0 +1,83 @@
import type { AixAPIChatGenerate_Request } from '~/modules/aix/server/api/aix.wiretypes';
import { aixChatGenerateContent_DMessage_orThrow, aixCreateChatGenerateContext } from '~/modules/aix/client/aix.client';
import { convert_Blob_To_Base64 } from '~/common/util/blobUtils';
import { getDomainModelIdOrThrow } from '~/common/stores/llms/store-llms';
import { messageFragmentsReduceText } from '~/common/stores/chat/chat.message';
/**
 * System prompt for image captioning.
 *
 * Tuned to minimize information loss: the generated caption is the ONLY view of the
 * image that downstream text-only models will receive, so this asks the vision model
 * for an exhaustive, structured description (scene, visual detail, layout, verbatim
 * text transcription, atmosphere, and UI/diagram structure) rather than a short summary.
 */
const IMAGE_CAPTIONING_SYSTEM_PROMPT = `You are an expert at describing images in comprehensive detail. Your goal is to create a text description that captures as much visual information as possible, minimizing information loss for downstream AI models that will only see your text description.
Provide a detailed description covering:
1. **Overall Scene & Main Elements**: What is the primary subject? What is happening?
2. **Visual Details**: Colors, textures, patterns, materials, lighting, shadows, style (photographic, illustration, 3D render, etc.)
3. **Spatial Layout & Composition**: Positions and relationships between elements, foreground/background, arrangement
4. **Text Content**: If any text is visible, transcribe it exactly as it appears
5. **Context & Atmosphere**: Mood, setting, time of day, weather conditions (if applicable)
6. **Technical Aspects**: For UI/diagrams/charts - describe structure, labels, data, flow, connections, hierarchy
Be thorough but concise. Prioritize information that would be difficult to infer from a general description.`;
/**
 * Generates a detailed text description (caption) of an image using a vision model.
 *
 * Resolves the model assigned to the 'imageCaption' domain (with 'fastUtil' as a
 * secondary domain — presumably a fallback; confirm against getDomainModelIdOrThrow),
 * base64-encodes the image Blob, and streams a single-turn chat-generate request whose
 * system prompt asks for an exhaustive, loss-minimizing description.
 *
 * @param imageBlob     the image bytes to caption
 * @param imageMimeType mime type of the image (e.g. 'image/png')
 * @param contextRef    reference id (the attachment id at the call site) recorded in the AIX context
 * @param abortSignal   cancels the in-flight generation
 * @param onProgress    optional 0..100 progress callback: 0 (start), 20 (encoded), 40-90 (streaming), 100 (done)
 * @returns the trimmed caption text
 * @throws Error when no model is configured, encoding/generation fails, or the model returns an empty caption
 */
export async function imageCaptionFromImageOrThrow(
  imageBlob: Blob,
  imageMimeType: string,
  contextRef: string,
  abortSignal: AbortSignal,
  onProgress?: (progress: number) => void,
): Promise<string> {

  // resolve the vision model - throws when none is assigned/available
  const visionLlmId = getDomainModelIdOrThrow(['imageCaption', 'fastUtil'], false, true, 'aifn-image-caption');

  // encode the image for inline transport
  onProgress?.(0);
  const imageBase64 = await convert_Blob_To_Base64(imageBlob, 'aifn-image-caption');
  onProgress?.(20);

  // single-turn vision request: system prompt + user instruction + inline image
  const aixRequest: AixAPIChatGenerate_Request = {
    systemMessage: {
      parts: [{ pt: 'text', text: IMAGE_CAPTIONING_SYSTEM_PROMPT }],
    },
    chatSequence: [{
      role: 'user',
      parts: [
        { pt: 'text', text: 'Describe this image in comprehensive detail.' },
        { pt: 'inline_image', mimeType: imageMimeType as any, base64: imageBase64 },
      ],
    }],
  } as const;

  // stream the generation, nudging progress 40 -> 90 by 0.25 per streamed update
  // (the bar saturates after ~200 updates, a proxy for token count)
  let streamProgress = 40;
  const dMessage = await aixChatGenerateContent_DMessage_orThrow(
    visionLlmId,
    aixRequest,
    aixCreateChatGenerateContext('aifn-image-caption', contextRef),
    true, // streaming
    { abortSignal },
    (_update, isDone) => {
      if (isDone || !onProgress) return;
      streamProgress = Math.min(90, streamProgress + 0.25);
      onProgress(Math.round(streamProgress));
    },
  );
  onProgress?.(100);

  // flatten the returned message fragments into plain text
  const captionText = messageFragmentsReduceText(dMessage.fragments, '', false).trim();
  if (!captionText)
    throw new Error('Vision model returned empty caption');
  return captionText;
}
@@ -471,6 +471,7 @@ export namespace AixWire_API {
// streaming AI operations
'ai-diagram', // making a diagram - messageId
'ai-flattener', // flattening a thread - messageId of the first message
'aifn-image-caption', // generating image captions - attachmentId
'beam-gather', // fusing beam rays - fusionId
'beam-scatter', // scattering beam rays - rayId
'call', // having a phone conversation - messageId of the first message
@@ -79,7 +79,7 @@ export function LLMOptionsModal(props: { id: DLLMId, onClose: () => void }) {
// state - auto-open details if user has customized pricing or token limits
const [showDetails, setShowDetails] = React.useState(
!!llm?.userPricing || llm?.userContextTokens !== undefined || llm?.userMaxOutputTokens !== undefined
!!llm?.userPricing || llm?.userContextTokens !== undefined || llm?.userMaxOutputTokens !== undefined,
);
const domainAssignments = useModelDomains();
const { removeLLM, updateLLM, assignDomainModelId, resetLLMUserParameters } = llmsStoreActions();
@@ -258,7 +258,7 @@ export function LLMOptionsModal(props: { id: DLLMId, onClose: () => void }) {
<FormControl orientation='horizontal' sx={{ flexWrap: 'wrap', alignItems: 'center' }}>
<FormLabelStart title='Assignment' description='Default model' sx={{ minWidth: 80 }} />
<ButtonGroup orientation='horizontal' size='sm' variant='outlined'>
{ModelDomainsList.map(domainId => {
{ModelDomainsList.filter(dId => !['imageCaption'].includes(dId)).map(domainId => {
const domainSpec = ModelDomainsRegistry[domainId];
const domainModelId = domainAssignments[domainId]?.modelId;
const isActive = domainModelId === llm.id;