diff --git a/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx b/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx
index ffdd15162..e2235c50c 100644
--- a/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx
+++ b/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx
@@ -98,6 +98,7 @@ const converterTypeToIconMap: { [key in AttachmentDraftConverterType]: React.Com
'image-resized-high': PhotoSizeSelectLargeOutlinedIcon,
'image-resized-low': PhotoSizeSelectSmallOutlinedIcon,
'image-to-default': ImageOutlinedIcon,
+ 'image-caption': AbcIcon,
'image-ocr': AbcIcon,
'pdf-text': PictureAsPdfIcon,
'pdf-images': PermMediaOutlinedIcon,
diff --git a/src/apps/settings-modal/AppChatSettingsAI.tsx b/src/apps/settings-modal/AppChatSettingsAI.tsx
index a454822e9..4b5390250 100644
--- a/src/apps/settings-modal/AppChatSettingsAI.tsx
+++ b/src/apps/settings-modal/AppChatSettingsAI.tsx
@@ -129,6 +129,13 @@ export function AppChatSettingsAI() {
>}
/>
+
+
{labsDevMode && (
edit(attachment.id, { outputsConversionProgress: progress / 100 }),
+ );
+ // if we're here we shall have valid text
+ newFragments.push(createDocAttachmentFragment(
+ title,
+ caption + ' (Caption)',
+ DVMimeType.TextPlain,
+ createDMessageDataInlineText(captionText || 'This image could not be described', 'text/plain'),
+ refString,
+ DOCPART_DEFAULT_VERSION,
+ { ...docMeta, srcOcrFrom: 'image-caption' },
+ ));
+ } catch (error: any) {
+ console.log('[DEV] Failed to caption image:', error);
+ const errorText = `[Captioning failed: ${error?.message || String(error)}]`;
+ newFragments.push(createDocAttachmentFragment(title, caption + ' (Error)', DVMimeType.TextPlain, createDMessageDataInlineText(errorText, 'text/plain'), refString, DOCPART_DEFAULT_VERSION, { ...docMeta, srcOcrFrom: 'image-caption' }));
+ }
+ break;
+
// pdf to text
case 'pdf-text':
diff --git a/src/common/attachment-drafts/attachment.types.ts b/src/common/attachment-drafts/attachment.types.ts
index ea18b184a..bc7cf8403 100644
--- a/src/common/attachment-drafts/attachment.types.ts
+++ b/src/common/attachment-drafts/attachment.types.ts
@@ -136,7 +136,7 @@ export type AttachmentDraftConverter = {
export type AttachmentDraftConverterType =
| 'text' | 'rich-text' | 'rich-text-cleaner' | 'rich-text-table'
- | 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-to-default'
+ | 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-caption' | 'image-to-default'
| 'pdf-text' | 'pdf-images' | 'pdf-text-and-images'
| 'docx-to-html'
| 'url-page-text' | 'url-page-markdown' | 'url-page-html' | 'url-page-null' | 'url-page-image'
diff --git a/src/common/stores/chat/chat.fragments.ts b/src/common/stores/chat/chat.fragments.ts
index 4d0b99eed..023125990 100644
--- a/src/common/stores/chat/chat.fragments.ts
+++ b/src/common/stores/chat/chat.fragments.ts
@@ -119,7 +119,7 @@ type DMessageDocMeta = {
codeLanguage?: string;
srcFileName?: string;
srcFileSize?: number;
- srcOcrFrom?: 'image' | 'pdf';
+ srcOcrFrom?: 'image' | 'pdf' | 'image-caption';
}
diff --git a/src/common/stores/llms/model.domains.registry.ts b/src/common/stores/llms/model.domains.registry.ts
index dc5f52fec..4c1f63589 100644
--- a/src/common/stores/llms/model.domains.registry.ts
+++ b/src/common/stores/llms/model.domains.registry.ts
@@ -19,7 +19,7 @@ type ModelDomainSpec = {
};
-export const ModelDomainsList: DModelDomainId[] = ['primaryChat', 'codeApply', 'fastUtil'] as const;
+export const ModelDomainsList: DModelDomainId[] = ['primaryChat', 'codeApply', 'fastUtil', 'imageCaption'] as const;
export const ModelDomainsRegistry: Record<DModelDomainId, ModelDomainSpec> = {
primaryChat: {
@@ -48,6 +48,15 @@ export const ModelDomainsRegistry: Record<DModelDomainId, ModelDomainSpec> = {
autoStrategy: 'topVendorLowestCost',
requiredInterfaces: [LLM_IF_OAI_Fn], // NOTE: we do enforce this already, although this may not be correctly set for all vendors
},
+ imageCaption: {
+ label: 'Image Captioning',
+ confLabel: 'Vision',
+ confTooltip: 'Vision model for image captioning',
+ description: 'Describes images as text',
+ recommended: 'Qwen VL',
+ autoStrategy: 'topVendorTopLlm',
+ fallbackDomain: 'primaryChat',
+ },
};
diff --git a/src/common/stores/llms/model.domains.types.ts b/src/common/stores/llms/model.domains.types.ts
index b5b422735..40e99c7b1 100644
--- a/src/common/stores/llms/model.domains.types.ts
+++ b/src/common/stores/llms/model.domains.types.ts
@@ -17,4 +17,9 @@ export type DModelDomainId =
* Fast Utility model; must have function calling, but we won't enforce in the code for now until all LLMs are correctly identified as FC or not - used for quick responses and simple tasks
*/
'fastUtil'
+ |
+ /**
+ * Image Captioning model - used to generate detailed text descriptions of images before sending to primary chat model
+ */
+ 'imageCaption'
;
\ No newline at end of file
diff --git a/src/modules/aifn/image-caption/imageCaptionFromImage.ts b/src/modules/aifn/image-caption/imageCaptionFromImage.ts
new file mode 100644
index 000000000..c8fc35450
--- /dev/null
+++ b/src/modules/aifn/image-caption/imageCaptionFromImage.ts
@@ -0,0 +1,83 @@
+import type { AixAPIChatGenerate_Request } from '~/modules/aix/server/api/aix.wiretypes';
+import { aixChatGenerateContent_DMessage_orThrow, aixCreateChatGenerateContext } from '~/modules/aix/client/aix.client';
+
+import { convert_Blob_To_Base64 } from '~/common/util/blobUtils';
+import { getDomainModelIdOrThrow } from '~/common/stores/llms/store-llms';
+import { messageFragmentsReduceText } from '~/common/stores/chat/chat.message';
+
+
+/**
+ * System prompt for image captioning - designed to minimize information loss
+ */
+const IMAGE_CAPTIONING_SYSTEM_PROMPT = `You are an expert at describing images in comprehensive detail. Your goal is to create a text description that captures as much visual information as possible, minimizing information loss for downstream AI models that will only see your text description.
+
+Provide a detailed description covering:
+1. **Overall Scene & Main Elements**: What is the primary subject? What is happening?
+2. **Visual Details**: Colors, textures, patterns, materials, lighting, shadows, style (photographic, illustration, 3D render, etc.)
+3. **Spatial Layout & Composition**: Positions and relationships between elements, foreground/background, arrangement
+4. **Text Content**: If any text is visible, transcribe it exactly as it appears
+5. **Context & Atmosphere**: Mood, setting, time of day, weather conditions (if applicable)
+6. **Technical Aspects**: For UI/diagrams/charts - describe structure, labels, data, flow, connections, hierarchy
+
+Be thorough but concise. Prioritize information that would be difficult to infer from a general description.`;
+
+
+/**
+ * Generate a detailed text description of an image using a vision model
+ * @returns The generated caption text
+ */
+export async function imageCaptionFromImageOrThrow(
+ imageBlob: Blob,
+ imageMimeType: string,
+ contextRef: string,
+ abortSignal: AbortSignal,
+ onProgress?: (progress: number) => void,
+): Promise<string> {
+
+ // can throw if no model
+ const llmId = getDomainModelIdOrThrow(['imageCaption', 'fastUtil'], false, true, 'aifn-image-caption');
+
+ // image -> base64
+ onProgress?.(0);
+ const base64Data = await convert_Blob_To_Base64(imageBlob, 'aifn-image-caption');
+ onProgress?.(20);
+
+ // create the vision request with inline image
+ const visionRequest: AixAPIChatGenerate_Request = {
+ systemMessage: {
+ parts: [{ pt: 'text', text: IMAGE_CAPTIONING_SYSTEM_PROMPT }],
+ },
+ chatSequence: [{
+ role: 'user',
+ parts: [
+ { pt: 'text', text: 'Describe this image in comprehensive detail.' },
+ { pt: 'inline_image', mimeType: imageMimeType as any, base64: base64Data },
+ ],
+ }],
+ } as const;
+
+ // call AIX with vision model
+ let lastProgress = 40;
+ const result = await aixChatGenerateContent_DMessage_orThrow(
+ llmId,
+ visionRequest,
+ aixCreateChatGenerateContext('aifn-image-caption', contextRef),
+ true, // streaming
+ { abortSignal },
+ (update, isDone) => {
+ // update progress during streaming, 40...90% with every token (so we assume at least 200 tokens)
+ if (!isDone && onProgress)
+ onProgress(Math.round(lastProgress = Math.min(90, lastProgress + 0.25)));
+ },
+ );
+
+ // extract text from the result
+ onProgress?.(100);
+
+ // concatenate all text fragments
+ const caption = messageFragmentsReduceText(result.fragments, '', false);
+ if (!caption.trim())
+ throw new Error('Vision model returned empty caption');
+
+ return caption.trim();
+}
diff --git a/src/modules/aix/server/api/aix.wiretypes.ts b/src/modules/aix/server/api/aix.wiretypes.ts
index 56b8d250a..69cccb695 100644
--- a/src/modules/aix/server/api/aix.wiretypes.ts
+++ b/src/modules/aix/server/api/aix.wiretypes.ts
@@ -471,6 +471,7 @@ export namespace AixWire_API {
// streaming AI operations
'ai-diagram', // making a diagram - messageId
'ai-flattener', // flattening a thread - messageId of the first message
+ 'aifn-image-caption', // generating image captions - attachmentId
'beam-gather', // fusing beam rays - fusionId
'beam-scatter', // scattering beam rays - rayId
'call', // having a phone conversation - messageId of the first message
diff --git a/src/modules/llms/models-modal/LLMOptionsModal.tsx b/src/modules/llms/models-modal/LLMOptionsModal.tsx
index cc8183eff..5ba5509bf 100644
--- a/src/modules/llms/models-modal/LLMOptionsModal.tsx
+++ b/src/modules/llms/models-modal/LLMOptionsModal.tsx
@@ -79,7 +79,7 @@ export function LLMOptionsModal(props: { id: DLLMId, onClose: () => void }) {
// state - auto-open details if user has customized pricing or token limits
const [showDetails, setShowDetails] = React.useState(
- !!llm?.userPricing || llm?.userContextTokens !== undefined || llm?.userMaxOutputTokens !== undefined
+ !!llm?.userPricing || llm?.userContextTokens !== undefined || llm?.userMaxOutputTokens !== undefined,
);
const domainAssignments = useModelDomains();
const { removeLLM, updateLLM, assignDomainModelId, resetLLMUserParameters } = llmsStoreActions();
@@ -258,7 +258,7 @@ export function LLMOptionsModal(props: { id: DLLMId, onClose: () => void }) {
- {ModelDomainsList.map(domainId => {
+ {ModelDomainsList.filter(dId => !['imageCaption'].includes(dId)).map(domainId => {
const domainSpec = ModelDomainsRegistry[domainId];
const domainModelId = domainAssignments[domainId]?.modelId;
const isActive = domainModelId === llm.id;