mirror of
https://github.com/enricoros/big-AGI.git
synced 2026-05-10 21:50:14 -07:00
Image Captioning with a dedicated (configurable) model. Fixes #862
This commit is contained in:
@@ -98,6 +98,7 @@ const converterTypeToIconMap: { [key in AttachmentDraftConverterType]: React.Com
|
||||
'image-resized-high': PhotoSizeSelectLargeOutlinedIcon,
|
||||
'image-resized-low': PhotoSizeSelectSmallOutlinedIcon,
|
||||
'image-to-default': ImageOutlinedIcon,
|
||||
'image-caption': AbcIcon,
|
||||
'image-ocr': AbcIcon,
|
||||
'pdf-text': PictureAsPdfIcon,
|
||||
'pdf-images': PermMediaOutlinedIcon,
|
||||
|
||||
@@ -129,6 +129,13 @@ export function AppChatSettingsAI() {
|
||||
</>}
|
||||
/>
|
||||
|
||||
<FormControlDomainModel
|
||||
domainId='imageCaption'
|
||||
title='Vision model'
|
||||
description='Image captioning'
|
||||
tooltip='Vision model used to generate text descriptions of images when the Caption (Text) attachment option is selected.'
|
||||
/>
|
||||
|
||||
{labsDevMode && (
|
||||
<FormControlDomainModel
|
||||
domainId='primaryChat'
|
||||
|
||||
@@ -2,12 +2,14 @@ import type { FileWithHandle } from 'browser-fs-access';
|
||||
|
||||
import { callBrowseFetchPageOrThrow } from '~/modules/browse/browse.client';
|
||||
import { extractYoutubeVideoIDFromURL } from '~/modules/youtube/youtube.utils';
|
||||
import { imageCaptionFromImageOrThrow } from '~/modules/aifn/image-caption/imageCaptionFromImage';
|
||||
import { youTubeGetVideoData } from '~/modules/youtube/useYouTubeTranscript';
|
||||
|
||||
import type { CommonImageMimeTypes } from '~/common/util/imageUtils';
|
||||
import { Is } from '~/common/util/pwaUtils';
|
||||
import { agiCustomId, agiUuid } from '~/common/util/idUtils';
|
||||
import { convert_Base64DataURL_To_Base64WithMimeType, convert_Base64WithMimeType_To_Blob } from '~/common/util/blobUtils';
|
||||
import { getDomainModelConfiguration } from '~/common/stores/llms/hooks/useModelDomain';
|
||||
import { htmlTableToMarkdown } from '~/common/util/htmlTableToMarkdown';
|
||||
import { humanReadableHyphenated } from '~/common/util/textUtils';
|
||||
import { pdfToImageDataURLs, pdfToText } from '~/common/util/pdfUtils';
|
||||
@@ -279,11 +281,13 @@ export function attachmentDefineConverters(source: AttachmentDraftSource, input:
|
||||
// Images (Known/Unknown)
|
||||
case input.mimeType.startsWith('image/'):
|
||||
const inputImageMimeSupported = mimeTypeIsSupportedImage(input.mimeType);
|
||||
const visionModelMissing = !getDomainModelConfiguration('imageCaption', true, true);
|
||||
converters.push({ id: 'image-resized-high', name: 'Image (high detail)', disabled: !inputImageMimeSupported });
|
||||
converters.push({ id: 'image-resized-low', name: 'Image (low detail)', disabled: !inputImageMimeSupported });
|
||||
converters.push({ id: 'image-original', name: 'Image (original quality)', disabled: !inputImageMimeSupported });
|
||||
if (!inputImageMimeSupported)
|
||||
converters.push({ id: 'image-to-default', name: `As Image (${DEFAULT_ADRAFT_IMAGE_MIMETYPE})` });
|
||||
converters.push({ id: 'image-caption', name: 'Caption (Text)', disabled: visionModelMissing });
|
||||
converters.push({ id: 'unhandled', name: 'No Image' });
|
||||
converters.push({ id: 'image-ocr', name: 'Add Text (OCR)', isCheckbox: true });
|
||||
break;
|
||||
@@ -590,6 +594,35 @@ export async function attachmentPerformConversion(
|
||||
}
|
||||
break;
|
||||
|
||||
// image to caption
|
||||
case 'image-caption':
|
||||
if (!_expectBlob(input.data, 'Image captioning converter')) break;
|
||||
try {
|
||||
const abortController = new AbortController();
|
||||
const captionText = await imageCaptionFromImageOrThrow(
|
||||
input.data,
|
||||
input.mimeType,
|
||||
attachment.id,
|
||||
abortController.signal,
|
||||
progress => edit(attachment.id, { outputsConversionProgress: progress / 100 }),
|
||||
);
|
||||
// if we're here we shall have valid text
|
||||
newFragments.push(createDocAttachmentFragment(
|
||||
title,
|
||||
caption + ' (Caption)',
|
||||
DVMimeType.TextPlain,
|
||||
createDMessageDataInlineText(captionText || 'This image could not be described', 'text/plain'),
|
||||
refString,
|
||||
DOCPART_DEFAULT_VERSION,
|
||||
{ ...docMeta, srcOcrFrom: 'image-caption' },
|
||||
));
|
||||
} catch (error: any) {
|
||||
console.log('[DEV] Failed to caption image:', error);
|
||||
const errorText = `[Captioning failed: ${error?.message || String(error)}]`;
|
||||
newFragments.push(createDocAttachmentFragment(title, caption + ' (Error)', DVMimeType.TextPlain, createDMessageDataInlineText(errorText, 'text/plain'), refString, DOCPART_DEFAULT_VERSION, { ...docMeta, srcOcrFrom: 'image-caption' }));
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
// pdf to text
|
||||
case 'pdf-text':
|
||||
|
||||
@@ -136,7 +136,7 @@ export type AttachmentDraftConverter = {
|
||||
|
||||
export type AttachmentDraftConverterType =
|
||||
| 'text' | 'rich-text' | 'rich-text-cleaner' | 'rich-text-table'
|
||||
| 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-to-default'
|
||||
| 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-caption' | 'image-to-default'
|
||||
| 'pdf-text' | 'pdf-images' | 'pdf-text-and-images'
|
||||
| 'docx-to-html'
|
||||
| 'url-page-text' | 'url-page-markdown' | 'url-page-html' | 'url-page-null' | 'url-page-image'
|
||||
|
||||
@@ -119,7 +119,7 @@ type DMessageDocMeta = {
|
||||
codeLanguage?: string;
|
||||
srcFileName?: string;
|
||||
srcFileSize?: number;
|
||||
srcOcrFrom?: 'image' | 'pdf';
|
||||
srcOcrFrom?: 'image' | 'pdf' | 'image-caption';
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ type ModelDomainSpec = {
|
||||
};
|
||||
|
||||
|
||||
export const ModelDomainsList: DModelDomainId[] = ['primaryChat', 'codeApply', 'fastUtil'] as const;
|
||||
export const ModelDomainsList: DModelDomainId[] = ['primaryChat', 'codeApply', 'fastUtil', 'imageCaption'] as const;
|
||||
|
||||
export const ModelDomainsRegistry: Record<DModelDomainId, ModelDomainSpec> = {
|
||||
primaryChat: {
|
||||
@@ -48,6 +48,15 @@ export const ModelDomainsRegistry: Record<DModelDomainId, ModelDomainSpec> = {
|
||||
autoStrategy: 'topVendorLowestCost',
|
||||
requiredInterfaces: [LLM_IF_OAI_Fn], // NOTE: we do enforce this already, although this may not be correctly set for all vendors
|
||||
},
|
||||
imageCaption: {
|
||||
label: 'Image Captioning',
|
||||
confLabel: 'Vision',
|
||||
confTooltip: 'Vision model for image captioning',
|
||||
description: 'Describes images as text',
|
||||
recommended: 'Qwen VL',
|
||||
autoStrategy: 'topVendorTopLlm',
|
||||
fallbackDomain: 'primaryChat',
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -17,4 +17,9 @@ export type DModelDomainId =
|
||||
* Fast Utility model; must have function calling, but we won't enforce in the code for now until all LLMs are correctly identified as FC or not - used for quick responses and simple tasks
|
||||
*/
|
||||
'fastUtil'
|
||||
|
|
||||
/**
|
||||
* Image Captioning model - used to generate detailed text descriptions of images before sending to primary chat model
|
||||
*/
|
||||
'imageCaption'
|
||||
;
|
||||
@@ -0,0 +1,83 @@
|
||||
import type { AixAPIChatGenerate_Request } from '~/modules/aix/server/api/aix.wiretypes';
|
||||
import { aixChatGenerateContent_DMessage_orThrow, aixCreateChatGenerateContext } from '~/modules/aix/client/aix.client';
|
||||
|
||||
import { convert_Blob_To_Base64 } from '~/common/util/blobUtils';
|
||||
import { getDomainModelIdOrThrow } from '~/common/stores/llms/store-llms';
|
||||
import { messageFragmentsReduceText } from '~/common/stores/chat/chat.message';
|
||||
|
||||
|
||||
/**
|
||||
* System prompt for image captioning - designed to minimize information loss
* for readers of the caption that never see the image itself.
*
* It is sent as the text part of the system message in the request built by
* imageCaptionFromImageOrThrow; the user turn of that request carries the image.
* The prompt asks for six areas of coverage: scene, visual details, layout,
* verbatim text transcription, context/atmosphere, and technical structure
* (UI/diagrams/charts).
|
||||
*/
|
||||
const IMAGE_CAPTIONING_SYSTEM_PROMPT = `You are an expert at describing images in comprehensive detail. Your goal is to create a text description that captures as much visual information as possible, minimizing information loss for downstream AI models that will only see your text description.
|
||||
|
||||
Provide a detailed description covering:
|
||||
1. **Overall Scene & Main Elements**: What is the primary subject? What is happening?
|
||||
2. **Visual Details**: Colors, textures, patterns, materials, lighting, shadows, style (photographic, illustration, 3D render, etc.)
|
||||
3. **Spatial Layout & Composition**: Positions and relationships between elements, foreground/background, arrangement
|
||||
4. **Text Content**: If any text is visible, transcribe it exactly as it appears
|
||||
5. **Context & Atmosphere**: Mood, setting, time of day, weather conditions (if applicable)
|
||||
6. **Technical Aspects**: For UI/diagrams/charts - describe structure, labels, data, flow, connections, hierarchy
|
||||
|
||||
Be thorough but concise. Prioritize information that would be difficult to infer from a general description.`;
|
||||
|
||||
|
||||
/**
 * Ask a vision model for a detailed textual description of an image.
 *
 * Steps: resolve the model for the 'imageCaption' domain (with 'fastUtil' listed
 * as the second candidate), base64-encode the blob, send a streaming chat request
 * that carries the captioning system prompt plus the inline image, then reduce the
 * returned fragments to a single trimmed string.
 *
 * Progress reported through `onProgress` (0..100): 0 before encoding, 20 after
 * encoding, then +0.25 per streaming update starting from 40 and capped at 90,
 * and finally 100 once the model call has returned.
 *
 * @param imageBlob - the image bytes to describe
 * @param imageMimeType - mime type forwarded on the inline image part
 * @param contextRef - reference passed to the AIX generate context (the AIX context list documents it as the attachment id)
 * @param abortSignal - forwarded to the AIX call so the caller can cancel it
 * @param onProgress - optional progress listener, see above
 * @returns the trimmed, non-empty caption text
 * @throws Error 'Vision model returned empty caption' when the reduced text is blank;
 *   model resolution can throw when no model is available, and the blob encoding and
 *   AIX call are presumably able to throw as well (not visible from here)
 */
export async function imageCaptionFromImageOrThrow(
  imageBlob: Blob,
  imageMimeType: string,
  contextRef: string,
  abortSignal: AbortSignal,
  onProgress?: (progress: number) => void,
): Promise<string> {

  // resolve the model up-front - this can throw if no model is available
  const visionLlmId = getDomainModelIdOrThrow(['imageCaption', 'fastUtil'], false, true, 'aifn-image-caption');

  // encode the image as base64, bracketing the step with 0% and 20%
  onProgress?.(0);
  const imageBase64 = await convert_Blob_To_Base64(imageBlob, 'aifn-image-caption');
  onProgress?.(20);

  // single user turn: a short instruction followed by the inline image
  const captionRequest: AixAPIChatGenerate_Request = {
    systemMessage: {
      parts: [{ pt: 'text', text: IMAGE_CAPTIONING_SYSTEM_PROMPT }],
    },
    chatSequence: [{
      role: 'user',
      parts: [
        { pt: 'text', text: 'Describe this image in comprehensive detail.' },
        { pt: 'inline_image', mimeType: imageMimeType as any, base64: imageBase64 },
      ],
    }],
  } as const;

  // streaming progress creeps from 40 towards 90, a quarter point per update
  let streamingProgress = 40;
  const generated = await aixChatGenerateContent_DMessage_orThrow(
    visionLlmId,
    captionRequest,
    aixCreateChatGenerateContext('aifn-image-caption', contextRef),
    true, // streaming
    { abortSignal },
    (_update, isDone) => {
      // the final update is not counted; nothing to do without a listener
      if (isDone || !onProgress) return;
      streamingProgress = Math.min(90, streamingProgress + 0.25);
      onProgress(Math.round(streamingProgress));
    },
  );

  // the model call has returned: report completion before validating the text
  onProgress?.(100);

  // join all text fragments and reject blank output
  const captionText = messageFragmentsReduceText(generated.fragments, '', false).trim();
  if (!captionText)
    throw new Error('Vision model returned empty caption');

  return captionText;
}
|
||||
@@ -471,6 +471,7 @@ export namespace AixWire_API {
|
||||
// streaming AI operations
|
||||
'ai-diagram', // making a diagram - messageId
|
||||
'ai-flattener', // flattening a thread - messageId of the first message
|
||||
'aifn-image-caption', // generating image captions - attachmentId
|
||||
'beam-gather', // fusing beam rays - fusionId
|
||||
'beam-scatter', // scattering beam rays - rayId
|
||||
'call', // having a phone conversation - messageId of the first message
|
||||
|
||||
@@ -79,7 +79,7 @@ export function LLMOptionsModal(props: { id: DLLMId, onClose: () => void }) {
|
||||
|
||||
// state - auto-open details if user has customized pricing or token limits
|
||||
const [showDetails, setShowDetails] = React.useState(
|
||||
!!llm?.userPricing || llm?.userContextTokens !== undefined || llm?.userMaxOutputTokens !== undefined
|
||||
!!llm?.userPricing || llm?.userContextTokens !== undefined || llm?.userMaxOutputTokens !== undefined,
|
||||
);
|
||||
const domainAssignments = useModelDomains();
|
||||
const { removeLLM, updateLLM, assignDomainModelId, resetLLMUserParameters } = llmsStoreActions();
|
||||
@@ -258,7 +258,7 @@ export function LLMOptionsModal(props: { id: DLLMId, onClose: () => void }) {
|
||||
<FormControl orientation='horizontal' sx={{ flexWrap: 'wrap', alignItems: 'center' }}>
|
||||
<FormLabelStart title='Assignment' description='Default model' sx={{ minWidth: 80 }} />
|
||||
<ButtonGroup orientation='horizontal' size='sm' variant='outlined'>
|
||||
{ModelDomainsList.map(domainId => {
|
||||
{ModelDomainsList.filter(dId => !['imageCaption'].includes(dId)).map(domainId => {
|
||||
const domainSpec = ModelDomainsRegistry[domainId];
|
||||
const domainModelId = domainAssignments[domainId]?.modelId;
|
||||
const isActive = domainModelId === llm.id;
|
||||
|
||||
Reference in New Issue
Block a user