Attachments: PDF: add a PDF->OCR converter (via interim page images) and an 'Auto' mode (the new default)

The Auto mode tries plain Text extraction first; if the text is too sparse it renders the pages to Images and runs OCR on them, and finally falls back to attaching the pure images.
This commit is contained in:
Enrico Ros
2026-01-14 15:09:40 -08:00
parent 7aa9cb07b2
commit 88d39345a5
4 changed files with 250 additions and 113 deletions
@@ -1,7 +1,7 @@
import * as React from 'react';
import TimeAgo from 'react-timeago';
import { Box, Button, CircularProgress, ColorPaletteProp, Sheet, Typography, VariantProp } from '@mui/joy';
import { Box, Button, CircularProgress, ColorPaletteProp, ListItem, Sheet, Typography, VariantProp } from '@mui/joy';
import AbcIcon from '@mui/icons-material/Abc';
import CodeIcon from '@mui/icons-material/Code';
import DescriptionOutlinedIcon from '@mui/icons-material/DescriptionOutlined';
@@ -100,8 +100,10 @@ const converterTypeToIconMap: { [key in AttachmentDraftConverterType]: React.Com
'image-to-default': ImageOutlinedIcon,
'image-caption': AbcIcon,
'image-ocr': AbcIcon,
'pdf-auto': PictureAsPdfIcon,
'pdf-text': PictureAsPdfIcon,
'pdf-images': PermMediaOutlinedIcon,
'pdf-images-ocr': AbcIcon,
'pdf-text-and-images': PermMediaOutlinedIcon,
'docx-to-html': DescriptionOutlinedIcon,
'url-page-text': TextFieldsIcon, // was LanguageIcon
@@ -228,9 +230,10 @@ function LLMAttachmentButton(props: {
const isUnconvertible = !draft.converters.length;
const isOutputLoading = draft.outputsConverting;
const isOutputMissing = !draft.outputFragments.length;
const isOutputWarned = !!draft.outputWarnings?.length;
const hasLiveFiles = draft.outputFragments.some(_f => _f.liveFileId);
const showWarning = isUnconvertible || (isOutputMissing || !llmSupportsAllFragments);
const showWarning = isUnconvertible || (isOutputMissing || !llmSupportsAllFragments) || isOutputWarned;
// handlers
@@ -1,16 +1,15 @@
import * as React from 'react';
import type { SxProps } from '@mui/joy/styles/types';
import { Box, Checkbox, Chip, CircularProgress, LinearProgress, ListDivider, ListItem, ListItemDecorator, MenuItem, Radio, Typography } from '@mui/joy';
import AttachmentIcon from '@mui/icons-material/Attachment';
import { Box, Button, ButtonGroup, Checkbox, Chip, CircularProgress, Divider, LinearProgress, ListDivider, ListItem, ListItemDecorator, MenuItem, Radio, Typography } from '@mui/joy';
import ClearIcon from '@mui/icons-material/Clear';
import ContentCopyIcon from '@mui/icons-material/ContentCopy';
import DeleteForeverIcon from '@mui/icons-material/DeleteForever';
import DeleteOutlineIcon from '@mui/icons-material/DeleteOutline';
import ExpandLessIcon from '@mui/icons-material/ExpandLess';
import ExpandMoreIcon from '@mui/icons-material/ExpandMore';
import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
import KeyboardArrowLeftIcon from '@mui/icons-material/KeyboardArrowLeft';
import KeyboardArrowRightIcon from '@mui/icons-material/KeyboardArrowRight';
import ReadMoreIcon from '@mui/icons-material/ReadMore';
import VerticalAlignBottomIcon from '@mui/icons-material/VerticalAlignBottom';
import VisibilityIcon from '@mui/icons-material/Visibility';
@@ -18,6 +17,7 @@ import { CloseablePopup } from '~/common/components/CloseablePopup';
import { DMessageAttachmentFragment, DMessageDocPart, DMessageImageRefPart, isDocPart, isImageRefPart, isZyncAssetImageReferencePartWithLegacyDBlob } from '~/common/stores/chat/chat.fragments';
import { LiveFileIcon } from '~/common/livefile/liveFile.icons';
import { copyToClipboard } from '~/common/util/clipboardUtils';
import { humanReadableBytes } from '~/common/util/textUtils';
import { themeZIndexOverMobileDrawer } from '~/common/app.theme';
import { useUIPreferencesStore } from '~/common/stores/store-ui';
@@ -32,12 +32,20 @@ const DEFAULT_DETAILS_OPEN = true;
const SHOW_INLINING_OPERATIONS = false;
const indicatorSx = {
fontSize: '1rem',
} as const;
// const indicatorSx = {
// fontSize: '1rem',
// } as const;
//
// const indicatorGapSx: SxProps = {
// paddingLeft: '1.375rem',
// };
const indicatorGapSx: SxProps = {
paddingLeft: '1.375rem',
const actionButtonsSx: SxProps = {
ml: 'auto',
minHeight: 0,
borderRadius: '1rem',
backgroundColor: 'background.surface',
'& button': { fontSize: 'xs', fontWeight: 'md', py: 0, minWidth: 0, minHeight: 0 },
};
@@ -82,9 +90,10 @@ export function LLMAttachmentMenu(props: {
const isUnconvertible = !draft.converters.length;
const isOutputMissing = !draft.outputFragments.length;
const isOutputMultiple = draft.outputFragments.length > 1;
const isOutputWarned = !!draft.outputWarnings?.length;
const hasLiveFiles = draft.outputFragments.some(_f => _f.liveFileId);
const showWarning = isUnconvertible || isOutputMissing || !llmSupportsAllFragments;
const showWarning = isUnconvertible || isOutputMissing || !llmSupportsAllFragments || isOutputWarned;
// hooks
@@ -197,6 +206,17 @@ export function LLMAttachmentMenu(props: {
)}
</ListItem>
)}
{/* Auto-heuristics message, with explanation */}
{!!draft.outputsHeuristic?.isAuto && (
<ListItem color={draft.outputsHeuristic.isAuto ? 'primary' : undefined} sx={{ fontSize: 'sm', fontWeight: 'lg', mb: 0.5 }}>
{draft.outputsHeuristic.isAuto ? 'Auto: ' : ''}
{draft.outputsHeuristic.actualConverterId === 'pdf-text' && 'Text'}
{draft.outputsHeuristic.actualConverterId === 'pdf-images-ocr' && 'OCR'}
{draft.outputsHeuristic.actualConverterId === 'pdf-images' && 'Images'}
{draft.outputsHeuristic.actualConverterId === 'pdf-text-and-images' && 'Text + Images'}
{draft.outputsHeuristic.explain && ` (${draft.outputsHeuristic.explain})`}
</ListItem>
)}
{!isUnconvertible && draft.converters.map((c, idx) =>
<MenuItem
disabled={c.disabled || isConverting}
@@ -213,18 +233,13 @@ export function LLMAttachmentMenu(props: {
</ListItemDecorator>
{c.unsupported
? <Box>Unsupported 🤔 <Typography level='body-xs'>{c.name}</Typography></Box>
: c.name}
: (/* auto-converted */ draft.outputsHeuristic?.isAuto && c.id === draft.outputsHeuristic.actualConverterId)
? <Box component='span' sx={{ fontWeight: 'lg', color: 'primary.softColor' }}>{c.name}</Box>
: c.name}
</MenuItem>,
)}
{/*{!isUnconvertible && <ListDivider sx={{ mb: 0 }} />}*/}
{/* Auto-fallback notice (e.g., PDF with low text converted to images) */}
{draft.conversionFallback && (
<ListItem sx={{ fontSize: 'sm', color: 'success.softColor', fontStyle: 'italic', py: 0.5, px: 2 }}>
Auto: {draft.conversionFallback.reason}
</ListItem>
)}
{/* Progress indicator (mainly for OCRs of Images, PDFs, and PDF to Images) */}
{!!draft.outputsConversionProgress && draft.outputsConversionProgress < 1 && (
<LinearProgress determinate value={100 * draft.outputsConversionProgress} sx={{ mx: 1 }} />
@@ -268,11 +283,19 @@ export function LLMAttachmentMenu(props: {
<Typography color={isInputError ? 'danger' : 'warning'} level='title-sm'>
{isInputError ? 'Loading Issue' : 'Warning'}
</Typography>
{/* Only show 1 warning, excluding lower priorities */}
{isInputError ? <div>{draft.inputError}</div>
: isUnconvertible ? <div>Attachments of type {draft.input?.mimeType} are not supported yet. You can request this on GitHub.</div>
: isOutputMissing ? <div>File not supported. Please try another format.</div>
: !llmSupportsAllFragments ? <div>May not be compatible with the current model. Please try another format.</div>
: <>Unknown warning</>}
: draft.outputWarnings?.length ? '' /* printed below */
: <>Unknown warning</>}
{/* Explicit output warnings */}
{!!draft.outputWarnings?.length && draft.outputWarnings.map((w, widx) =>
<Box key={'ow-' + widx} sx={{ fontSize: 'sm', color: 'warning.softColor', py: 1 }}> {w}</Box>)
}
</Box>
</MenuItem>
</Box>
@@ -301,24 +324,24 @@ export function LLMAttachmentMenu(props: {
Details
</Typography>
) : (
<Box sx={{ my: 0.5 }}>
<Box sx={{ my: 1 }}>
{/* <- inputs */}
{showInputs && !!draftInput && (
<Typography level='body-sm' textColor='text.primary' startDecorator={<AttachmentIcon sx={indicatorSx} />}>
{draftInput.mimeType}{typeof draftInput.dataSize === 'number' ? ` · ${draftInput.dataSize.toLocaleString()} bytes` : ''}
<Typography level='body-sm' textColor='success.softColor'>
Input: {draftInput.mimeType}{typeof draftInput.dataSize === 'number' ? ` · ${humanReadableBytes(draftInput.dataSize)}` : ''}
</Typography>
)}
{showInputs && !!draftInput?.altMimeType && (
<Typography level='body-sm' sx={indicatorGapSx}>
{draftInput.altMimeType} · {draftInput.altData?.length.toLocaleString()}
<Typography level='body-sm' textColor='success.softColor'>
Input: {draftInput.altMimeType}{!draftInput.altData?.length ? '' : ` · ${humanReadableBytes(draftInput.altData.length)}`}
</Typography>
)}
{showInputs && !!draftInput?.urlImage && (
<Typography level='body-sm' sx={indicatorGapSx}>
{draftInput.urlImage.mimeType} · {draftInput.urlImage.width} x {draftInput.urlImage.height} · {draftInput.urlImage.imgDataUrl?.length.toLocaleString()}
{' · '}
<Chip component='span' size='sm' color='primary' variant='outlined' startDecorator={<VisibilityIcon />} onClick={(event) => {
<Typography level='body-sm' textColor='success.softColor' sx={{ display: 'flex', alignItems: 'center' }}>
Input: {draftInput.urlImage.mimeType} · {draftInput.urlImage.width}x{draftInput.urlImage.height}{!draftInput.urlImage.imgDataUrl?.length ? '' : ` · ${humanReadableBytes(draftInput.urlImage.imgDataUrl.length)}`}
&nbsp;
<Chip component='span' size='sm' color='success' variant='soft' startDecorator={<VisibilityIcon />} onClick={(event) => {
if (draftInput?.urlImage?.imgDataUrl) {
// Invoke the viewer but with a virtual 'temp' part description to see this preview image
handleViewImageRefPart(event, {
@@ -332,8 +355,8 @@ export function LLMAttachmentMenu(props: {
height: draftInput.urlImage.height || undefined,
});
}
}}>
view
}} sx={{ ml: 'auto' }}>
view input
</Chip>
</Typography>
)}
@@ -342,45 +365,79 @@ export function LLMAttachmentMenu(props: {
{/* Converters: {draft.converters.map(((converter, idx) => ` ${converter.id}${converter.isActive ? '*' : ''}`)).join(', ')}*/}
{/*</Typography>*/}
{/* Downward arrow */}
<Divider color='success'>
<KeyboardArrowDownIcon color='success' />
</Divider>
{/* -> Outputs */}
<Box sx={{ mt: 1 }}>
<Box>
{isOutputMissing ? (
<Typography level='body-sm' startDecorator={<ReadMoreIcon sx={indicatorSx} />}>...</Typography>
<Typography level='body-sm' color={isConverting ? 'primary' : 'danger'}>{isConverting ? '...' : '... nothing ...'}</Typography>
) : (
draft.outputFragments.map(({ part }, index) => {
if (isDocPart(part)) {
return (
<Typography key={index} level='body-sm' sx={{ color: 'text.primary' }} startDecorator={<ReadMoreIcon sx={indicatorSx} />}>
<span>{part.data.mimeType /* part.type: big-agi type, not source mime */} · {part.data.text.length.toLocaleString()} bytes ·&nbsp;</span>
<Chip component='span' size='sm' color='primary' variant='outlined' startDecorator={<VisibilityIcon />} onClick={(event) => handleViewDocPart(event, part)}>
view
</Chip>
<Chip component='span' size='sm' color='success' variant='outlined' startDecorator={<ContentCopyIcon />} onClick={(event) => handleCopyToClipboard(event, part.data.text)}>
copy
</Chip>
<Typography key={index} component='div' level='body-sm' textColor='primary.softColor' sx={{ display: 'flex', alignItems: 'center' }}>
<span>{part.data.mimeType /* part.type: big-agi type, not source mime */} · {humanReadableBytes(part.data.text.length)} &nbsp;</span>
{/*<Chip component='span' size='sm' color='primary' variant='outlined' startDecorator={<VisibilityIcon />} onClick={(event) => handleViewDocPart(event, part)} sx={{ ml: 'auto' }}>*/}
{/* view*/}
{/*</Chip>*/}
{/*<Chip component='span' size='sm' color='primary' variant='outlined' startDecorator={<ContentCopyIcon />} onClick={(event) => handleCopyToClipboard(event, part.data.text)}>*/}
{/* copy*/}
{/*</Chip>*/}
<ButtonGroup size='sm' color='primary' variant='outlined' sx={actionButtonsSx}>
<Button startDecorator={<VisibilityIcon sx={{ fontSize: 'md' }} />} onClick={(event) => handleViewDocPart(event, part)}>
view
</Button>
<Button onClick={(event) => handleCopyToClipboard(event, part.data.text)}/* endDecorator={<ContentCopyIcon />} */>
copy
</Button>
</ButtonGroup>
</Typography>
);
} else if (isZyncAssetImageReferencePartWithLegacyDBlob(part) || isImageRefPart(part)) {
// Unified Image Reference handling (both Zync Asset References with legacy fallback and legacy image_ref)
const legacyImageRefPart = isZyncAssetImageReferencePartWithLegacyDBlob(part) ? part._legacyImageRefPart! : part;
const { dataRef, width, height } = legacyImageRefPart;
const resolution = width && height ? `${width} x ${height}` : 'no resolution';
const resolution = width && height ? `${width}x${height}` : 'no resolution';
const mime = dataRef.reftype === 'dblob' ? dataRef.mimeType : 'unknown image';
return (
<Typography key={index} level='body-sm' sx={{ color: 'text.primary' }} startDecorator={<ReadMoreIcon sx={indicatorSx} />}>
<span>{mime /*.replace('image/', 'img: ')*/} · {resolution} · {dataRef.reftype === 'dblob' ? (dataRef.bytesSize?.toLocaleString() || 'no size') : '(remote)'} ·&nbsp;</span>
<Chip component='span' size={isOutputMultiple ? 'sm' : 'md'} color='primary' variant='outlined' startDecorator={<VisibilityIcon />}
onClick={(event) => handleViewImageRefPart(event, legacyImageRefPart)}>
view
</Chip>
{isOutputMultiple && <Chip component='span' size={isOutputMultiple ? 'sm' : 'md'} color='danger' variant='outlined' startDecorator={<DeleteForeverIcon />} onClick={(event) => handleDeleteOutputFragment(event, index)}>
del
</Chip>}
<Typography key={index} component='div' level='body-sm' textColor='primary.softColor' sx={{ display: 'flex', alignItems: 'center' }}>
<span>{mime /*.replace('image/', 'img: ')*/} · {resolution} · {
dataRef.reftype !== 'dblob' ? '(remote)'
: !dataRef.bytesSize ? 'no size'
: humanReadableBytes(dataRef.bytesSize)} &nbsp;</span>
{/*<Chip component='span' size={isOutputMultiple ? 'sm' : 'md'} color='primary' variant='outlined' startDecorator={<VisibilityIcon />}*/}
{/* onClick={(event) => handleViewImageRefPart(event, legacyImageRefPart)}>*/}
{/* view*/}
{/*</Chip>*/}
{/*{isOutputMultiple && <Chip component='span' size={isOutputMultiple ? 'sm' : 'md'} color='danger' variant='outlined' startDecorator={<DeleteForeverIcon />} onClick={(event) => handleDeleteOutputFragment(event, index)}>*/}
{/* del*/}
{/*</Chip>}*/}
<ButtonGroup size='sm' color='primary' variant='outlined' sx={actionButtonsSx}>
<Button
startDecorator={<VisibilityIcon sx={{ fontSize: 'md' }} />}
onClick={(event) => handleViewImageRefPart(event, legacyImageRefPart)}
>
view
</Button>
{isOutputMultiple && (
<Button
color='warning'
endDecorator={<DeleteOutlineIcon sx={{ fontSize: 'md' }} />}
onClick={(event) => handleDeleteOutputFragment(event, index)}
// sx={{ width: 48 }}
>
del
</Button>
)}
</ButtonGroup>
</Typography>
);
} else {
return (
<Typography key={index} level='body-sm' sx={{ color: 'text.primary' }} startDecorator={<ReadMoreIcon sx={indicatorSx} />}>
<Typography key={index} level='body-sm' textColor='primary.softColor'>
{(part as DMessageAttachmentFragment['part']).pt}: (other)
</Typography>
);
@@ -388,8 +445,8 @@ export function LLMAttachmentMenu(props: {
})
)}
{!!llmTokenCountApprox && (
<Typography level='body-xs' mt={0.5} sx={indicatorGapSx}>
~{llmTokenCountApprox.toLocaleString()} tokens
<Typography level='body-xs' mt={0.5} textColor='primary.softColor'>
&nbsp; ~ {llmTokenCountApprox.toLocaleString()} tokens
</Typography>
)}
</Box>
@@ -11,6 +11,7 @@ import { convert_Base64DataURL_To_Base64WithMimeType, convert_Base64WithMimeType
import { getDomainModelConfiguration } from '~/common/stores/llms/hooks/useModelDomain';
import { htmlTableToMarkdown } from '~/common/util/htmlTableToMarkdown';
import { humanReadableHyphenated } from '~/common/util/textUtils';
import { ocrImageWithProgress, ocrPdfPagesWithProgress } from '~/common/util/ocrUtils';
import { pdfToImageDataURLs, pdfToText } from '~/common/util/pdfUtils';
import { createDMessageDataInlineText, createDocAttachmentFragment, DMessageAttachmentFragment, DMessageDataInline, DMessageDocPart, DVMimeType, isContentOrAttachmentFragment, isDocPart, specialContentPartToDocAttachmentFragment } from '~/common/stores/chat/chat.fragments';
@@ -28,7 +29,8 @@ const ENABLE_TEXT_AND_IMAGES = false; // [PROD] ?
const DOCPART_DEFAULT_VERSION = 1;
// PDF text extraction quality thresholds
const PDF_LOW_TEXT_THRESHOLD = 100; // chars per page - below this, consider the PDF as scanned/image-based
const IMAGE_LOW_TEXT_THRESHOLD = 80; // chars per image - below this, consider the image as low-text (photo-like) rather than document-like
const PDF_LOW_TEXT_THRESHOLD = 160; // chars per page - below this, consider the PDF as scanned/image-based
const PDF_FALLBACK_MAX_IMAGES = 32; // max pages to convert to images when auto-falling back (to respect LLM limits)
@@ -288,16 +290,18 @@ export function attachmentDefineConverters(source: AttachmentDraftSource, input:
converters.push({ id: 'image-original', name: 'Image (original quality)', disabled: !inputImageMimeSupported });
if (!inputImageMimeSupported)
converters.push({ id: 'image-to-default', name: `As Image (${PLATFORM_IMAGE_MIMETYPE})` });
converters.push({ id: 'image-caption', name: 'Caption (Text)', disabled: visionModelMissing });
converters.push({ id: 'image-caption', name: 'AI Caption (Text)', disabled: visionModelMissing });
converters.push({ id: 'unhandled', name: 'No Image' });
converters.push({ id: 'image-ocr', name: 'Add Text (OCR)', isCheckbox: true });
break;
// PDF
case mimeTypeIsPDF(input.mimeType):
converters.push({ id: 'pdf-text', name: 'PDF To Text', isActive: !autoAddImages || undefined });
converters.push({ id: 'pdf-images', name: 'PDF To Images' });
converters.push({ id: 'pdf-text-and-images', name: 'PDF Text & Images (best)', isActive: autoAddImages });
converters.push({ id: 'pdf-auto', name: 'Auto', isActive: !autoAddImages });
converters.push({ id: 'pdf-text', name: 'PDF Text' });
converters.push({ id: 'pdf-images-ocr', name: 'PDF -> OCR (for scans)' });
converters.push({ id: 'pdf-images', name: 'PDF -> Images' });
converters.push({ id: 'pdf-text-and-images', name: 'PDF -> Text + Images', isActive: autoAddImages });
break;
// DOCX
@@ -483,6 +487,8 @@ export async function attachmentPerformConversion(
edit(attachment.id, {
outputsConverting: true,
outputsConversionProgress: null,
outputWarnings: undefined,
outputsHeuristic: undefined,
});
// apply converter to the input
@@ -575,23 +581,14 @@ export async function attachmentPerformConversion(
case 'image-ocr':
if (!_expectBlob(input.data, 'Image OCR converter')) break;
try {
let lastProgress = -1;
const { recognize } = await import('tesseract.js');
const result = await recognize(input.data, undefined, {
errorHandler: e => console.error(e),
logger: (message) => {
if (message.status === 'recognizing text') {
if (message.progress > lastProgress + 0.01) {
lastProgress = message.progress;
edit(attachment.id, { outputsConversionProgress: lastProgress });
}
}
},
});
const imageText = result.data.text;
// Image -> OCR -> Inline text doc
const imageText = await ocrImageWithProgress(input.data, (progress) => edit(attachment.id, { outputsConversionProgress: progress }));
newFragments.push(createDocAttachmentFragment(title, caption, DVMimeType.TextPlain, createDMessageDataInlineText(imageText, 'text/plain'), refString, DOCPART_DEFAULT_VERSION, { ...docMeta, srcOcrFrom: 'image' }));
// warn if very little text was extracted (likely a photo/diagram rather than text)
if (imageText.trim().length < IMAGE_LOW_TEXT_THRESHOLD)
edit(attachment.id, { outputWarnings: ['Very little text extracted - this image may not contain readable text.'] });
} catch (error) {
console.error(error);
console.error('[Image OCR Error]', error);
}
break;
@@ -620,65 +617,111 @@ export async function attachmentPerformConversion(
} catch (error: any) {
console.log('[DEV] Failed to caption image:', error);
const errorText = `[Captioning failed: ${error?.message || String(error)}]`;
edit(attachment.id, { outputWarnings: [errorText] });
newFragments.push(createDocAttachmentFragment(title, caption + ' (Error)', DVMimeType.TextPlain, createDMessageDataInlineText(errorText, 'text/plain'), refString, DOCPART_DEFAULT_VERSION, { ...docMeta, srcOcrFrom: 'image-caption' }));
}
break;
// pdf to text (with auto-fallback to images for scanned/image-based PDFs)
case 'pdf-text':
if (!_expectBlob(input.data, 'PDF text converter')) break;
// pdf-auto: intelligent conversion with fallback chain (text → OCR → images)
case 'pdf-auto':
if (!_expectBlob(input.data, 'PDF auto converter')) break;
try {
// Convert Blob to ArrayBuffer for PDF.js
// Phase 1: Try text extraction (0-20% progress)
const pdfArrayBuffer = await input.data.arrayBuffer();
// Extract text with quality metadata
// [pdf-text] Extract text with quality metadata
const pdfTextResult = await pdfToText(pdfArrayBuffer, (progress: number) => {
// Reserve 0-30% for text extraction attempt, 30-100% for potential image fallback
edit(attachment.id, { outputsConversionProgress: progress * 0.3 });
// Reserve 0-20% for text extraction attempt, 20-100% for potential image fallback
edit(attachment.id, { outputsConversionProgress: progress * 0.2 });
});
// Check text density to detect scanned/image-based PDFs
if (pdfTextResult.avgCharsPerPage >= PDF_LOW_TEXT_THRESHOLD) {
// Good text extraction - use it
newFragments.push(createDocAttachmentFragment(title, caption, DVMimeType.TextPlain, createDMessageDataInlineText(pdfTextResult.text, 'text/plain'), refString, DOCPART_DEFAULT_VERSION, { ...docMeta, srcOcrFrom: 'pdf' }));
edit(attachment.id, {
outputsHeuristic: { isAuto: true, actualConverterId: 'pdf-text', explain: `${pdfTextResult.avgCharsPerPage.toFixed(0)} chars/page` },
});
} else {
// Low text density detected - auto-fallback to images
console.log(`[PDF] Low text density (${pdfTextResult.avgCharsPerPage.toFixed(0)} chars/page across ${pdfTextResult.pageCount} pages), falling back to images`);
// Low text density - try OCR
// console.log(`[PDF Auto] Low text density (${pdfTextResult.avgCharsPerPage.toFixed(0)} chars/page), trying OCR...`);
// Need fresh ArrayBuffer for image rendering (previous one may be consumed)
// [pdf-images] Phase 2: Render pages to images (20-40% progress)
const pdfArrayBufferForImages = await input.data.arrayBuffer();
const imageDataURLs = await pdfToImageDataURLs(pdfArrayBufferForImages, PLATFORM_IMAGE_MIMETYPE, PDF_IMAGE_QUALITY, PDF_IMAGE_PAGE_SCALE, (progress) => {
edit(attachment.id, { outputsConversionProgress: 0.3 + progress * 0.7 }); // 30-100%
edit(attachment.id, { outputsConversionProgress: 0.2 + progress * 0.2 });
});
// Limit pages to respect LLM image limits
const pagesToAttach = Math.min(imageDataURLs.length, PDF_FALLBACK_MAX_IMAGES);
for (let i = 0; i < pagesToAttach; i++) {
const pdfPageImage = imageDataURLs[i];
const pdfPageImageF = await imageDataToImageAttachmentFragmentViaDBlob(pdfPageImage.mimeType, pdfPageImage.base64Data, source, `${title} (pg. ${i + 1})`, caption, false, false);
if (pdfPageImageF)
newFragments.push(pdfPageImageF);
// Limit pages for OCR (performance)
const pagesToProcess = Math.min(imageDataURLs.length, PDF_FALLBACK_MAX_IMAGES);
const imagesToOcr = imageDataURLs.slice(0, pagesToProcess);
// Phase 3: Try OCR on rendered pages (40-90% progress)
try {
// [pdf-images-ocr] OCR the images
const ocrResult = await ocrPdfPagesWithProgress(imagesToOcr, (progress) => {
edit(attachment.id, { outputsConversionProgress: 0.4 + progress * 0.5 });
});
if (ocrResult.avgCharsPerPage >= PDF_LOW_TEXT_THRESHOLD) {
// OCR yielded good text - use it
newFragments.push(createDocAttachmentFragment(title, caption, DVMimeType.TextPlain, createDMessageDataInlineText(ocrResult.text, 'text/plain'), refString, DOCPART_DEFAULT_VERSION, { ...docMeta, srcOcrFrom: 'pdf' }));
const truncNote = pdfTextResult.pageCount > pagesToProcess ? ` (${pagesToProcess}/${pdfTextResult.pageCount} pages)` : '';
edit(attachment.id, {
outputsHeuristic: { isAuto: true, actualConverterId: 'pdf-images-ocr', explain: /*OCR extracted */`${ocrResult.avgCharsPerPage.toFixed(0)} chars/page${truncNote}` },
});
} else {
// OCR also yielded poor results - fall back to images
// console.log(`[PDF Auto] OCR also sparse (${ocrResult.avgCharsPerPage.toFixed(0)} chars/page), falling back to images`);
for (let i = 0; i < pagesToProcess; i++) {
const pdfPageImage = imageDataURLs[i];
const pdfPageImageF = await imageDataToImageAttachmentFragmentViaDBlob(pdfPageImage.mimeType, pdfPageImage.base64Data, source, `${title} (pg. ${i + 1})`, caption, false, false);
if (pdfPageImageF)
newFragments.push(pdfPageImageF);
}
const truncNote = pdfTextResult.pageCount > pagesToProcess ? ` (${pagesToProcess}/${pdfTextResult.pageCount} pages)` : '';
edit(attachment.id, {
outputsHeuristic: { isAuto: true, actualConverterId: 'pdf-images', explain: `not a text page${truncNote}` },
});
}
} catch (ocrError) {
// OCR failed - fall back to images
console.warn('[PDF Auto] OCR failed, falling back to images:', ocrError);
for (let i = 0; i < pagesToProcess; i++) {
const pdfPageImage = imageDataURLs[i];
const pdfPageImageF = await imageDataToImageAttachmentFragmentViaDBlob(pdfPageImage.mimeType, pdfPageImage.base64Data, source, `${title} (pg. ${i + 1})`, caption, false, false);
if (pdfPageImageF)
newFragments.push(pdfPageImageF);
}
edit(attachment.id, {
outputsHeuristic: { isAuto: true, actualConverterId: 'pdf-images', explain: 'OCR failed, attached as images' },
});
}
// Set fallback info for UI display
const truncatedNote = pdfTextResult.pageCount > PDF_FALLBACK_MAX_IMAGES
? ` (first ${pagesToAttach} of ${pdfTextResult.pageCount} pages)`
: '';
edit(attachment.id, {
conversionFallback: {
from: 'pdf-text',
to: 'pdf-images',
reason: `Low text density (${pdfTextResult.avgCharsPerPage.toFixed(0)} chars/page) - converted to images${truncatedNote}`,
},
});
}
} catch (error) {
console.error('Error in PDF auto conversion:', error);
}
break;
// pdf-text: strict text extraction, no fallback (honors user choice)
case 'pdf-text':
if (!_expectBlob(input.data, 'PDF text converter')) break;
try {
const pdfTextResult = await pdfToText(await input.data.arrayBuffer(), progress => edit(attachment.id, { outputsConversionProgress: progress }));
// Always output text, even if sparse (user explicitly chose this)
newFragments.push(createDocAttachmentFragment(title, caption, DVMimeType.TextPlain, createDMessageDataInlineText(pdfTextResult.text, 'text/plain'), refString, DOCPART_DEFAULT_VERSION, { ...docMeta, srcOcrFrom: 'pdf' }));
edit(attachment.id, {
// warn if very little text was extracted (likely a scanned PDF)
outputWarnings: pdfTextResult.avgCharsPerPage >= 20 ? undefined : ['Very little text extracted - this PDF may be scanned. Try "Auto" or "OCR (for scans)" mode.'],
outputsHeuristic: { isAuto: false, actualConverterId: 'pdf-text', explain: `${pdfTextResult.avgCharsPerPage.toFixed(0)} chars/page` },
});
} catch (error) {
console.error('Error in PDF text extraction:', error);
}
break;
// pdf to images
// pdf-images: render all pages as images (honors user choice)
case 'pdf-images':
if (!_expectBlob(input.data, 'PDF images converter')) break;
// Convert Blob to ArrayBuffer for PDF.js
@@ -691,11 +734,39 @@ export async function attachmentPerformConversion(
if (pdfPageImageF)
newFragments.push(pdfPageImageF);
}
edit(attachment.id, {
outputsHeuristic: { isAuto: false, actualConverterId: 'pdf-images', explain: `${imageDataURLs.length} pages` },
});
} catch (error) {
console.error('Error converting PDF to images:', error);
}
break;
// pdf-images-ocr: force OCR on all pages (for scanned documents)
case 'pdf-images-ocr':
if (!_expectBlob(input.data, 'PDF OCR converter')) break;
try {
// Render pages to images (0-40% progress)
const imageDataURLs = await pdfToImageDataURLs(await input.data.arrayBuffer(), PLATFORM_IMAGE_MIMETYPE, PDF_IMAGE_QUALITY, PDF_IMAGE_PAGE_SCALE, (progress) => {
edit(attachment.id, { outputsConversionProgress: progress * 0.4 });
});
// OCR all pages (40-100% progress)
const ocrResult = await ocrPdfPagesWithProgress(imageDataURLs, (progress) => {
edit(attachment.id, { outputsConversionProgress: 0.4 + progress * 0.6 });
});
newFragments.push(createDocAttachmentFragment(title, caption, DVMimeType.TextPlain, createDMessageDataInlineText(ocrResult.text, 'text/plain'), refString, DOCPART_DEFAULT_VERSION, { ...docMeta, srcOcrFrom: 'pdf' }));
edit(attachment.id, {
// warn if very little text was extracted (likely a scanned PDF)
outputWarnings: ocrResult.avgCharsPerPage >= 20 ? undefined : ['Very little text extracted via OCR - this PDF may contain mostly images/diagrams.'],
outputsHeuristic: { isAuto: false, actualConverterId: 'pdf-images-ocr', explain: `${ocrResult.avgCharsPerPage.toFixed(0)} chars/page from ${ocrResult.pageCount} pages` },
});
} catch (error) {
console.error('Error in PDF OCR:', error);
}
break;
// pdf to text and images
case 'pdf-text-and-images':
if (!_expectBlob(input.data, 'PDF text and images converter')) break;
@@ -725,10 +796,13 @@ export async function attachmentPerformConversion(
const textFragment = createDocAttachmentFragment(title, caption, DVMimeType.TextPlain, createDMessageDataInlineText(pdfTextResult.text, 'text/plain'), refString, DOCPART_DEFAULT_VERSION, { ...docMeta, srcOcrFrom: 'pdf' });
newFragments.push(textFragment);
}
// Note: if text is sparse, images are still attached (user explicitly chose text+images), so we don't consider density here
// Note: if text is sparse, images are still attached (user explicitly chose text+images)
// Add the text fragment first, then the image fragments
newFragments.push(...imageFragments);
edit(attachment.id, {
outputsHeuristic: { isAuto: false, actualConverterId: 'pdf-text-and-images', explain: `${pdfTextResult.avgCharsPerPage.toFixed(0)} chars/page + ${imageFragments.length} images` },
});
} catch (error) {
console.error('Error converting PDF to text and images:', error);
}
@@ -24,11 +24,14 @@ export type AttachmentDraft = {
outputsConversionProgress: number | null;
outputFragments: DMessageAttachmentFragment[];
// Auto-fallback info: set when a converter auto-switches due to quality issues (e.g., PDF with low text density)
conversionFallback?: {
from: AttachmentDraftConverterType;
to: AttachmentDraftConverterType;
reason: string;
// Warnings for poor conversions (e.g. scanned PDF with text extraction rather than OCR)
outputWarnings?: string[];
// Tracks what method was actually used (especially for Auto mode)
outputsHeuristic?: {
isAuto: boolean;
actualConverterId: AttachmentDraftConverterType;
explain?: string; // e.g., "42 chars/page detected"
};
// metadata: {
@@ -144,7 +147,7 @@ export type AttachmentDraftConverter = {
export type AttachmentDraftConverterType =
| 'text' | 'rich-text' | 'rich-text-cleaner' | 'rich-text-table'
| 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-caption' | 'image-to-default'
| 'pdf-text' | 'pdf-images' | 'pdf-text-and-images'
| 'pdf-auto' | 'pdf-text' | 'pdf-images' | 'pdf-images-ocr' | 'pdf-text-and-images'
| 'docx-to-html'
| 'url-page-text' | 'url-page-markdown' | 'url-page-html' | 'url-page-null' | 'url-page-image'
| 'youtube-transcript' | 'youtube-transcript-simple'