pdfjs: image generation (just in case)

This commit is contained in:
Enrico Ros
2024-05-10 02:46:28 -07:00
parent b8aaa4bb42
commit e4e7ac260a
4 changed files with 88 additions and 10 deletions
@@ -153,7 +153,11 @@ export function AttachmentMenu(props: {
{/* Converters: {aConverters.map(((converter, idx) => ` ${converter.id}${(idx === aConverterIdx) ? '*' : ''}`)).join(', ')}*/}
{/*</Typography>*/}
<Typography level='body-xs'>
🡒 {isOutputMissing ? 'empty' : aOutputs.map(output => `${output.type}, ${output.type === 'text-block' ? output.text.length.toLocaleString() : '(base64 image)'} bytes`).join(' · ')}
🡒 {isOutputMissing ? 'empty' : aOutputs.map(output => `${output.type}, ${output.type === 'text-block'
? output.text.length.toLocaleString()
: output.type === 'image-part'
? output.base64Url.length.toLocaleString()
: '(other)'} bytes`).join(' · ')}
</Typography>
{!!tokenCountApprox && <Typography level='body-xs'>
🡒 {tokenCountApprox.toLocaleString()} tokens
@@ -2,7 +2,7 @@ import { callBrowseFetchPage } from '~/modules/browse/browse.client';
import { createBase36Uid } from '~/common/util/textUtils';
import { htmlTableToMarkdown } from '~/common/util/htmlTableToMarkdown';
import { pdfToText } from '~/common/util/pdfUtils';
import { pdfToImageDataURLs, pdfToText } from '~/common/util/pdfUtils';
import type { Attachment, AttachmentConverter, AttachmentId, AttachmentInput, AttachmentSource } from './store-attachments';
import type { ComposerOutputMultiPart } from '../composer.types';
@@ -297,7 +297,7 @@ export async function attachmentPerformConversion(attachment: Readonly<Attachmen
case 'pdf-text':
if (!(input.data instanceof ArrayBuffer)) {
console.log('Expected ArrayBuffer for PDF converter, got:', typeof input.data);
console.log('Expected ArrayBuffer for PDF text converter, got:', typeof input.data);
break;
}
// duplicate the ArrayBuffer to avoid mutation
@@ -312,7 +312,29 @@ export async function attachmentPerformConversion(attachment: Readonly<Attachmen
break;
case 'pdf-images':
// TODO: extract all pages as individual images
if (!(input.data instanceof ArrayBuffer)) {
console.log('Expected ArrayBuffer for PDF images converter, got:', typeof input.data);
break;
}
// duplicate the ArrayBuffer to avoid mutation
const pdfData2 = new Uint8Array(input.data.slice(0));
try {
const imageDataURLs = await pdfToImageDataURLs(pdfData2);
imageDataURLs.forEach((pdfImg, index) => {
outputs.push({
type: 'image-part',
base64Url: pdfImg.base64Url,
metadata: {
title: `Page ${index + 1}`,
width: pdfImg.width,
height: pdfImg.height,
},
collapsible: false,
});
});
} catch (error) {
console.error('Error converting PDF to images:', error);
}
break;
case 'image':
@@ -9,6 +9,13 @@ export type ComposerOutputPart = {
// TODO: not implemented yet
type: 'image-part',
base64Url: string,
metadata: {
title?: string,
generatedBy?: string,
altText?: string,
width?: number,
height?: number,
},
collapsible: false,
};
+51 -6
View File
@@ -10,12 +10,7 @@
* @param pdfBuffer The content of a PDF file
*/
export async function pdfToText(pdfBuffer: ArrayBuffer): Promise<string> {
// Dynamically import the 'pdfjs-dist' library [nextjs]
const { getDocument, GlobalWorkerOptions } = await import('pdfjs-dist');
// Set the worker script path
GlobalWorkerOptions.workerSrc = '/workers/pdf.worker.min.mjs';
const { getDocument } = await dynamicImportPdfJs();
const pdf = await getDocument(pdfBuffer).promise;
const textPages: string[] = []; // Initialize an array to hold text from all pages
@@ -52,6 +47,56 @@ export async function pdfToText(pdfBuffer: ArrayBuffer): Promise<string> {
return textPages.join('\n\n'); // Join all the page texts at the end
}
/** One rendered PDF page: the image as a data URL, the scale it was rendered at, and the image's width and height. */
type PdfPageImage = { base64Url: string, scale: number, width: number, height: number };
/**
 * Renders all pages of a PDF to JPEG images, returned as data URLs in page order.
 *
 * Pages are rendered one at a time onto a temporary canvas created through `document`,
 * so this can only run where a DOM is available.
 *
 * @param pdfBuffer The content of a PDF file
 * @param scale The scale factor for the image resolution (default 1.5 for moderate quality)
 * @returns One entry per page: the JPEG data URL, the scale used, and the pixel size of the encoded image
 * @throws Error if a 2D canvas context cannot be obtained; errors from loading or rendering the PDF propagate
 */
export async function pdfToImageDataURLs(pdfBuffer: ArrayBuffer, scale = 1.5): Promise<PdfPageImage[]> {
  const { getDocument } = await dynamicImportPdfJs();
  const pdf = await getDocument({ data: pdfBuffer }).promise;
  const images: PdfPageImage[] = [];

  try {
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const viewport = page.getViewport({ scale });

      const canvas = document.createElement('canvas');
      canvas.width = viewport.width;
      canvas.height = viewport.height;

      // getContext may return null (e.g. unsupported or exhausted canvas resources): fail loudly
      const context = canvas.getContext('2d');
      if (!context)
        throw new Error(`Could not get a 2D canvas context to render PDF page ${pageNumber}`);

      try {
        await page.render({
          canvasContext: context,
          viewport,
        }).promise;

        images.push({
          base64Url: canvas.toDataURL('image/jpeg'),
          scale,
          // report the canvas size: the canvas truncates fractional viewport sizes to whole
          // pixels, and that is the actual size of the encoded image
          width: canvas.width,
          height: canvas.height,
        });
      } finally {
        // release per-page resources and the canvas backing store, even if rendering failed
        page.cleanup();
        canvas.width = 0;
        canvas.height = 0;
      }
    }
  } finally {
    // always tear down the document (and its worker), otherwise every call leaks them
    await pdf.destroy();
  }

  return images;
}
/**
 * Loads the 'pdfjs-dist' library through a dynamic import [nextjs], sets the
 * global worker script path, and hands back the `getDocument` entry point.
 */
async function dynamicImportPdfJs() {
  const pdfjs = await import('pdfjs-dist');

  // The worker script path is (re)assigned on every call
  pdfjs.GlobalWorkerOptions.workerSrc = '/workers/pdf.worker.min.mjs';

  const getDocument = pdfjs.getDocument;
  return { getDocument };
}
// Type guard to check if an item has a 'str' property
function isTextItem(item: any): item is { str: string } {
return 'str' in item && typeof item.str === 'string';