mirror of
https://github.com/enricoros/big-AGI.git
synced 2026-05-11 06:00:15 -07:00
23a3185696
Cleaned up the code a bit (thx gpt4), with: - dynamic module load: JS is chunked up and deferred to PDF loading, which improves all the sessions where PDFs are not loaded - unified path for drag/drop and 'load file' (shall call it "magic drop"), so PDFs are text'ified upon drag/drop as well - fixed "not being able to load the same doc twice" (thx gpt4) - using minified worker, as it's loaded dynamically, we save ~50% bandwidth
39 lines
1.5 KiB
TypeScript
39 lines
1.5 KiB
TypeScript
/**
 * Type guard: reports whether `item` carries a string-valued `str` property.
 *
 * Safe to call with any value: `null`, `undefined` and primitives yield `false`
 * instead of raising the TypeError that the `in` operator throws for non-objects.
 *
 * @param item - The value to inspect
 * @returns `true` when `item.str` exists and is a string
 */
function isTextItem(item: unknown): item is { str: string } {
  // `!= null` rejects both null and undefined; reading a property off any other value never throws
  return item != null && typeof (item as { str?: unknown }).str === 'string';
}
|
|
|
|
/**
 * Extracts the text of a PDF file.
 *
 * The 'pdfjs-dist' library is pulled in with a dynamic `import()` the first time
 * this function runs, so the bundler can place it in a separate chunk that is only
 * downloaded when a PDF is actually processed. The library is large, and keeping it
 * out of the main bundle keeps startup fast.
 *
 * Output format: for every page, the text items are joined with a single space and
 * the page is terminated with a newline; pages are concatenated in order.
 *
 * @param file - The PDF file to extract text from
 * @returns The concatenated text of all pages
 * @throws Rejects when any awaited step fails (reading the file, loading the
 *         document, fetching a page or its text content, or releasing the document)
 */
export const extractPdfText = async (file: File): Promise<string> => {
  // Dynamically import the 'pdfjs-dist' library so it is loaded on first use only
  const { getDocument, GlobalWorkerOptions } = await import('pdfjs-dist');

  // Set the worker script path (minified build, served from the app's static files)
  GlobalWorkerOptions.workerSrc = '/workers/pdf.worker.min.js';

  const arrayBuffer = await file.arrayBuffer();
  const loadingTask = getDocument(arrayBuffer);

  try {
    const pdf = await loadingTask.promise;
    let text = '';

    // PDF page numbers are 1-based
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const content = await page.getTextContent();

      // Keep only the items that carry text; the guard narrows the item type,
      // so no type assertion is needed to read `str`
      const strings: string[] = [];
      for (const item of content.items) {
        if (isTextItem(item)) strings.push(item.str);
      }

      text += strings.join(' ') + '\n';
    }

    return text;
  } finally {
    // Release the document once done, on success and on failure alike,
    // so repeated extractions do not accumulate loaded documents
    await loadingTask.destroy();
  }
};