From 1ebb04bae6eda45ba95807ca51cd4cec1fdd8d78 Mon Sep 17 00:00:00 2001 From: Enrico Ros Date: Tue, 9 Jul 2024 07:35:47 -0700 Subject: [PATCH] Support Docx. #588 --- package-lock.json | 177 +++++++++++++++++- package.json | 1 + .../llmattachments/LLMAttachmentItem.tsx | 6 +- .../attachment-drafts/attachment.pipeline.ts | 33 +++- .../attachment-drafts/attachment.types.ts | 3 +- .../file-converters/DocxToMarkdown.ts | 22 +++ 6 files changed, 237 insertions(+), 5 deletions(-) create mode 100644 src/common/attachment-drafts/file-converters/DocxToMarkdown.ts diff --git a/package-lock.json b/package-lock.json index d160e0570..bcbc1a6d8 100644 --- a/package-lock.json +++ b/package-lock.json @@ -34,6 +34,7 @@ "dexie-react-hooks": "^1.1.7", "eventsource-parser": "^1.1.2", "idb-keyval": "^6.2.1", + "mammoth": "^1.8.0", "nanoid": "^5.0.7", "next": "~14.2.4", "nprogress": "^0.2.0", @@ -2222,6 +2223,14 @@ } } }, + "node_modules/@xmldom/xmldom": { + "version": "0.8.10", + "resolved": "https://registry.npmjs.org/@xmldom/xmldom/-/xmldom-0.8.10.tgz", + "integrity": "sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==", + "engines": { + "node": ">=10.0.0" + } + }, "node_modules/abbrev": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz", @@ -2604,6 +2613,30 @@ "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", "devOptional": true }, + "node_modules/base64-js": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", + "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ] + }, + "node_modules/bluebird": { + 
"version": "3.4.7", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.4.7.tgz", + "integrity": "sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==" + }, "node_modules/bmp-js": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/bmp-js/-/bmp-js-0.1.0.tgz", @@ -3259,6 +3292,11 @@ "react": ">=16" } }, + "node_modules/dingbat-to-unicode": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dingbat-to-unicode/-/dingbat-to-unicode-1.0.1.tgz", + "integrity": "sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==" + }, "node_modules/dir-glob": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/dir-glob/-/dir-glob-3.0.1.tgz", @@ -3343,6 +3381,14 @@ "url": "https://github.com/fb55/domutils?sponsor=1" } }, + "node_modules/duck": { + "version": "0.1.12", + "resolved": "https://registry.npmjs.org/duck/-/duck-0.1.12.tgz", + "integrity": "sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg==", + "dependencies": { + "underscore": "^1.13.1" + } + }, "node_modules/duplexer": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/duplexer/-/duplexer-0.1.2.tgz", @@ -4932,6 +4978,11 @@ "node": ">= 4" } }, + "node_modules/immediate": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz", + "integrity": "sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==" + }, "node_modules/import-fresh": { "version": "3.3.0", "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.0.tgz", @@ -5570,6 +5621,49 @@ "node": ">=4.0" } }, + "node_modules/jszip": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", + "integrity": "sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==", + "dependencies": { + "lie": "~3.3.0", + "pako": "~1.0.2", 
+ "readable-stream": "~2.3.6", + "setimmediate": "^1.0.5" + } + }, + "node_modules/jszip/node_modules/isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==" + }, + "node_modules/jszip/node_modules/readable-stream": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", + "integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==", + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "node_modules/jszip/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" + }, + "node_modules/jszip/node_modules/string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "dependencies": { + "safe-buffer": "~5.1.0" + } + }, "node_modules/katex": { "version": "0.16.11", "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.11.tgz", @@ -5625,6 +5719,14 @@ "node": ">= 0.8.0" } }, + "node_modules/lie": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/lie/-/lie-3.3.0.tgz", + "integrity": "sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==", + "dependencies": { + "immediate": "~3.0.5" + } + }, "node_modules/lines-and-columns": { "version": "1.2.4", "resolved": 
"https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", @@ -5676,6 +5778,16 @@ "loose-envify": "cli.js" } }, + "node_modules/lop": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/lop/-/lop-0.4.1.tgz", + "integrity": "sha512-9xyho9why2A2tzm5aIcMWKvzqKsnxrf9B5I+8O30olh6lQU8PH978LqZoI4++37RBgS1Em5i54v1TFs/3wnmXQ==", + "dependencies": { + "duck": "^0.1.12", + "option": "~0.2.1", + "underscore": "^1.13.1" + } + }, "node_modules/lru-cache": { "version": "10.3.1", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.3.1.tgz", @@ -5709,6 +5821,37 @@ "semver": "bin/semver.js" } }, + "node_modules/mammoth": { + "version": "1.8.0", + "resolved": "https://registry.npmjs.org/mammoth/-/mammoth-1.8.0.tgz", + "integrity": "sha512-pJNfxSk9IEGVpau+tsZFz22ofjUsl2mnA5eT8PjPs2n0BP+rhVte4Nez6FdgEuxv3IGI3afiV46ImKqTGDVlbA==", + "dependencies": { + "@xmldom/xmldom": "^0.8.6", + "argparse": "~1.0.3", + "base64-js": "^1.5.1", + "bluebird": "~3.4.0", + "dingbat-to-unicode": "^1.0.1", + "jszip": "^3.7.1", + "lop": "^0.4.1", + "path-is-absolute": "^1.0.0", + "underscore": "^1.13.1", + "xmlbuilder": "^10.0.0" + }, + "bin": { + "mammoth": "bin/mammoth" + }, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/mammoth/node_modules/argparse": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", + "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==", + "dependencies": { + "sprintf-js": "~1.0.2" + } + }, "node_modules/markdown-table": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.3.tgz", @@ -6991,6 +7134,11 @@ "opener": "bin/opener-bin.js" } }, + "node_modules/option": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/option/-/option-0.2.4.tgz", + "integrity": "sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==" + }, 
"node_modules/optionator": { "version": "0.9.4", "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz", @@ -7038,6 +7186,11 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/pako": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", + "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==" + }, "node_modules/parent-module": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", @@ -7126,7 +7279,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", - "devOptional": true, "engines": { "node": ">=0.10.0" } @@ -7938,6 +8090,11 @@ "node": ">= 0.4" } }, + "node_modules/setimmediate": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", + "integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==" + }, "node_modules/sharp": { "version": "0.33.4", "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.33.4.tgz", @@ -8119,6 +8276,11 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/sprintf-js": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", + "integrity": "sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==" + }, "node_modules/stop-iteration-iterator": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/stop-iteration-iterator/-/stop-iteration-iterator-1.0.0.tgz", @@ -8741,6 +8903,11 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/underscore": { + "version": "1.13.6", + "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.6.tgz", + "integrity": 
"sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==" + }, "node_modules/undici-types": { "version": "5.26.5", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", @@ -9300,6 +9467,14 @@ } } }, + "node_modules/xmlbuilder": { + "version": "10.1.1", + "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-10.1.1.tgz", + "integrity": "sha512-OyzrcFLL/nb6fMGHbiRDuPup9ljBycsdCypwuyg5AAHvyWzGfChJpCXMG88AGTIMFhGZ9RccFN1e6lhg3hkwKg==", + "engines": { + "node": ">=4.0" + } + }, "node_modules/xtend": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/xtend/-/xtend-2.1.2.tgz", diff --git a/package.json b/package.json index 11a6b34d3..bb18072ac 100644 --- a/package.json +++ b/package.json @@ -43,6 +43,7 @@ "dexie-react-hooks": "^1.1.7", "eventsource-parser": "^1.1.2", "idb-keyval": "^6.2.1", + "mammoth": "^1.8.0", "nanoid": "^5.0.7", "next": "~14.2.4", "nprogress": "^0.2.0", diff --git a/src/apps/chat/components/composer/llmattachments/LLMAttachmentItem.tsx b/src/apps/chat/components/composer/llmattachments/LLMAttachmentItem.tsx index 30129e789..e47b0f0d7 100644 --- a/src/apps/chat/components/composer/llmattachments/LLMAttachmentItem.tsx +++ b/src/apps/chat/components/composer/llmattachments/LLMAttachmentItem.tsx @@ -3,6 +3,7 @@ import * as React from 'react'; import { Box, Button, CircularProgress, ColorPaletteProp, Sheet, Typography } from '@mui/joy'; import AbcIcon from '@mui/icons-material/Abc'; import CodeIcon from '@mui/icons-material/Code'; +import DescriptionOutlinedIcon from '@mui/icons-material/DescriptionOutlined'; import ImageOutlinedIcon from '@mui/icons-material/ImageOutlined'; import PermMediaOutlinedIcon from '@mui/icons-material/PermMediaOutlined'; import PhotoSizeSelectLargeOutlinedIcon from '@mui/icons-material/PhotoSizeSelectLargeOutlined'; @@ -73,13 +74,14 @@ const converterTypeToIconMap: { [key in AttachmentDraftConverterType]: React.Com 'text': TextFieldsIcon, 
'rich-text': CodeIcon, 'rich-text-table': PivotTableChartIcon, - 'pdf-text': PictureAsPdfIcon, - 'pdf-images': PermMediaOutlinedIcon, 'image-original': ImageOutlinedIcon, 'image-resized-high': PhotoSizeSelectLargeOutlinedIcon, 'image-resized-low': PhotoSizeSelectSmallOutlinedIcon, 'image-to-default': ImageOutlinedIcon, 'image-ocr': AbcIcon, + 'pdf-text': PictureAsPdfIcon, + 'pdf-images': PermMediaOutlinedIcon, + 'docx-to-html': DescriptionOutlinedIcon, 'ego-fragments-inlined': TelegramIcon, 'unhandled': TextureIcon, }; diff --git a/src/common/attachment-drafts/attachment.pipeline.ts b/src/common/attachment-drafts/attachment.pipeline.ts index 25df926c8..54ceb55db 100644 --- a/src/common/attachment-drafts/attachment.pipeline.ts +++ b/src/common/attachment-drafts/attachment.pipeline.ts @@ -88,6 +88,16 @@ const IMAGE_MIMETYPES: string[] = [ 'image/gif', ]; +// mimetypes to treat as PDFs +const PDF_MIMETYPES: string[] = [ + 'application/pdf', 'application/x-pdf', 'application/acrobat', +]; + +// mimetypes to treat as images +const DOCX_MIMETYPES: string[] = [ + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', +]; + /** * Creates a new AttachmentDraft object. 
@@ -258,11 +268,16 @@ export function attachmentDefineConverters(sourceType: AttachmentDraftSource['me break; // PDF - case ['application/pdf', 'application/x-pdf', 'application/acrobat'].includes(input.mimeType): + case PDF_MIMETYPES.includes(input.mimeType): converters.push({ id: 'pdf-text', name: 'PDF To Text (OCR)' }); converters.push({ id: 'pdf-images', name: 'PDF To Images' }); break; + // DOCX + case DOCX_MIMETYPES.includes(input.mimeType): + converters.push({ id: 'docx-to-html', name: 'DOCX to HTML' }); + break; + // EGO case input.mimeType === 'application/vnd.agi.ego.fragments': converters.push({ id: 'ego-fragments-inlined', name: 'Message' }); @@ -531,6 +546,22 @@ export async function attachmentPerformConversion( break; + // docx to html + case 'docx-to-html': + if (!(input.data instanceof ArrayBuffer)) { + console.log('Expected ArrayBuffer for DOCX converter, got:', typeof input.data); + break; + } + try { + const { convertDocxToHTML } = await import('./file-converters/DocxToMarkdown'); + const { html } = await convertDocxToHTML(input.data); + newFragments.push(createDocAttachmentFragment(title, caption, 'text/html', createDMessageDataInlineText(html, 'text/html'), refString, docMeta)); + } catch (error) { + console.error('Error in DOCX to Markdown conversion:', error); + } + break; + + // self: message case 'ego-fragments-inlined': if (!Array.isArray(input.data)) { diff --git a/src/common/attachment-drafts/attachment.types.ts b/src/common/attachment-drafts/attachment.types.ts index dcde9d62b..ffafbae87 100644 --- a/src/common/attachment-drafts/attachment.types.ts +++ b/src/common/attachment-drafts/attachment.types.ts @@ -95,8 +95,9 @@ export type AttachmentDraftConverter = { export type AttachmentDraftConverterType = | 'text' | 'rich-text' | 'rich-text-table' - | 'pdf-text' | 'pdf-images' | 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-to-default' + | 'pdf-text' | 'pdf-images' + | 'docx-to-html' |
'ego-fragments-inlined' | 'unhandled'; diff --git a/src/common/attachment-drafts/file-converters/DocxToMarkdown.ts b/src/common/attachment-drafts/file-converters/DocxToMarkdown.ts new file mode 100644 index 000000000..665b8fbe4 --- /dev/null +++ b/src/common/attachment-drafts/file-converters/DocxToMarkdown.ts @@ -0,0 +1,22 @@ +import { convertToHtml, images } from 'mammoth'; + + +export async function convertDocxToHTML(input: ArrayBuffer): Promise<{ html: string }> { + try { + // Convert to HTML with mammoth (statically imported above); the image handler below throws for any embedded image + const result = await convertToHtml({ arrayBuffer: input }, { + convertImage: images.imgElement(function ignoreImage(image) { + throw new Error('Images are not supported in DOCX to Markdown conversion'); + }), + }); + if (result.messages?.length) { + console.log('Messages from DOCX to Markdown conversion:', result.messages); + } + return { + html: result.value, + }; + } catch (error) { + console.error('Error converting DOCX to Markdown:', error); + throw error; + } +}