From 1d91e9da03bd60cba697fb688e75523dc9b4f091 Mon Sep 17 00:00:00 2001 From: Enrico Ros Date: Thu, 22 Jan 2026 17:17:57 -0800 Subject: [PATCH] Attachments: client-side Markdown conversion and Text/HTML cleanup & Markdown conversion --- .../llmattachments/LLMAttachmentButton.tsx | 21 +++- .../attachment-drafts/attachment.pipeline.ts | 92 +++++++++++---- .../attachment-drafts/attachment.types.ts | 3 +- .../file-converters/HtmlToMarkdown.ts | 109 ++++++++++++++++++ 4 files changed, 199 insertions(+), 26 deletions(-) create mode 100644 src/common/attachment-drafts/file-converters/HtmlToMarkdown.ts diff --git a/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx b/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx index ccc6b43ec..92b88e40d 100644 --- a/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx +++ b/src/apps/chat/components/composer/llmattachments/LLMAttachmentButton.tsx @@ -1,7 +1,7 @@ import * as React from 'react'; import TimeAgo from 'react-timeago'; -import { Box, Button, CircularProgress, ColorPaletteProp, ListItem, Sheet, Typography, VariantProp } from '@mui/joy'; +import { Box, Button, CircularProgress, ColorPaletteProp, Sheet, Typography, VariantProp } from '@mui/joy'; import AbcIcon from '@mui/icons-material/Abc'; import CodeIcon from '@mui/icons-material/Code'; import DescriptionOutlinedIcon from '@mui/icons-material/DescriptionOutlined'; @@ -91,8 +91,11 @@ function InputErrorIndicator() { const converterTypeToIconMap: { [key in AttachmentDraftConverterType]: React.ComponentType | null } = { 'text': TextFieldsIcon, + 'text-cleaner': CodeIcon, + 'text-markdown': TextFieldsIcon, 'rich-text': CodeIcon, 'rich-text-cleaner': CodeIcon, + 'rich-text-markdown': TextFieldsIcon, 'rich-text-table': PivotTableChartIcon, 'image-original': ImageOutlinedIcon, 'image-resized-high': PhotoSizeSelectLargeOutlinedIcon, @@ -201,13 +204,21 @@ function attachmentIcons(attachmentDraft: AttachmentDraft, 
noTooltips: boolean, function attachmentLabelText(attachmentDraft: AttachmentDraft): string { const converter = attachmentDraft.converters.find(c => c.isActive) ?? null; - if (converter && attachmentDraft.label === 'Rich Text') { - if (converter.id === 'rich-text-table') - return 'Rich Table'; - if (converter.id === 'rich-text-cleaner') + if (converter && attachmentDraft.label === 'Text') { + if (converter.id === 'text-markdown') + return 'Markdown'; + if (converter.id === 'text-cleaner') return 'Clean HTML'; + } + if (converter && attachmentDraft.label === 'Rich Text') { if (converter.id === 'rich-text') return 'Rich HTML'; + if (converter.id === 'rich-text-markdown') + return 'Markdown'; + if (converter.id === 'rich-text-cleaner') + return 'Clean HTML'; + if (converter.id === 'rich-text-table') + return 'Rich Table'; } return ellipsizeFront(attachmentDraft.label, 22); } diff --git a/src/common/attachment-drafts/attachment.pipeline.ts b/src/common/attachment-drafts/attachment.pipeline.ts index 2959469db..77e1f9502 100644 --- a/src/common/attachment-drafts/attachment.pipeline.ts +++ b/src/common/attachment-drafts/attachment.pipeline.ts @@ -265,6 +265,7 @@ export function attachmentDefineConverters(source: AttachmentDraftSource, input: case mimeTypeIsPlainText(input.mimeType): // handle a secondary layer of HTML 'text' origins: drop, paste, and clipboard-read const textOriginHtml = source.media === 'text' && input.altMimeType === 'text/html' && !!input.altData; + const textOriginClipboard = source.media === 'text' && ['clipboard-read', 'paste'].includes(source.method); const isHtmlTable = !!input.altData?.startsWith(' Markdown' }); + converters.push({ id: 'text-cleaner', name: 'Text -> Clean HTML' }); + } - // p3: Html + // p3: Html -> Markdown, and Html if (textOriginHtml) { - converters.push({ id: 'rich-text-cleaner', name: 'Cleaner HTML' }); converters.push({ id: 'rich-text', name: 'HTML · Heavy' }); + converters.push({ id: 'rich-text-markdown', name: 'HTML -> 
Markdown' }); + converters.push({ id: 'rich-text-cleaner', name: 'HTML -> Clean HTML' }); } break; @@ -501,35 +507,63 @@ export async function attachmentPerformConversion( switch (converter.id) { - // text as-is + // text case 'text': + case 'text-cleaner': + case 'text-markdown': const possibleLiveFileId = await attachmentGetLiveFileId(source); - const textContent = await _inputDataToString(input.data, 'text'); - const textualInlineData = createDMessageDataInlineText(textContent, input.mimeType); + let textContent = await _inputDataToString(input.data, 'text'); + let textContentMime = input.mimeType || 'text/plain'; + + switch (converter.id) { + case 'text-cleaner': + textContent = _cleanPossibleHtmlText(textContent); + break; + case 'text-markdown': + try { + const { convertHtmlToMarkdown } = await import('./file-converters/HtmlToMarkdown'); + textContent = convertHtmlToMarkdown(textContent); + textContentMime = 'text/markdown'; + } catch (error) { + console.log('[DEV] Error converting Text (HTML) to Markdown:', error); + } + break; + } + + const textualInlineData = createDMessageDataInlineText(textContent, textContentMime); newFragments.push(createDocAttachmentFragment(title, caption, _guessDocVDT(input.mimeType), textualInlineData, refString, DOCPART_DEFAULT_VERSION, docMeta, possibleLiveFileId)); break; - // html as-is + // html case 'rich-text': + case 'rich-text-cleaner': + case 'rich-text-markdown': + let richText = input.altData || ''; + let richTextMimeType = input.altMimeType || 'text/html'; + + // html -> cleaner/html or markdown + switch (converter.id) { + case 'rich-text-cleaner': + richText = _cleanPossibleHtmlText(richText); + richTextMimeType = 'text/html'; + break; + case 'rich-text-markdown': + try { + const { convertHtmlToMarkdown } = await import('./file-converters/HtmlToMarkdown'); + richText = convertHtmlToMarkdown(richText); + richTextMimeType = 'text/markdown'; + } catch (error) { + console.log('[DEV] Error converting HTML to Markdown:', 
error); + } + break; + } + // NOTE: before we had the following: createTextAttachmentFragment(ref || '\n', input.altData!), which // was used to wrap the HTML in a code block to facilitate AutoRenderBlocks's parser. Historic note, for future debugging. - const richTextData = createDMessageDataInlineText(input.altData || '', input.altMimeType); + const richTextData = createDMessageDataInlineText(richText, richTextMimeType); newFragments.push(createDocAttachmentFragment(title, caption, DVMimeType.VndAgiCode, richTextData, refString, DOCPART_DEFAULT_VERSION, docMeta)); break; - // html cleaned - case 'rich-text-cleaner': - const cleanerHtml = (input.altData || '') - // remove class and style attributes - .replace(/<[^>]+>/g, (tag) => - tag.replace(/ class="[^"]*"/g, '').replace(/ style="[^"]*"/g, ''), - ) - // remove svg elements - .replace(/<svg[^>]*>.*?<\/svg>/g, ''); - const cleanedHtmlData = createDMessageDataInlineText(cleanerHtml, 'text/html'); - newFragments.push(createDocAttachmentFragment(title, caption, DVMimeType.VndAgiCode, cleanedHtmlData, refString, DOCPART_DEFAULT_VERSION, docMeta)); - break; - // html to markdown table case 'rich-text-table': let tableData: DMessageDataInline; @@ -919,6 +953,11 @@ export async function attachmentPerformConversion( case 'unhandled': // force the user to explicitly select 'as text' if they want to proceed break; + + + default: + const _exhaustiveCheck: never = converter.id; + break; } } @@ -961,6 +1000,19 @@ async function _inputDataToString(data: AttachmentDraftInput['data'], debugLocat return ''; } +/** + * Simple Client-side cleaning of possible HTML + */ +function _cleanPossibleHtmlText(inputStr: string): string { + return inputStr + // remove class and style attributes + .replace(/<[^>]+>/g, (tag) => + tag.replace(/ class="[^"]*"/g, '').replace(/ style="[^"]*"/g, ''), + ) + // remove svg elements + .replace(/<svg[^>]*>.*?<\/svg>/g, ''); +} + /** * Special function to convert a list of files to Attachment Fragments, without 
passing through the attachments system diff --git a/src/common/attachment-drafts/attachment.types.ts b/src/common/attachment-drafts/attachment.types.ts index b2fcafcc3..67e0a49e6 100644 --- a/src/common/attachment-drafts/attachment.types.ts +++ b/src/common/attachment-drafts/attachment.types.ts @@ -145,7 +145,8 @@ export type AttachmentDraftConverter = { } export type AttachmentDraftConverterType = - | 'text' | 'rich-text' | 'rich-text-cleaner' | 'rich-text-table' + | 'text' | 'text-cleaner' | 'text-markdown' + | 'rich-text' | 'rich-text-cleaner' | 'rich-text-markdown' | 'rich-text-table' | 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-caption' | 'image-to-default' | 'pdf-auto' | 'pdf-text' | 'pdf-images' | 'pdf-images-ocr' | 'pdf-text-and-images' | 'docx-to-html' diff --git a/src/common/attachment-drafts/file-converters/HtmlToMarkdown.ts b/src/common/attachment-drafts/file-converters/HtmlToMarkdown.ts new file mode 100644 index 000000000..5df2a5d0f --- /dev/null +++ b/src/common/attachment-drafts/file-converters/HtmlToMarkdown.ts @@ -0,0 +1,109 @@ +import { default as TurndownService } from 'turndown'; + + +// Cached Turndown service instance +let _turndownService: TurndownService | null = null; + +function getTurndownService(): TurndownService { + if (!_turndownService) { + _turndownService = new TurndownService({ + headingStyle: 'atx', + codeBlockStyle: 'fenced', + emDelimiter: '_', + }); + + // Remove script and style elements + _turndownService.remove(['script', 'style', 'noscript']); + } + return _turndownService; +} + + +/** + * Convert HTML string to Markdown using Turndown. + * Performs basic HTML cleaning before conversion. 
+ */ +export function convertHtmlToMarkdown(html: string): string { + // Basic client-side HTML cleaning using DOMParser + const cleanedHtml = cleanHtmlForMarkdown(html); + return getTurndownService().turndown(cleanedHtml); +} + + +/** + * Client-side HTML cleaning optimized for Markdown conversion. + * Uses DOMParser (browser-native) instead of Cheerio (server-only). + */ +function cleanHtmlForMarkdown(html: string): string { + try { + const parser = new DOMParser(); + const doc = parser.parseFromString(html, 'text/html'); + + // Remove unwanted elements + const unwantedSelectors = [ + 'script', 'style', 'link', 'noscript', 'iframe', 'svg', 'canvas', + 'nav:not(main nav)', 'aside', 'footer:not(article footer)', + '.ad', '.ads', '.advertisement', '.banner', '.popup', '.modal', '.overlay', + '.cookie-banner', '.newsletter-signup', '.social-share', '.comments', + '.sidebar', '.widget', '.carousel', '.slider', + '[aria-hidden="true"]', '[hidden]', + '[data-analytics]', '[data-tracking]', '[data-gtm]', + ]; + + for (const selector of unwantedSelectors) { + try { + doc.querySelectorAll(selector).forEach(el => el.remove()); + } catch { + // Skip invalid selectors (e.g., complex :not() selectors may fail in some browsers) + } + } + + // Remove hidden elements via inline styles + doc.querySelectorAll('[style]').forEach(el => { + const style = el.getAttribute('style') || ''; + if (style.includes('display: none') || style.includes('display:none') || + style.includes('visibility: hidden') || style.includes('visibility:hidden')) + el.remove(); + }); + + // Clean up anchor hrefs (remove tracking parameters) + doc.querySelectorAll('a[href]').forEach(el => { + const href = el.getAttribute('href'); + if (!href) return; + + // Remove javascript: links + if (href.toLowerCase().startsWith('javascript:')) { + el.removeAttribute('href'); + return; + } + + // Remove tracking parameters + if (href.includes('?')) { + try { + const url = new URL(href, 'http://placeholder'); + const 
cleanParams = new URLSearchParams(); + url.searchParams.forEach((value, key) => { + if (!key.match(/^(utm_|fbclid|gclid|msclkid)/i)) + cleanParams.append(key, value); + }); + const cleanHref = `${url.pathname}${cleanParams.toString() ? '?' + cleanParams.toString() : ''}${url.hash}`; + el.setAttribute('href', cleanHref); + } catch { + // Keep original href if URL parsing fails + } + } + }); + + // Remove comments (HTML comment nodes) + const walker = document.createTreeWalker(doc.body, NodeFilter.SHOW_COMMENT); + const comments: Comment[] = []; + while (walker.nextNode()) + comments.push(walker.currentNode as Comment); + comments.forEach(comment => comment.remove()); + + return doc.body.innerHTML; + } catch (error) { + console.error('HTML cleaning error:', error); + return html; // Return original if cleaning fails + } +}