Attachments: client-side Markdown conversion and Text/HTML cleanup & Markdown conversion

This commit is contained in:
Enrico Ros
2026-01-22 17:17:57 -08:00
parent 7eac409ec6
commit 1d91e9da03
4 changed files with 199 additions and 26 deletions
@@ -1,7 +1,7 @@
import * as React from 'react';
import TimeAgo from 'react-timeago';
import { Box, Button, CircularProgress, ColorPaletteProp, ListItem, Sheet, Typography, VariantProp } from '@mui/joy';
import { Box, Button, CircularProgress, ColorPaletteProp, Sheet, Typography, VariantProp } from '@mui/joy';
import AbcIcon from '@mui/icons-material/Abc';
import CodeIcon from '@mui/icons-material/Code';
import DescriptionOutlinedIcon from '@mui/icons-material/DescriptionOutlined';
@@ -91,8 +91,11 @@ function InputErrorIndicator() {
const converterTypeToIconMap: { [key in AttachmentDraftConverterType]: React.ComponentType<any> | null } = {
'text': TextFieldsIcon,
'text-cleaner': CodeIcon,
'text-markdown': TextFieldsIcon,
'rich-text': CodeIcon,
'rich-text-cleaner': CodeIcon,
'rich-text-markdown': TextFieldsIcon,
'rich-text-table': PivotTableChartIcon,
'image-original': ImageOutlinedIcon,
'image-resized-high': PhotoSizeSelectLargeOutlinedIcon,
@@ -201,13 +204,21 @@ function attachmentIcons(attachmentDraft: AttachmentDraft, noTooltips: boolean,
/**
 * Picks the short label shown for an attachment draft.
 *
 * When a converter is active and the draft's label is exactly 'Text' or 'Rich Text',
 * the label is replaced by a name describing that converter's output
 * (e.g. 'Markdown', 'Clean HTML', 'Rich HTML', 'Rich Table').
 * In every other case the draft's own label is returned through `ellipsizeFront`,
 * called with 22 as its second argument (presumably a maximum length — confirm against the helper).
 */
function attachmentLabelText(attachmentDraft: AttachmentDraft): string {
// the first converter flagged as active, or null when there is none
const converter = attachmentDraft.converters.find(c => c.isActive) ?? null;
// NOTE(review): the next four lines look like the pre-change side of this diff hunk
// (the last `if` has no body of its own and the braces do not balance) — confirm against the committed file.
if (converter && attachmentDraft.label === 'Rich Text') {
if (converter.id === 'rich-text-table')
return 'Rich Table';
if (converter.id === 'rich-text-cleaner')
// plain-text drafts: name the label after the active text converter
if (converter && attachmentDraft.label === 'Text') {
if (converter.id === 'text-markdown')
return 'Markdown';
if (converter.id === 'text-cleaner')
return 'Clean HTML';
}
// rich-text (HTML) drafts: name the label after the active rich-text converter
if (converter && attachmentDraft.label === 'Rich Text') {
if (converter.id === 'rich-text')
return 'Rich HTML';
if (converter.id === 'rich-text-markdown')
return 'Markdown';
if (converter.id === 'rich-text-cleaner')
return 'Clean HTML';
if (converter.id === 'rich-text-table')
return 'Rich Table';
}
// fallback: the draft's own label, shortened by ellipsizeFront
return ellipsizeFront(attachmentDraft.label, 22);
}
@@ -265,6 +265,7 @@ export function attachmentDefineConverters(source: AttachmentDraftSource, input:
case mimeTypeIsPlainText(input.mimeType):
// handle a secondary layer of HTML 'text' origins: drop, paste, and clipboard-read
const textOriginHtml = source.media === 'text' && input.altMimeType === 'text/html' && !!input.altData;
const textOriginClipboard = source.media === 'text' && ['clipboard-read', 'paste'].includes(source.method);
const isHtmlTable = !!input.altData?.startsWith('<table');
// p1: Tables
@@ -273,11 +274,16 @@ export function attachmentDefineConverters(source: AttachmentDraftSource, input:
// p2: Text
converters.push({ id: 'text', name: attachmentSourceSupportsLiveFile(source) ? 'Text (Live)' : 'Text' });
if (!textOriginHtml && textOriginClipboard) {
converters.push({ id: 'text-markdown', name: 'Text -> Markdown' });
converters.push({ id: 'text-cleaner', name: 'Text -> Clean HTML' });
}
// p3: Html
// p3: Html -> Markdown, and Html
if (textOriginHtml) {
converters.push({ id: 'rich-text-cleaner', name: 'Cleaner HTML' });
converters.push({ id: 'rich-text', name: 'HTML · Heavy' });
converters.push({ id: 'rich-text-markdown', name: 'HTML -> Markdown' });
converters.push({ id: 'rich-text-cleaner', name: 'HTML -> Clean HTML' });
}
break;
@@ -501,35 +507,63 @@ export async function attachmentPerformConversion(
switch (converter.id) {
// text as-is
// text
case 'text':
case 'text-cleaner':
case 'text-markdown':
const possibleLiveFileId = await attachmentGetLiveFileId(source);
const textContent = await _inputDataToString(input.data, 'text');
const textualInlineData = createDMessageDataInlineText(textContent, input.mimeType);
let textContent = await _inputDataToString(input.data, 'text');
let textContentMime = input.mimeType || 'text/plain';
switch (converter.id) {
case 'text-cleaner':
textContent = _cleanPossibleHtmlText(textContent);
break;
case 'text-markdown':
try {
const { convertHtmlToMarkdown } = await import('./file-converters/HtmlToMarkdown');
textContent = convertHtmlToMarkdown(textContent);
textContentMime = 'text/markdown';
} catch (error) {
console.log('[DEV] Error converting Text (HTML) to Markdown:', error);
}
break;
}
const textualInlineData = createDMessageDataInlineText(textContent, textContentMime);
newFragments.push(createDocAttachmentFragment(title, caption, _guessDocVDT(input.mimeType), textualInlineData, refString, DOCPART_DEFAULT_VERSION, docMeta, possibleLiveFileId));
break;
// html as-is
// html
case 'rich-text':
case 'rich-text-cleaner':
case 'rich-text-markdown':
let richText = input.altData || '';
let richTextMimeType = input.altMimeType || 'text/html';
// html -> cleaner/html or markdown
switch (converter.id) {
case 'rich-text-cleaner':
richText = _cleanPossibleHtmlText(richText);
richTextMimeType = 'text/html';
break;
case 'rich-text-markdown':
try {
const { convertHtmlToMarkdown } = await import('./file-converters/HtmlToMarkdown');
richText = convertHtmlToMarkdown(richText);
richTextMimeType = 'text/markdown';
} catch (error) {
console.log('[DEV] Error converting HTML to Markdown:', error);
}
break;
}
// NOTE: before we had the following: createTextAttachmentFragment(ref || '\n<!DOCTYPE html>', input.altData!), which
// was used to wrap the HTML in a code block to facilitate AutoRenderBlocks's parser. Historic note, for future debugging.
const richTextData = createDMessageDataInlineText(input.altData || '', input.altMimeType);
const richTextData = createDMessageDataInlineText(richText, richTextMimeType);
newFragments.push(createDocAttachmentFragment(title, caption, DVMimeType.VndAgiCode, richTextData, refString, DOCPART_DEFAULT_VERSION, docMeta));
break;
// html cleaned
case 'rich-text-cleaner':
const cleanerHtml = (input.altData || '')
// remove class and style attributes
.replace(/<[^>]+>/g, (tag) =>
tag.replace(/ class="[^"]*"/g, '').replace(/ style="[^"]*"/g, ''),
)
// remove svg elements
.replace(/<svg[^>]*>.*?<\/svg>/g, '');
const cleanedHtmlData = createDMessageDataInlineText(cleanerHtml, 'text/html');
newFragments.push(createDocAttachmentFragment(title, caption, DVMimeType.VndAgiCode, cleanedHtmlData, refString, DOCPART_DEFAULT_VERSION, docMeta));
break;
// html to markdown table
case 'rich-text-table':
let tableData: DMessageDataInline;
@@ -919,6 +953,11 @@ export async function attachmentPerformConversion(
case 'unhandled':
// force the user to explicitly select 'as text' if they want to proceed
break;
default:
const _exhaustiveCheck: never = converter.id;
break;
}
}
@@ -961,6 +1000,19 @@ async function _inputDataToString(data: AttachmentDraftInput['data'], debugLocat
return '';
}
/**
 * Simple client-side cleaning of text that may contain HTML.
 *
 * Purely regex-based (no DOM parsing), so it is safe to call on arbitrary strings:
 * input without tags is returned unchanged.
 *
 * - strips `class` and `style` attributes (double- or single-quoted) from every tag
 * - removes whole `<svg>…</svg>` elements, including ones that span multiple lines
 *
 * @param inputStr text or HTML markup to clean
 * @returns the input with the attributes and elements above removed
 */
function _cleanPossibleHtmlText(inputStr: string): string {
  return inputStr
    // remove class and style attributes; only text inside a tag is touched,
    // so prose such as ` class="x"` outside of tags is left alone
    .replace(/<[^>]+>/g, (tag) =>
      tag.replace(/\s+(?:class|style)=(?:"[^"]*"|'[^']*')/gi, ''),
    )
    // remove svg elements; [\s\S] (rather than '.') lets the match cross line breaks
    .replace(/<svg\b[^>]*>[\s\S]*?<\/svg\s*>/gi, '');
}
/**
* Special function to convert a list of files to Attachment Fragments, without passing through the attachments system
@@ -145,7 +145,8 @@ export type AttachmentDraftConverter = {
}
export type AttachmentDraftConverterType =
| 'text' | 'rich-text' | 'rich-text-cleaner' | 'rich-text-table'
| 'text' | 'text-cleaner' | 'text-markdown'
| 'rich-text' | 'rich-text-cleaner' | 'rich-text-markdown' | 'rich-text-table'
| 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-caption' | 'image-to-default'
| 'pdf-auto' | 'pdf-text' | 'pdf-images' | 'pdf-images-ocr' | 'pdf-text-and-images'
| 'docx-to-html'
@@ -0,0 +1,109 @@
import { default as TurndownService } from 'turndown';
// Module-level Turndown instance, created on first use and then reused
let _turndownService: TurndownService | null = null;

/**
 * Returns the shared Turndown converter, building and configuring it on the first call.
 *
 * Configuration: ATX headings, fenced code blocks, `_` as the emphasis delimiter,
 * and `script` / `style` / `noscript` elements removed from the output.
 */
function getTurndownService(): TurndownService {
  // fast path: already built
  if (_turndownService)
    return _turndownService;

  const service = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    emDelimiter: '_',
  });
  _turndownService = service;

  // Remove script and style elements
  service.remove(['script', 'style', 'noscript']);

  return service;
}
/**
 * Converts an HTML string into Markdown.
 *
 * The markup is first passed through `cleanHtmlForMarkdown`, and the cleaned
 * result is then handed to the shared Turndown service.
 *
 * @param html the HTML markup to convert
 * @returns the Markdown produced by Turndown
 */
export function convertHtmlToMarkdown(html: string): string {
  // step 1: clean the markup
  const sanitizedHtml = cleanHtmlForMarkdown(html);

  // step 2: HTML -> Markdown
  const converter = getTurndownService();
  return converter.turndown(sanitizedHtml);
}
/** Query-parameter names (prefix match, case-insensitive) that are dropped from link hrefs. */
const TRACKING_PARAM_PREFIX_RE = /^(utm_|fbclid|gclid|msclkid)/i;

/** Inline styles that hide an element, tolerating any casing and spacing around the colon. */
const HIDDEN_INLINE_STYLE_RE = /(?:display\s*:\s*none|visibility\s*:\s*hidden)/i;

/**
 * Removes tracking query parameters from an href, leaving everything else untouched.
 *
 * Works on the string itself rather than on a resolved URL, so the scheme and host of
 * absolute links, and the relative form of relative links, are preserved. Parameters
 * that are kept retain their original encoding. When nothing is removed, the href is
 * returned unchanged.
 */
function stripTrackingParams(href: string): string {
  // split off the fragment first: a '?' inside the fragment is not a query
  const hashAt = href.indexOf('#');
  const beforeHash = hashAt >= 0 ? href.slice(0, hashAt) : href;
  const hash = hashAt >= 0 ? href.slice(hashAt) : '';

  const queryAt = beforeHash.indexOf('?');
  if (queryAt < 0)
    return href;

  const pairs = beforeHash.slice(queryAt + 1).split('&');
  const kept = pairs.filter(pair => {
    const rawName = pair.split('=', 1)[0] ?? '';
    let name = rawName;
    try {
      name = decodeURIComponent(rawName.replace(/\+/g, ' '));
    } catch {
      // malformed percent-encoding: test the raw name instead
    }
    return !TRACKING_PARAM_PREFIX_RE.test(name);
  });

  // nothing to strip: do not rewrite the link at all
  if (kept.length === pairs.length)
    return href;

  const query = kept.join('&');
  return beforeHash.slice(0, queryAt) + (query ? '?' + query : '') + hash;
}

/**
 * Client-side HTML cleaning optimized for Markdown conversion.
 * Uses DOMParser (browser-native) instead of Cheerio (server-only).
 *
 * Steps, in order:
 *  1. remove elements matching a fixed list of unwanted selectors
 *  2. remove elements hidden through an inline `display: none` / `visibility: hidden`
 *  3. drop `javascript:` hrefs and strip tracking parameters from the remaining links
 *  4. remove HTML comment nodes
 *
 * @param html the HTML markup to clean
 * @returns the cleaned `<body>` markup; if anything throws (for instance when DOMParser
 *          is not available), the error is logged and the original `html` is returned
 */
function cleanHtmlForMarkdown(html: string): string {
  try {
    const doc = new DOMParser().parseFromString(html, 'text/html');

    // Remove unwanted elements
    const unwantedSelectors = [
      'script', 'style', 'link', 'noscript', 'iframe', 'svg', 'canvas',
      'nav:not(main nav)', 'aside', 'footer:not(article footer)',
      '.ad', '.ads', '.advertisement', '.banner', '.popup', '.modal', '.overlay',
      '.cookie-banner', '.newsletter-signup', '.social-share', '.comments',
      '.sidebar', '.widget', '.carousel', '.slider',
      '[aria-hidden="true"]', '[hidden]',
      '[data-analytics]', '[data-tracking]', '[data-gtm]',
    ];
    for (const selector of unwantedSelectors) {
      try {
        doc.querySelectorAll(selector).forEach(el => el.remove());
      } catch {
        // Skip invalid selectors (e.g., complex :not() selectors may fail in some browsers)
      }
    }

    // Remove hidden elements via inline styles
    doc.querySelectorAll('[style]').forEach(el => {
      if (HIDDEN_INLINE_STYLE_RE.test(el.getAttribute('style') ?? ''))
        el.remove();
    });

    // Clean up anchor hrefs
    doc.querySelectorAll('a[href]').forEach(el => {
      const href = el.getAttribute('href');
      if (!href) return;

      // Remove javascript: links (leading whitespace is ignored when detecting the scheme)
      if (href.trim().toLowerCase().startsWith('javascript:')) {
        el.removeAttribute('href');
        return;
      }

      // Remove tracking parameters, rewriting the attribute only when something changed
      const cleanHref = stripTrackingParams(href);
      if (cleanHref !== href)
        el.setAttribute('href', cleanHref);
    });

    // Remove comments (HTML comment nodes); the walker is created on the parsed
    // document itself, so no global `document` is needed.
    // Nodes are collected first and removed afterwards, so the tree is not mutated mid-walk.
    const walker = doc.createTreeWalker(doc.body, NodeFilter.SHOW_COMMENT);
    const comments: Comment[] = [];
    while (walker.nextNode())
      comments.push(walker.currentNode as Comment);
    comments.forEach(comment => comment.remove());

    return doc.body.innerHTML;
  } catch (error) {
    console.error('HTML cleaning error:', error);
    return html; // Return original if cleaning fails
  }
}