Attachments: client-side Markdown conversion and Text/HTML cleanup & Markdown conversion

2026-05-10 21:50:14 -07:00 · 2026-01-22 17:17:57 -08:00
parent 7eac409ec6
commit 1d91e9da03
4 changed files with 199 additions and 26 deletions
@@ -1,7 +1,7 @@
 import * as React from 'react';
 import TimeAgo from 'react-timeago';

-import { Box, Button, CircularProgress, ColorPaletteProp, ListItem, Sheet, Typography, VariantProp } from '@mui/joy';
+import { Box, Button, CircularProgress, ColorPaletteProp, Sheet, Typography, VariantProp } from '@mui/joy';
 import AbcIcon from '@mui/icons-material/Abc';
 import CodeIcon from '@mui/icons-material/Code';
 import DescriptionOutlinedIcon from '@mui/icons-material/DescriptionOutlined';
@@ -91,8 +91,11 @@ function InputErrorIndicator() {

 const converterTypeToIconMap: { [key in AttachmentDraftConverterType]: React.ComponentType<any> | null } = {
  'text': TextFieldsIcon,
+  'text-cleaner': CodeIcon,
+  'text-markdown': TextFieldsIcon,
  'rich-text': CodeIcon,
  'rich-text-cleaner': CodeIcon,
+  'rich-text-markdown': TextFieldsIcon,
  'rich-text-table': PivotTableChartIcon,
  'image-original': ImageOutlinedIcon,
  'image-resized-high': PhotoSizeSelectLargeOutlinedIcon,
@@ -201,13 +204,21 @@ function attachmentIcons(attachmentDraft: AttachmentDraft, noTooltips: boolean,

 function attachmentLabelText(attachmentDraft: AttachmentDraft): string {
  const converter = attachmentDraft.converters.find(c => c.isActive) ?? null;
-  if (converter && attachmentDraft.label === 'Rich Text') {
-    if (converter.id === 'rich-text-table')
-      return 'Rich Table';
-    if (converter.id === 'rich-text-cleaner')
+  if (converter && attachmentDraft.label === 'Text') { // drafts labelled 'Text': name the label after the active converter's output
+    if (converter.id === 'text-markdown')
+      return 'Markdown';
+    if (converter.id === 'text-cleaner')
      return 'Clean HTML';
+  } // any other converter on a 'Text' draft falls through to the ellipsized label below
+  if (converter && attachmentDraft.label === 'Rich Text') { // drafts labelled 'Rich Text': same idea, one label per rich-text converter
    if (converter.id === 'rich-text')
      return 'Rich HTML';
+    if (converter.id === 'rich-text-markdown')
+      return 'Markdown';
+    if (converter.id === 'rich-text-cleaner')
+      return 'Clean HTML';
+    if (converter.id === 'rich-text-table')
+      return 'Rich Table';
  }
  return ellipsizeFront(attachmentDraft.label, 22);
 }
@@ -265,6 +265,7 @@ export function attachmentDefineConverters(source: AttachmentDraftSource, input:
    case mimeTypeIsPlainText(input.mimeType):
      // handle a secondary layer of HTML 'text' origins: drop, paste, and clipboard-read
      const textOriginHtml = source.media === 'text' && input.altMimeType === 'text/html' && !!input.altData;
+      const textOriginClipboard = source.media === 'text' && ['clipboard-read', 'paste'].includes(source.method);
      const isHtmlTable = !!input.altData?.startsWith('<table');

      // p1: Tables
@@ -273,11 +274,16 @@ export function attachmentDefineConverters(source: AttachmentDraftSource, input:

      // p2: Text
      converters.push({ id: 'text', name: attachmentSourceSupportsLiveFile(source) ? 'Text (Live)' : 'Text' });
+      if (!textOriginHtml && textOriginClipboard) {
+        converters.push({ id: 'text-markdown', name: 'Text -> Markdown' });
+        converters.push({ id: 'text-cleaner', name: 'Text -> Clean HTML' });
+      }

-      // p3: Html
+      // p3: Html -> Markdown, and Html
      if (textOriginHtml) {
-        converters.push({ id: 'rich-text-cleaner', name: 'Cleaner HTML' });
        converters.push({ id: 'rich-text', name: 'HTML · Heavy' });
+        converters.push({ id: 'rich-text-markdown', name: 'HTML -> Markdown' });
+        converters.push({ id: 'rich-text-cleaner', name: 'HTML -> Clean HTML' });
      }
      break;

@@ -501,35 +507,63 @@ export async function attachmentPerformConversion(

    switch (converter.id) {

-      // text as-is
+      // text
      case 'text':
+      case 'text-cleaner':
+      case 'text-markdown':
        const possibleLiveFileId = await attachmentGetLiveFileId(source);
-        const textContent = await _inputDataToString(input.data, 'text');
-        const textualInlineData = createDMessageDataInlineText(textContent, input.mimeType);
+        let textContent = await _inputDataToString(input.data, 'text');
+        let textContentMime = input.mimeType || 'text/plain';
+
+        switch (converter.id) {
+          case 'text-cleaner':
+            textContent = _cleanPossibleHtmlText(textContent);
+            break;
+          case 'text-markdown':
+            try {
+              const { convertHtmlToMarkdown } = await import('./file-converters/HtmlToMarkdown');
+              textContent = convertHtmlToMarkdown(textContent);
+              textContentMime = 'text/markdown';
+            } catch (error) {
+              console.log('[DEV] Error converting Text (HTML) to Markdown:', error);
+            }
+            break;
+        }
+
+        const textualInlineData = createDMessageDataInlineText(textContent, textContentMime);
        newFragments.push(createDocAttachmentFragment(title, caption, _guessDocVDT(input.mimeType), textualInlineData, refString, DOCPART_DEFAULT_VERSION, docMeta, possibleLiveFileId));
        break;

-      // html as-is
+      // html
      case 'rich-text':
+      case 'rich-text-cleaner':
+      case 'rich-text-markdown':
+        let richText = input.altData || '';
+        let richTextMimeType = input.altMimeType || 'text/html';
+
+        // html -> cleaner/html or markdown
+        switch (converter.id) {
+          case 'rich-text-cleaner':
+            richText = _cleanPossibleHtmlText(richText);
+            richTextMimeType = 'text/html';
+            break;
+          case 'rich-text-markdown':
+            try {
+              const { convertHtmlToMarkdown } = await import('./file-converters/HtmlToMarkdown');
+              richText = convertHtmlToMarkdown(richText);
+              richTextMimeType = 'text/markdown';
+            } catch (error) {
+              console.log('[DEV] Error converting HTML to Markdown:', error);
+            }
+            break;
+        }
+
        // NOTE: before we had the following: createTextAttachmentFragment(ref || '\n<!DOCTYPE html>', input.altData!), which
        //       was used to wrap the HTML in a code block to facilitate AutoRenderBlocks's parser. Historic note, for future debugging.
-        const richTextData = createDMessageDataInlineText(input.altData || '', input.altMimeType);
+        const richTextData = createDMessageDataInlineText(richText, richTextMimeType);
        newFragments.push(createDocAttachmentFragment(title, caption, DVMimeType.VndAgiCode, richTextData, refString, DOCPART_DEFAULT_VERSION, docMeta));
        break;

-      // html cleaned
-      case 'rich-text-cleaner':
-        const cleanerHtml = (input.altData || '')
-          // remove class and style attributes
-          .replace(/<[^>]+>/g, (tag) =>
-            tag.replace(/ class="[^"]*"/g, '').replace(/ style="[^"]*"/g, ''),
-          )
-          // remove svg elements
-          .replace(/<svg[^>]*>.*?<\/svg>/g, '');
-        const cleanedHtmlData = createDMessageDataInlineText(cleanerHtml, 'text/html');
-        newFragments.push(createDocAttachmentFragment(title, caption, DVMimeType.VndAgiCode, cleanedHtmlData, refString, DOCPART_DEFAULT_VERSION, docMeta));
-        break;
-
      // html to markdown table
      case 'rich-text-table':
        let tableData: DMessageDataInline;
@@ -919,6 +953,11 @@ export async function attachmentPerformConversion(
      case 'unhandled':
        // force the user to explicitly select 'as text' if they want to proceed
        break;
+
+
+      default:
+        const _exhaustiveCheck: never = converter.id;
+        break;
    }
  }

@@ -961,6 +1000,19 @@ async function _inputDataToString(data: AttachmentDraftInput['data'], debugLocat
  return '';
 }

+/**
+ * Simple client-side cleaning of text that may contain HTML.
+ *
+ * - drops whole <svg>...</svg> elements, including ones that span multiple lines
+ * - strips class and style attributes (single- or double-quoted) from every tag
+ *
+ * Regex-based and best-effort: this reduces noise, it is NOT a sanitizer, and the
+ * output must not be treated as safe to inject into a page.
+ *
+ * @param inputStr text that may or may not contain HTML markup
+ * @returns the input with the above removed; text without any tags is returned unchanged
+ */
+function _cleanPossibleHtmlText(inputStr: string): string {
+  return inputStr
+    // remove svg elements - [\s\S] instead of '.', because '.' does not match the line breaks inside an element
+    .replace(/<svg\b[^>]*>[\s\S]*?<\/svg\s*>/gi, '')
+    // remove class and style attributes, only looking inside tags
+    .replace(/<[^>]+>/g, (tag) =>
+      tag.replace(/\s+(?:class|style)\s*=\s*(?:"[^"]*"|'[^']*')/gi, ''),
+    );
+}
+

 /**
 * Special function to convert a list of files to Attachment Fragments, without passing through the attachments system
@@ -145,7 +145,8 @@ export type AttachmentDraftConverter = {
 }

 export type AttachmentDraftConverterType =
-  | 'text' | 'rich-text' | 'rich-text-cleaner' | 'rich-text-table'
+  | 'text' | 'text-cleaner' | 'text-markdown'
+  | 'rich-text' | 'rich-text-cleaner' | 'rich-text-markdown' | 'rich-text-table'
  | 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-caption' | 'image-to-default'
  | 'pdf-auto' | 'pdf-text' | 'pdf-images' | 'pdf-images-ocr' | 'pdf-text-and-images'
  | 'docx-to-html'
@@ -0,0 +1,109 @@
+import { default as TurndownService } from 'turndown';
+
+
+// Lazily-built Turndown converter, shared by every conversion in this module
+let _turndownService: TurndownService | null = null;
+
+/** Returns the shared Turndown instance, building and configuring it on first use. */
+function getTurndownService(): TurndownService {
+  if (_turndownService)
+    return _turndownService;
+
+  // first use: ATX headings, fenced code blocks, underscore emphasis
+  const service = _turndownService = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced', emDelimiter: '_' });
+
+  // Remove script and style elements
+  service.remove(['script', 'style', 'noscript']);
+
+  return service;
+}
+
+
+/**
+ * Converts an HTML string to Markdown: the input is first cleaned by
+ * cleanHtmlForMarkdown, then handed to the shared Turndown instance.
+ */
+export function convertHtmlToMarkdown(html: string): string {
+  const markdownReadyHtml = cleanHtmlForMarkdown(html);
+  const turndown = getTurndownService();
+  return turndown.turndown(markdownReadyHtml);
+}
+
+
+/** Query-string keys (or key prefixes) treated as click/campaign tracking and removed from links. */
+const TRACKING_PARAM_PATTERN = /^(utm_|fbclid|gclid|msclkid)/i;
+
+/**
+ * Removes tracking parameters from an href while leaving the rest exactly as authored.
+ *
+ * Works on the href string itself rather than on a resolved URL, so the scheme and host of
+ * absolute links, 'mailto:' links and relative paths are all preserved.
+ *
+ * @param href the raw value of an anchor's href attribute
+ * @returns the href without tracking parameters; the very same string if there was nothing to remove
+ */
+function stripTrackingParams(href: string): string {
+  // split off the fragment first: a '?' after '#' belongs to the fragment, not to the query
+  const hashAt = href.indexOf('#');
+  const fragment = hashAt === -1 ? '' : href.slice(hashAt);
+  const beforeFragment = hashAt === -1 ? href : href.slice(0, hashAt);
+
+  const queryAt = beforeFragment.indexOf('?');
+  if (queryAt === -1) return href;
+
+  const allParams: [string, string][] = [];
+  new URLSearchParams(beforeFragment.slice(queryAt + 1)).forEach((value, key) => allParams.push([key, value]));
+  const keptParams = allParams.filter(([key]) => !TRACKING_PARAM_PATTERN.test(key));
+
+  // nothing to strip: keep the original string, to avoid re-encoding the query for no reason
+  if (keptParams.length === allParams.length) return href;
+
+  const query = new URLSearchParams(keptParams).toString();
+  return `${beforeFragment.slice(0, queryAt)}${query ? '?' + query : ''}${fragment}`;
+}
+
+/**
+ * Client-side HTML cleaning optimized for Markdown conversion.
+ * Uses DOMParser (browser-native) instead of Cheerio (server-only).
+ *
+ * Steps, in order: remove non-content elements (scripts, styles, navigation, ads, ...), remove
+ * elements hidden through inline styles, neutralize 'javascript:' links, strip tracking
+ * parameters from links, and remove HTML comments.
+ *
+ * @param html the HTML to clean; parsed as a full 'text/html' document
+ * @returns the cleaned markup of the parsed document's body; the original input, untouched,
+ *          if anything throws while cleaning (the error is logged)
+ */
+function cleanHtmlForMarkdown(html: string): string {
+  try {
+    const parser = new DOMParser();
+    const doc = parser.parseFromString(html, 'text/html');
+
+    // Remove unwanted elements
+    const unwantedSelectors = [
+      'script', 'style', 'link', 'noscript', 'iframe', 'svg', 'canvas',
+      'nav:not(main nav)', 'aside', 'footer:not(article footer)',
+      '.ad', '.ads', '.advertisement', '.banner', '.popup', '.modal', '.overlay',
+      '.cookie-banner', '.newsletter-signup', '.social-share', '.comments',
+      '.sidebar', '.widget', '.carousel', '.slider',
+      '[aria-hidden="true"]', '[hidden]',
+      '[data-analytics]', '[data-tracking]', '[data-gtm]',
+    ];
+
+    for (const selector of unwantedSelectors) {
+      try {
+        doc.querySelectorAll(selector).forEach(el => el.remove());
+      } catch {
+        // Skip invalid selectors (e.g., complex :not() selectors may fail in some browsers)
+      }
+    }
+
+    // Remove hidden elements via inline styles
+    doc.querySelectorAll('[style]').forEach(el => {
+      // lowercase and drop all whitespace, so 'DISPLAY : none' and 'display:none' are treated alike
+      const style = (el.getAttribute('style') || '').toLowerCase().replace(/\s+/g, '');
+      if (style.includes('display:none') || style.includes('visibility:hidden'))
+        el.remove();
+    });
+
+    // Clean up anchor hrefs
+    doc.querySelectorAll('a[href]').forEach(el => {
+      const href = el.getAttribute('href');
+      if (!href) return;
+
+      // Remove javascript: links (trimmed first, so leading whitespace cannot hide the scheme)
+      if (href.trim().toLowerCase().startsWith('javascript:')) {
+        el.removeAttribute('href');
+        return;
+      }
+
+      // Remove tracking parameters, touching the attribute only when something was stripped
+      const cleanHref = stripTrackingParams(href);
+      if (cleanHref !== href)
+        el.setAttribute('href', cleanHref);
+    });
+
+    // Remove comments (HTML comment nodes) - the walker is created by the parsed document that owns the nodes
+    const walker = doc.createTreeWalker(doc.body, NodeFilter.SHOW_COMMENT);
+    const comments: Comment[] = [];
+    while (walker.nextNode())
+      comments.push(walker.currentNode as Comment);
+    // removal happens after the walk, so the walker never stands on a detached node
+    comments.forEach(comment => comment.remove());
+
+    return doc.body.innerHTML;
+  } catch (error) {
+    console.error('HTML cleaning error:', error);
+    return html; // Return original if cleaning fails
+  }
+}