mirror of
https://github.com/enricoros/big-AGI.git
synced 2026-05-10 21:50:14 -07:00
Attachments: client-side Markdown conversion and Text/HTML cleanup & Markdown conversion
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
import * as React from 'react';
|
||||
import TimeAgo from 'react-timeago';
|
||||
|
||||
import { Box, Button, CircularProgress, ColorPaletteProp, ListItem, Sheet, Typography, VariantProp } from '@mui/joy';
|
||||
import { Box, Button, CircularProgress, ColorPaletteProp, Sheet, Typography, VariantProp } from '@mui/joy';
|
||||
import AbcIcon from '@mui/icons-material/Abc';
|
||||
import CodeIcon from '@mui/icons-material/Code';
|
||||
import DescriptionOutlinedIcon from '@mui/icons-material/DescriptionOutlined';
|
||||
@@ -91,8 +91,11 @@ function InputErrorIndicator() {
|
||||
|
||||
const converterTypeToIconMap: { [key in AttachmentDraftConverterType]: React.ComponentType<any> | null } = {
|
||||
'text': TextFieldsIcon,
|
||||
'text-cleaner': CodeIcon,
|
||||
'text-markdown': TextFieldsIcon,
|
||||
'rich-text': CodeIcon,
|
||||
'rich-text-cleaner': CodeIcon,
|
||||
'rich-text-markdown': TextFieldsIcon,
|
||||
'rich-text-table': PivotTableChartIcon,
|
||||
'image-original': ImageOutlinedIcon,
|
||||
'image-resized-high': PhotoSizeSelectLargeOutlinedIcon,
|
||||
@@ -201,13 +204,21 @@ function attachmentIcons(attachmentDraft: AttachmentDraft, noTooltips: boolean,
|
||||
|
||||
/**
 * Computes the short label shown for an attachment draft.
 *
 * Drafts carrying one of the generic labels ('Text' or 'Rich Text') are renamed
 * after the output of their active converter, so the label reflects what will
 * actually be attached. Every other draft (or a generic draft whose active
 * converter has no dedicated name) falls back to `ellipsizeFront(label, 22)`.
 *
 * @param attachmentDraft the draft whose label and active converter are inspected
 * @returns the label text to display
 */
function attachmentLabelText(attachmentDraft: AttachmentDraft): string {
  // the first converter flagged as active, if any
  const converter = attachmentDraft.converters.find(c => c.isActive) ?? null;

  // plain-text origin: name the result of the selected text conversion
  if (converter && attachmentDraft.label === 'Text') {
    switch (converter.id) {
      case 'text-markdown':
        return 'Markdown';
      case 'text-cleaner':
        return 'Clean HTML';
    }
  }

  // HTML origin: name the result of the selected rich-text conversion
  if (converter && attachmentDraft.label === 'Rich Text') {
    switch (converter.id) {
      case 'rich-text':
        return 'Rich HTML';
      case 'rich-text-markdown':
        return 'Markdown';
      case 'rich-text-cleaner':
        return 'Clean HTML';
      case 'rich-text-table':
        return 'Rich Table';
    }
  }

  // no dedicated name: use the draft's own label
  return ellipsizeFront(attachmentDraft.label, 22);
}
|
||||
|
||||
@@ -265,6 +265,7 @@ export function attachmentDefineConverters(source: AttachmentDraftSource, input:
|
||||
case mimeTypeIsPlainText(input.mimeType):
|
||||
// handle a secondary layer of HTML 'text' origins: drop, paste, and clipboard-read
|
||||
const textOriginHtml = source.media === 'text' && input.altMimeType === 'text/html' && !!input.altData;
|
||||
const textOriginClipboard = source.media === 'text' && ['clipboard-read', 'paste'].includes(source.method);
|
||||
const isHtmlTable = !!input.altData?.startsWith('<table');
|
||||
|
||||
// p1: Tables
|
||||
@@ -273,11 +274,16 @@ export function attachmentDefineConverters(source: AttachmentDraftSource, input:
|
||||
|
||||
// p2: Text
|
||||
converters.push({ id: 'text', name: attachmentSourceSupportsLiveFile(source) ? 'Text (Live)' : 'Text' });
|
||||
if (!textOriginHtml && textOriginClipboard) {
|
||||
converters.push({ id: 'text-markdown', name: 'Text -> Markdown' });
|
||||
converters.push({ id: 'text-cleaner', name: 'Text -> Clean HTML' });
|
||||
}
|
||||
|
||||
// p3: Html
|
||||
// p3: Html -> Markdown, and Html
|
||||
if (textOriginHtml) {
|
||||
converters.push({ id: 'rich-text-cleaner', name: 'Cleaner HTML' });
|
||||
converters.push({ id: 'rich-text', name: 'HTML · Heavy' });
|
||||
converters.push({ id: 'rich-text-markdown', name: 'HTML -> Markdown' });
|
||||
converters.push({ id: 'rich-text-cleaner', name: 'HTML -> Clean HTML' });
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -501,35 +507,63 @@ export async function attachmentPerformConversion(
|
||||
|
||||
switch (converter.id) {
|
||||
|
||||
// text as-is
|
||||
// text
|
||||
case 'text':
|
||||
case 'text-cleaner':
|
||||
case 'text-markdown':
|
||||
const possibleLiveFileId = await attachmentGetLiveFileId(source);
|
||||
const textContent = await _inputDataToString(input.data, 'text');
|
||||
const textualInlineData = createDMessageDataInlineText(textContent, input.mimeType);
|
||||
let textContent = await _inputDataToString(input.data, 'text');
|
||||
let textContentMime = input.mimeType || 'text/plain';
|
||||
|
||||
switch (converter.id) {
|
||||
case 'text-cleaner':
|
||||
textContent = _cleanPossibleHtmlText(textContent);
|
||||
break;
|
||||
case 'text-markdown':
|
||||
try {
|
||||
const { convertHtmlToMarkdown } = await import('./file-converters/HtmlToMarkdown');
|
||||
textContent = convertHtmlToMarkdown(textContent);
|
||||
textContentMime = 'text/markdown';
|
||||
} catch (error) {
|
||||
console.log('[DEV] Error converting Text (HTML) to Markdown:', error);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
const textualInlineData = createDMessageDataInlineText(textContent, textContentMime);
|
||||
newFragments.push(createDocAttachmentFragment(title, caption, _guessDocVDT(input.mimeType), textualInlineData, refString, DOCPART_DEFAULT_VERSION, docMeta, possibleLiveFileId));
|
||||
break;
|
||||
|
||||
// html as-is
|
||||
// html
|
||||
case 'rich-text':
|
||||
case 'rich-text-cleaner':
|
||||
case 'rich-text-markdown':
|
||||
let richText = input.altData || '';
|
||||
let richTextMimeType = input.altMimeType || 'text/html';
|
||||
|
||||
// html -> cleaner/html or markdown
|
||||
switch (converter.id) {
|
||||
case 'rich-text-cleaner':
|
||||
richText = _cleanPossibleHtmlText(richText);
|
||||
richTextMimeType = 'text/html';
|
||||
break;
|
||||
case 'rich-text-markdown':
|
||||
try {
|
||||
const { convertHtmlToMarkdown } = await import('./file-converters/HtmlToMarkdown');
|
||||
richText = convertHtmlToMarkdown(richText);
|
||||
richTextMimeType = 'text/markdown';
|
||||
} catch (error) {
|
||||
console.log('[DEV] Error converting HTML to Markdown:', error);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// NOTE: before we had the following: createTextAttachmentFragment(ref || '\n<!DOCTYPE html>', input.altData!), which
|
||||
// was used to wrap the HTML in a code block to facilitate AutoRenderBlocks's parser. Historic note, for future debugging.
|
||||
const richTextData = createDMessageDataInlineText(input.altData || '', input.altMimeType);
|
||||
const richTextData = createDMessageDataInlineText(richText, richTextMimeType);
|
||||
newFragments.push(createDocAttachmentFragment(title, caption, DVMimeType.VndAgiCode, richTextData, refString, DOCPART_DEFAULT_VERSION, docMeta));
|
||||
break;
|
||||
|
||||
// html cleaned
|
||||
case 'rich-text-cleaner':
|
||||
const cleanerHtml = (input.altData || '')
|
||||
// remove class and style attributes
|
||||
.replace(/<[^>]+>/g, (tag) =>
|
||||
tag.replace(/ class="[^"]*"/g, '').replace(/ style="[^"]*"/g, ''),
|
||||
)
|
||||
// remove svg elements
|
||||
.replace(/<svg[^>]*>.*?<\/svg>/g, '');
|
||||
const cleanedHtmlData = createDMessageDataInlineText(cleanerHtml, 'text/html');
|
||||
newFragments.push(createDocAttachmentFragment(title, caption, DVMimeType.VndAgiCode, cleanedHtmlData, refString, DOCPART_DEFAULT_VERSION, docMeta));
|
||||
break;
|
||||
|
||||
// html to markdown table
|
||||
case 'rich-text-table':
|
||||
let tableData: DMessageDataInline;
|
||||
@@ -919,6 +953,11 @@ export async function attachmentPerformConversion(
|
||||
case 'unhandled':
|
||||
// force the user to explicitly select 'as text' if they want to proceed
|
||||
break;
|
||||
|
||||
|
||||
default:
|
||||
const _exhaustiveCheck: never = converter.id;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -961,6 +1000,19 @@ async function _inputDataToString(data: AttachmentDraftInput['data'], debugLocat
|
||||
return '';
|
||||
}
|
||||
|
||||
/**
 * Simple client-side cleaning of possible HTML.
 *
 * Regex-based (no DOM parsing): strips `class` and `style` attributes from every
 * tag and drops whole `<svg>…</svg>` elements. Text that contains no tags is
 * returned unchanged.
 *
 * @param inputStr text that may contain HTML markup
 * @returns the input with presentational attributes and SVG elements removed
 */
function _cleanPossibleHtmlText(inputStr: string): string {
  return inputStr
    // remove class and style attributes - double- or single-quoted, any letter case,
    // and preceded by any whitespace (attributes are often placed on their own line)
    .replace(/<[^>]+>/g, (tag) =>
      tag.replace(/\s+(?:class|style)\s*=\s*(?:"[^"]*"|'[^']*')/gi, ''),
    )
    // remove svg elements; [\s\S] is used instead of '.' so that SVGs spanning
    // multiple lines are matched too
    .replace(/<svg\b[^>]*>[\s\S]*?<\/svg\s*>/gi, '');
}
|
||||
|
||||
|
||||
/**
|
||||
* Special function to convert a list of files to Attachment Fragments, without passing through the attachments system
|
||||
|
||||
@@ -145,7 +145,8 @@ export type AttachmentDraftConverter = {
|
||||
}
|
||||
|
||||
export type AttachmentDraftConverterType =
|
||||
| 'text' | 'rich-text' | 'rich-text-cleaner' | 'rich-text-table'
|
||||
| 'text' | 'text-cleaner' | 'text-markdown'
|
||||
| 'rich-text' | 'rich-text-cleaner' | 'rich-text-markdown' | 'rich-text-table'
|
||||
| 'image-original' | 'image-resized-high' | 'image-resized-low' | 'image-ocr' | 'image-caption' | 'image-to-default'
|
||||
| 'pdf-auto' | 'pdf-text' | 'pdf-images' | 'pdf-images-ocr' | 'pdf-text-and-images'
|
||||
| 'docx-to-html'
|
||||
|
||||
@@ -0,0 +1,109 @@
|
||||
import { default as TurndownService } from 'turndown';
|
||||
|
||||
|
||||
// Module-wide Turndown instance; stays null until the first conversion asks for it
let _turndownService: TurndownService | null = null;

/**
 * Returns the shared Turndown service, constructing and configuring it on first use.
 * Later calls return the same instance.
 */
function getTurndownService(): TurndownService {
  if (_turndownService)
    return _turndownService;

  const service = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    emDelimiter: '_',
  });
  _turndownService = service;

  // Remove script and style elements
  service.remove(['script', 'style', 'noscript']);

  return service;
}
|
||||
|
||||
|
||||
/**
 * Converts an HTML string to Markdown.
 *
 * The input is first run through `cleanHtmlForMarkdown`, and the cleaned markup
 * is then handed to the shared Turndown service.
 */
export function convertHtmlToMarkdown(html: string): string {
  const markdownReadyHtml = cleanHtmlForMarkdown(html);
  const converter = getTurndownService();
  return converter.turndown(markdownReadyHtml);
}
|
||||
|
||||
|
||||
/** Query-parameter names (matched as prefixes, case-insensitively) that are treated as tracking-only. */
const TRACKING_PARAM_PATTERN = /^(utm_|fbclid|gclid|msclkid)/i;

/**
 * Removes tracking query parameters from an href while leaving the rest of the
 * string exactly as written (scheme, host, relative path, fragment, and the
 * encoding of the parameters that are kept).
 *
 * @param href the raw attribute value
 * @returns the href without tracking parameters; the same string if none were found
 */
function stripTrackingParams(href: string): string {
  const queryStart = href.indexOf('?');
  if (queryStart < 0) return href;

  const hashStart = href.indexOf('#');
  // a '?' that appears after '#' belongs to the fragment, not to a query string
  if (hashStart >= 0 && hashStart < queryStart) return href;

  const queryEnd = hashStart >= 0 ? hashStart : href.length;
  const pairs = href.slice(queryStart + 1, queryEnd).split('&');
  const keptPairs = pairs.filter(pair => {
    const eq = pair.indexOf('=');
    let key = eq < 0 ? pair : pair.slice(0, eq);
    try {
      key = decodeURIComponent(key);
    } catch {
      // malformed percent-encoding: match against the raw key
    }
    return !TRACKING_PARAM_PATTERN.test(key);
  });

  // nothing was stripped: do not touch the href at all
  if (keptPairs.length === pairs.length) return href;

  const query = keptPairs.join('&');
  return href.slice(0, queryStart) + (query ? '?' + query : '') + href.slice(queryEnd);
}

/**
 * Client-side HTML cleaning optimized for Markdown conversion.
 * Uses DOMParser (browser-native) instead of Cheerio (server-only).
 *
 * Steps, in order: removes elements matching a list of unwanted selectors,
 * removes elements hidden through inline styles, drops `javascript:` hrefs,
 * strips tracking parameters from the remaining hrefs, and removes HTML comments.
 *
 * @param html the HTML markup to clean
 * @returns the cleaned `<body>` markup, or the original `html` if cleaning throws
 */
function cleanHtmlForMarkdown(html: string): string {
  try {
    const doc = new DOMParser().parseFromString(html, 'text/html');

    // Remove unwanted elements
    const unwantedSelectors = [
      'script', 'style', 'link', 'noscript', 'iframe', 'svg', 'canvas',
      'nav:not(main nav)', 'aside', 'footer:not(article footer)',
      '.ad', '.ads', '.advertisement', '.banner', '.popup', '.modal', '.overlay',
      '.cookie-banner', '.newsletter-signup', '.social-share', '.comments',
      '.sidebar', '.widget', '.carousel', '.slider',
      '[aria-hidden="true"]', '[hidden]',
      '[data-analytics]', '[data-tracking]', '[data-gtm]',
    ];

    for (const selector of unwantedSelectors) {
      try {
        doc.querySelectorAll(selector).forEach(el => el.remove());
      } catch {
        // Skip invalid selectors (e.g., complex :not() selectors may fail in some browsers)
      }
    }

    // Remove hidden elements via inline styles (any letter case, optional spaces around ':')
    const hiddenStylePattern = /display\s*:\s*none|visibility\s*:\s*hidden/i;
    doc.querySelectorAll('[style]').forEach(el => {
      if (hiddenStylePattern.test(el.getAttribute('style') || ''))
        el.remove();
    });

    // Clean up anchor hrefs
    doc.querySelectorAll('a[href]').forEach(el => {
      const href = el.getAttribute('href');
      if (!href) return;

      // Remove javascript: links (leading whitespace is ignored when checking the scheme)
      if (href.trim().toLowerCase().startsWith('javascript:')) {
        el.removeAttribute('href');
        return;
      }

      // Remove tracking parameters; the href is only rewritten when something was stripped,
      // so absolute URLs keep their origin and relative URLs stay relative
      const cleanHref = stripTrackingParams(href);
      if (cleanHref !== href)
        el.setAttribute('href', cleanHref);
    });

    // Remove comments (HTML comment nodes); collected first, then removed, so the
    // walker is never advanced over a node that was just detached
    const walker = doc.createTreeWalker(doc.body, NodeFilter.SHOW_COMMENT);
    const comments: Comment[] = [];
    while (walker.nextNode())
      comments.push(walker.currentNode as Comment);
    comments.forEach(comment => comment.remove());

    return doc.body.innerHTML;
  } catch (error) {
    console.error('HTML cleaning error:', error);
    return html; // Return original if cleaning fails
  }
}
|
||||
Reference in New Issue
Block a user