From f7aed8dea6fd5eb6a9377c0fbf7a8498bdfd4f6d Mon Sep 17 00:00:00 2001 From: Enrico Ros Date: Fri, 25 Aug 2023 19:19:00 -0700 Subject: [PATCH] Improve block parsing, now with inline images, multiple-interleaved blocks support --- src/apps/chat/components/message/blocks.ts | 66 ++++++++++++++++------ 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/src/apps/chat/components/message/blocks.ts b/src/apps/chat/components/message/blocks.ts index e1523c57f..f6fffb922 100644 --- a/src/apps/chat/components/message/blocks.ts +++ b/src/apps/chat/components/message/blocks.ts @@ -1,7 +1,8 @@ -type Block = CodeBlock | HtmlBlock | ImageBlock | TextBlock; +type Block = CodeBlock | HtmlBlock | ImageBlock | /*LatexBlock |*/ TextBlock; export type CodeBlock = { type: 'code'; blockTitle: string; blockCode: string; complete: boolean; }; export type HtmlBlock = { type: 'html'; html: string; }; export type ImageBlock = { type: 'image'; url: string; }; +// export type LatexBlock = { type: 'latex'; latex: string; }; export type TextBlock = { type: 'text'; content: string; }; // for Text or Markdown @@ -9,28 +10,61 @@ export function parseBlocks(forceText: boolean, text: string): Block[] { if (forceText) return [{ type: 'text', content: text }]; - if (text.startsWith('https://images.prodia.xyz/') && text.endsWith('.png') && text.length > 60) - return [{ type: 'image', url: text.trim() }]; + const regexPatterns = { + codeBlock: /`{3,}([\w\\.+-_]+)?\n([\s\S]*?)(`{3,}\n?|$)/g, + imageBlock: /(https:\/\/images\.prodia\.xyz\/.*?\.png)/g, // NOTE: only Prodia for now - but this shall be expanded to markdown images ![alt](url) or any png/jpeg + latexBlock: /\$\$(.*?)\$\$\n?/g, + }; - // noinspection HtmlRequiredTitleElement - if (text.startsWith('\n')) - return [{ type: 'html', html: text }]; - - const codeBlockRegex = /`{3,}([\w\\.+-_]+)?\n([\s\S]*?)(`{3,}|$)/g; const blocks: Block[] = []; - let lastIndex = 0; - let match; - while ((match = codeBlockRegex.exec(text)) !== null) { - blocks.push({ type: 'text', content: text.slice(lastIndex, match.index) }); - const blockTitle: string = (match[1] || '').trim(); - const blockCode: string = match[2].trim(); - const blockEnd: string = match[3]; - blocks.push({ type: 'code', blockTitle, blockCode, complete: blockEnd.startsWith('```') }); + while (true) { + + // find the first match (if any) trying all the regexes + let match: RegExpExecArray | null = null; + let matchType: keyof typeof regexPatterns | null = null; + for (const type in regexPatterns) { + const regex = regexPatterns[type as keyof typeof regexPatterns]; + regex.lastIndex = lastIndex; + const currentMatch = regex.exec(text); + if (currentMatch && (match === null || currentMatch.index < match.index)) { + match = currentMatch; + matchType = type as keyof typeof regexPatterns; + } + } + if (match === null) + break; + + // anything leftover before the match is text + if (match.index > lastIndex) + blocks.push({ type: 'text', content: text.slice(lastIndex, match.index) }); + + // add the block + switch (matchType) { + case 'codeBlock': + const blockTitle: string = (match[1] || '').trim(); + const blockCode: string = match[2].trim(); + const blockEnd: string = match[3]; + blocks.push({ type: 'code', blockTitle, blockCode, complete: blockEnd.startsWith('```') }); + break; + + case 'imageBlock': + const url: string = match[1]; + blocks.push({ type: 'image', url }); + break; + + case 'latexBlock': + const latex: string = match[1]; + blocks.push({ type: 'text', content: latex }); + break; + } + + // advance the pointer lastIndex = match.index + match[0].length; } + // remainder is text if (lastIndex < text.length) blocks.push({ type: 'text', content: text.slice(lastIndex) });